Last active
May 10, 2022 00:38
-
-
Save pmallory/66c129389e3abca3dbdbf42ba7e12ad6 to your computer and use it in GitHub Desktop.
Revisions
-
pmallory revised this gist
Dec 9, 2016 . 1 changed file with 5 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -4,6 +4,10 @@ import bs4 import requests start_url = "https://en.wikipedia.org/wiki/Special:Random" target_url = "https://en.wikipedia.org/wiki/Philosophy" def find_first_link(url): response = requests.get(url) html = response.text @@ -16,7 +20,7 @@ def find_first_link(url): # links this value will remain None article_link = None # Find all the direct children of content_div that are paragraphs for element in content_div.find_all("p", recursive=False): # Find the first anchor tag that's a direct child of a paragraph. # It's important to only look at direct children, because other types @@ -48,8 +52,6 @@ def continue_crawl(search_history, target_url, max_steps=25): else: return True article_chain = [start_url] while continue_crawl(article_chain, target_url): -
pmallory revised this gist
Nov 30, 2016 . 1 changed file with 15 additions and 17 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -20,15 +20,15 @@ def find_first_link(url): for element in content_div.find_all("p", recursive=False): # Find the first anchor tag that's a direct child of a paragraph. # It's important to only look at direct children, because other types # of link, e.g. footnotes and pronunciation, could come before the # first link to an article. Those other link types aren't direct # children though, they're in divs of various classes. if element.find("a", recursive=False): article_link = element.find("a", recursive=False).get('href') break if not article_link: return # Build a full url from the relative article_link url first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link) @@ -48,20 +48,18 @@ def continue_crawl(search_history, target_url, max_steps=25): else: return True start_url = "https://en.wikipedia.org/wiki/Special:Random" target_url = "https://en.wikipedia.org/wiki/Philosophy" article_chain = [start_url] while continue_crawl(article_chain, target_url): print(article_chain[-1]) first_link = find_first_link(article_chain[-1]) if not first_link: print("We've arrived at an article with no links, aborting search!") break article_chain.append(first_link) time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers -
pmallory revised this gist
Nov 28, 2016 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -35,8 +35,8 @@ def find_first_link(url): return first_link def continue_crawl(search_history, target_url, max_steps=25): if search_history[-1] == target_url: print("We've found the target article!") return False elif len(search_history) > max_steps: -
pmallory revised this gist
Nov 22, 2016 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -35,11 +35,11 @@ def find_first_link(url): return first_link def continue_crawl(search_history, target, max_steps=25): if search_history[-1] == target: print("We've found the target article!") return False elif len(search_history) > max_steps: print("The search has gone on suspiciously long, aborting search!") return False elif search_history[-1] in search_history[:-1]: @@ -54,7 +54,7 @@ def search_should_continue(search_history, target): target_url = "https://en.wikipedia.org/wiki/Philosophy" article_chain = [start_url] while continue_crawl(article_chain, target_url): print(article_chain[-1]) first_link = find_first_link(article_chain[-1]) -
pmallory revised this gist
Nov 15, 2016 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,7 +6,7 @@ def find_first_link(url): response = requests.get(url) html = response.text soup = bs4.BeautifulSoup(html, "html.parser") # This div contains the article's body -
pmallory revised this gist
Nov 15, 2016 . 1 changed file with 3 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -28,7 +28,7 @@ def find_first_link(url): break if not article_link: return # Build a full url from the relative article_link url first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link) @@ -55,13 +55,13 @@ def search_should_continue(search_history, target): article_chain = [start_url] while search_should_continue(article_chain, target_url): print(article_chain[-1]) first_link = find_first_link(article_chain[-1]) if not first_link: print("We've arrived at an article with no links, aborting search!") break article_chain.append(first_link) time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers -
pmallory revised this gist
Nov 15, 2016 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -28,7 +28,7 @@ def find_first_link(url): break if not article_link: return # Build a full url from the relative article_link url first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link) -
pmallory revised this gist
Nov 15, 2016 . 1 changed file with 11 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -12,6 +12,10 @@ def find_first_link(url): # This div contains the article's body content_div = soup.find(id="mw-content-text") # stores the first link found in the article, if the article contains no # links this value will remain None article_link = None # Find all the direct children of content_div that are paragraphs for element in content_div.find_all("p", recursive=False): # Find the first anchor tag that's a direct child of a paragraph. @@ -23,6 +27,9 @@ def find_first_link(url): article_link = element.find("a", recursive=False).get('href') break if not article_link: return None # Build a full url from the relative article_link url first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link) @@ -43,11 +50,15 @@ def search_should_continue(search_history, target): if __name__ == "__main__": start_url = "https://en.wikipedia.org/wiki/Special:Random" start_url = "https://en.wikipedia.org/wiki/Benazir_Shaikh" target_url = "https://en.wikipedia.org/wiki/Philosophy" article_chain = [start_url] while search_should_continue(article_chain, target_url): first_link = find_first_link(article_chain[-1]) if not first_link: print("We've arrived at an article with no links, aborting search!") break print(first_link) -
pmallory revised this gist
Nov 15, 2016 . 1 changed file with 14 additions and 5 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -28,20 +28,29 @@ def find_first_link(url): return first_link def search_should_continue(search_history, target): if search_history[-1] == target: print("We've found the target article!") return False elif len(search_history) > 25: print("The search has gone on suspiciously long, aborting search!") return False elif search_history[-1] in search_history[:-1]: print("We've arrived at an article we've already seen, aborting search!") return False else: return True if __name__ == "__main__": start_url = "https://en.wikipedia.org/wiki/Special:Random" target_url = "https://en.wikipedia.org/wiki/Philosophy" article_chain = [start_url] while search_should_continue(article_chain, target_url): first_link = find_first_link(article_chain[-1]) print(first_link) article_chain.append(first_link) time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers -
pmallory revised this gist
Nov 9, 2016 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -7,7 +7,7 @@ def find_first_link(url): response = requests.get(url) html = response.content soup = bs4.BeautifulSoup(html, "html.parser") # This div contains the article's body content_div = soup.find(id="mw-content-text") -
pmallory created this gist
Nov 9, 2016 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,47 @@ import time import urllib import bs4 import requests def find_first_link(url): response = requests.get(url) html = response.content soup = bs4.BeautifulSoup(html, "html.parser") # This div contains the article's body content_div = soup.find(id="mw-content-text") # Find all the direct children of content_div that are paragraphs for element in content_div.find_all("p", recursive=False): # Find the first anchor tag that's a direct child of a paragraph. # It's important to only look at direct children, because other types # of link, e.g. footnotes and pronunciation, could come before the first # link to an article. Those other link types aren't direct children though, # they're in divs of various classes. if element.find("a", recursive=False): article_link = element.find("a", recursive=False).get('href') break # Build a full url from the relative article_link url first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link) return first_link if __name__ == "__main__": start_url = "https://en.wikipedia.org/wiki/Special:Random" target_url = "https://en.wikipedia.org/wiki/Philosophy" article_chain = [start_url] while article_chain[-1] != target_url and len(article_chain) < 25: first_link = find_first_link(article_chain[-1]) print(first_link) if first_link in article_chain: print("Cycle detected, aborting search") break article_chain.append(first_link) time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers