Created
March 7, 2018 22:04
-
-
Save warborn/be67af3438e328c30c233583bd3ff237 to your computer and use it in GitHub Desktop.
Revisions
-
warborn created this gist
Mar 7, 2018. There are no files selected for viewing.
"""Wikipedia "Getting to Philosophy" crawler.

Starting from a random Wikipedia article, repeatedly follow the first
link in each article's body text until the Philosophy article is
reached, a loop is detected, or a step limit is exceeded.
"""

import time
import urllib.parse  # FIX: was `import urllib`; `urllib.parse` is a submodule and must be imported explicitly

import requests
from bs4 import BeautifulSoup

start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"


def continue_crawl(search_history, target_url, max_steps=25):
    """Return True while the crawl should keep going.

    Stops (returns False) when the most recent URL is the target, when
    more than ``max_steps`` articles have been visited, or when the most
    recent URL already appears earlier in the chain (a loop).

    :param search_history: non-empty list of visited URLs, newest last
    :param target_url: URL that ends the crawl successfully
    :param max_steps: abort once the chain grows past this length
    """
    current_url = search_history[-1]
    if current_url == target_url:
        print("We've found the target article!")
        return False
    if len(search_history) > max_steps:
        print("The search has gone on suspiciously long, aborting search!")
        return False
    if current_url in search_history[:-1]:
        print("We've arrived at an article we've already seen, aborting search!")
        print("The article was %s" % current_url)
        return False
    return True


def find_first_link(url):
    """Return the absolute URL of the first body-text link of *url*.

    Only direct ``<p>`` children of the article content div are scanned,
    and only direct ``<a>`` children of those paragraphs, which skips
    infoboxes, footnote markers and similar nested markup.

    Returns None when the article has no qualifying link.
    """
    response = requests.get(url)
    # FIX: fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    article_link = None
    content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")
    for element in content_div.find_all('p', recursive=False):
        link = element.find('a', recursive=False)
        if link:
            article_link = link.get('href')
            break

    if not article_link:
        return None
    # hrefs on Wikipedia are site-relative ("/wiki/..."); make them absolute.
    return urllib.parse.urljoin('https://en.wikipedia.org/', article_link)


def main():
    """Crawl from a random article toward the Philosophy article."""
    article_chain = [start_url]
    while continue_crawl(article_chain, target_url):
        print(article_chain[-1])
        first_link = find_first_link(article_chain[-1])
        if first_link is None:
            # FIX: the original appended None, and the next iteration
            # crashed on requests.get(None). Abort cleanly instead.
            print("We've arrived at an article with no links, aborting search!")
            break
        article_chain.append(first_link)
        time.sleep(2)  # be polite to Wikipedia's servers


if __name__ == "__main__":
    main()