import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"


def continue_crawl(search_history, target_url, max_steps=25):
    """Decide whether the crawl should keep going.

    Stops when the target article is reached, the chain exceeds max_steps,
    or the current article has already been visited (a cycle).
    """
    current_url = search_history[-1]
    if current_url == target_url:
        print("We've found the target article!")
        return False
    if len(search_history) > max_steps:
        print("The search has gone on suspiciously long, aborting search!")
        return False
    if current_url in search_history[:-1]:
        print("We've arrived at an article we've already seen, aborting search!")
        print("The article was %s" % current_url)
        return False
    return True


def find_first_link(url):
    """Return the first article link in the body of the page at url, or None."""
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_link = None
    # The article body lives inside the "mw-parser-output" div.
    content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")
    # Scan only top-level paragraphs, and within each only direct anchor tags.
    for element in content_div.find_all('p', recursive=False):
        link = element.find('a', recursive=False)
        if link:
            article_link = link.get('href')
            break

    if not article_link:
        return None

    # hrefs are relative (e.g. "/wiki/Philosophy"), so make them absolute.
    return urllib.parse.urljoin('https://en.wikipedia.org/', article_link)


article_chain = [start_url]
while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    # Download the html of the last article in article_chain
    # and find the first link in that html.
    first_link = find_first_link(article_chain[-1])
    if not first_link:
        print("We've arrived at an article with no links, aborting search!")
        break

    # Add the first link to article_chain.
    article_chain.append(first_link)

    # Delay for about two seconds to be polite to Wikipedia's servers.
    time.sleep(2)
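
# --- A minimal sanity check (a sketch; the URL below is a hypothetical
# stand-in, not from the original script). continue_crawl is pure list logic,
# so its stop conditions can be exercised without any network access. The
# first case prints nothing, so it leaves the script's output unchanged; the
# other two print their success/abort messages and are shown as comments:
assert continue_crawl(["https://en.wikipedia.org/wiki/Example"], target_url)
# continue_crawl(["https://en.wikipedia.org/wiki/Example", target_url],
#                target_url)  # -> False, prints "We've found the target article!"
# continue_crawl(["https://en.wikipedia.org/wiki/Example"] * 27,
#                target_url)  # -> False, prints the max_steps abort message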