warborn · March 7, 2018 22:04 · Mar 7, 2018
diff --git a/wikipedia_webcrawler.py b/wikipedia_webcrawler.py
@@ -0,0 +1,53 @@
+import time
+import urllib
+import urllib.parse
+
+import requests
+from bs4 import BeautifulSoup
+
+# First URL placed in the article chain; the crawl begins from here.
+start_url = "https://en.wikipedia.org/wiki/Special:Random"
+# The crawl stops once the most recently visited URL equals this one.
+target_url = "https://en.wikipedia.org/wiki/Philosophy"
+
+def continue_crawl(search_history, target_url, max_steps=25):
+  """Decide whether the crawl should visit another article.
+
+  Args:
+    search_history: URLs visited so far, oldest first; the last entry is
+      the article the crawl is currently on.
+    target_url: URL that ends the crawl successfully when it is reached.
+    max_steps: The crawl is aborted once the history holds more entries
+      than this.
+
+  Returns:
+    True to keep crawling; False (after printing the reason) when the
+    target was reached, the history grew past max_steps, the current URL
+    was already visited, or there is no current URL to continue from.
+  """
+  # An empty history or a missing link (None / "") leaves nothing to fetch;
+  # stop here instead of failing on search_history[-1] or letting the
+  # caller request a non-existent URL.
+  if not search_history or not search_history[-1]:
+    print("There is no article to continue from, aborting search!")
+    return False
+
+  current_url = search_history[-1]
+  if current_url == target_url:
+    print("We've found the target article!")
+    return False
+  if len(search_history) > max_steps:
+    print("The search has gone on suspiciously long, aborting search!")
+    return False
+  # Seeing the current URL earlier in the history means the chain loops.
+  if current_url in search_history[:-1]:
+    print("We've arrived at an article we've already seen, aborting search!")
+    print(f"The article was {current_url}")
+    return False
+  return True
+
+def find_first_link(url, timeout=10):
+  """Return the absolute URL of the first link in an article's body text.
+
+  Downloads `url`, then scans the paragraphs that are direct children of
+  the `mw-parser-output` container inside `#mw-content-text`. Only anchors
+  that are direct children of a paragraph are considered, so links wrapped
+  in other tags are skipped.
+
+  Args:
+    url: Address of the page to download.
+    timeout: Seconds requests waits for the server to respond before
+      raising requests.exceptions.Timeout.
+
+  Returns:
+    The link resolved against https://en.wikipedia.org/, or None when the
+    page has no article body or no qualifying link.
+  """
+  # Without a timeout a stalled connection would hang the crawl forever.
+  response = requests.get(url, timeout=timeout)
+  soup = BeautifulSoup(response.text, 'html.parser')
+
+  # Either container can be missing (error pages, unexpected markup); treat
+  # that the same as "no link found" instead of raising AttributeError.
+  content_text = soup.find(id="mw-content-text")
+  if content_text is None:
+    return None
+  content_div = content_text.find(class_="mw-parser-output")
+  if content_div is None:
+    return None
+
+  article_link = None
+  for element in content_div.find_all('p', recursive=False):
+    link = element.find('a', recursive=False)
+    if link:
+      article_link = link.get('href')
+      break
+
+  if not article_link:
+    return None
+
+  # hrefs on the page are usually site-relative; make them absolute.
+  return urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
+
+# URLs visited so far, oldest first; the last entry is the current article.
+article_chain = [start_url]
+
+while continue_crawl(article_chain, target_url):
+  print(article_chain[-1])
+  # Download the current article and pull out the first link in its body.
+  first_link = find_first_link(article_chain[-1])
+  if not first_link:
+    # find_first_link returns None on a dead end; appending that would make
+    # the next iteration try to download a non-existent URL.
+    print("We've arrived at an article with no links, aborting search!")
+    break
+  article_chain.append(first_link)
+  # Pause for about two seconds between requests.
+  time.sleep(2)
No results found