Skip to content

Instantly share code, notes, and snippets.

@pmallory
Last active May 10, 2022 00:38
Show Gist options
  • Select an option

  • Save pmallory/66c129389e3abca3dbdbf42ba7e12ad6 to your computer and use it in GitHub Desktop.

Select an option

Save pmallory/66c129389e3abca3dbdbf42ba7e12ad6 to your computer and use it in GitHub Desktop.

Revisions

  1. pmallory revised this gist Dec 9, 2016. 1 changed file with 5 additions and 3 deletions.
    8 changes: 5 additions & 3 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -4,6 +4,10 @@
    import bs4
    import requests


    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"

    def find_first_link(url):
    response = requests.get(url)
    html = response.text
    @@ -16,7 +20,7 @@ def find_first_link(url):
    # links this value will remain None
    article_link = None

    # Find all the direct childern of content_div that are paragraphs
    # Find all the direct children of content_div that are paragraphs
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    # It's important to only look at direct children, because other types
    @@ -48,8 +52,6 @@ def continue_crawl(search_history, target_url, max_steps=25):
    else:
    return True

    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while continue_crawl(article_chain, target_url):
  2. pmallory revised this gist Nov 30, 2016. 1 changed file with 15 additions and 17 deletions.
    32 changes: 15 additions & 17 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -20,15 +20,15 @@ def find_first_link(url):
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    # It's important to only look at direct children, because other types
    # of link, e.g. footnotes and pronunciation, could come before the first
    # link to an article. Those other link types aren't direct children though,
    # they're in divs of various classes.
    # of link, e.g. footnotes and pronunciation, could come before the
    # first link to an article. Those other link types aren't direct
    # children though, they're in divs of various classes.
    if element.find("a", recursive=False):
    article_link = element.find("a", recursive=False).get('href')
    break

    if not article_link:
    return
    return

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
    @@ -48,20 +48,18 @@ def continue_crawl(search_history, target_url, max_steps=25):
    else:
    return True

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    start_url = "https://en.wikipedia.org/wiki/Benazir_Shaikh"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while continue_crawl(article_chain, target_url):
    print(article_chain[-1])
    while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break
    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break

    article_chain.append(first_link)
    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
  3. pmallory revised this gist Nov 28, 2016. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -35,8 +35,8 @@ def find_first_link(url):

    return first_link

    def continue_crawl(search_history, target, max_steps=25):
    if search_history[-1] == target:
    def continue_crawl(search_history, target_url, max_steps=25):
    if search_history[-1] == target_url:
    print("We've found the target article!")
    return False
    elif len(search_history) > max_steps:
  4. pmallory revised this gist Nov 22, 2016. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -35,11 +35,11 @@ def find_first_link(url):

    return first_link

    def search_should_continue(search_history, target):
    def continue_crawl(search_history, target, max_steps=25):
    if search_history[-1] == target:
    print("We've found the target article!")
    return False
    elif len(search_history) > 25:
    elif len(search_history) > max_steps:
    print("The search has gone on suspiciously long, aborting search!")
    return False
    elif search_history[-1] in search_history[:-1]:
    @@ -54,7 +54,7 @@ def search_should_continue(search_history, target):
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while search_should_continue(article_chain, target_url):
    while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    first_link = find_first_link(article_chain[-1])
  5. pmallory revised this gist Nov 15, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -6,7 +6,7 @@

    def find_first_link(url):
    response = requests.get(url)
    html = response.content
    html = response.text
    soup = bs4.BeautifulSoup(html, "html.parser")

    # This div contains the article's body
  6. pmallory revised this gist Nov 15, 2016. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -28,7 +28,7 @@ def find_first_link(url):
    break

    if not article_link:
    return
    return

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
    @@ -55,13 +55,13 @@ def search_should_continue(search_history, target):
    article_chain = [start_url]

    while search_should_continue(article_chain, target_url):
    print(article_chain[-1])

    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break

    print(first_link)

    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
  7. pmallory revised this gist Nov 15, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -28,7 +28,7 @@ def find_first_link(url):
    break

    if not article_link:
    return None
    return

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
  8. pmallory revised this gist Nov 15, 2016. 1 changed file with 11 additions and 0 deletions.
    11 changes: 11 additions & 0 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -12,6 +12,10 @@ def find_first_link(url):
    # This div contains the article's body
    content_div = soup.find(id="mw-content-text")

    # stores the first link found in the article, if the article contains no
    # links this value will remain None
    article_link = None

    # Find all the direct children of content_div that are paragraphs
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    @@ -23,6 +27,9 @@ def find_first_link(url):
    article_link = element.find("a", recursive=False).get('href')
    break

    if not article_link:
    return None

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)

    @@ -43,11 +50,15 @@ def search_should_continue(search_history, target):

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    start_url = "https://en.wikipedia.org/wiki/Benazir_Shaikh"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while search_should_continue(article_chain, target_url):
    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break

    print(first_link)

  9. pmallory revised this gist Nov 15, 2016. 1 changed file with 14 additions and 5 deletions.
    19 changes: 14 additions & 5 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -28,20 +28,29 @@ def find_first_link(url):

    return first_link

    def search_should_continue(search_history, target):
    if search_history[-1] == target:
    print("We've found the target article!")
    return False
    elif len(search_history) > 25:
    print("The search has gone on suspiciously long, aborting search!")
    return False
    elif search_history[-1] in search_history[:-1]:
    print("We've arrived at an article we've already seen, aborting search!")
    return False
    else:
    return True

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while article_chain[-1] != target_url and len(article_chain) < 25:
    while search_should_continue(article_chain, target_url):
    first_link = find_first_link(article_chain[-1])

    print(first_link)

    if first_link in article_chain:
    print("Cycle detected, aborting search")
    break

    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
  10. pmallory revised this gist Nov 9, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,7 @@
    def find_first_link(url):
    response = requests.get(url)
    html = response.content
    soup = bs4.BeautifulSoup(html, "html.parser")O
    soup = bs4.BeautifulSoup(html, "html.parser")

    # This div contains the article's body
    content_div = soup.find(id="mw-content-text")
  11. pmallory created this gist Nov 9, 2016.
    47 changes: 47 additions & 0 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,47 @@
    import time
    import urllib

    import bs4
    import requests

    def find_first_link(url):
    response = requests.get(url)
    html = response.content
    soup = bs4.BeautifulSoup(html, "html.parser")O

    # This div contains the article's body
    content_div = soup.find(id="mw-content-text")

    # Find all the direct childern of content_div that are paragraphs
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    # It's important to only look at direct children, because other types
    # of link, e.g. footnotes and pronunciation, could come before the first
    # link to an article. Those other link types aren't direct children though,
    # they're in divs of various classes.
    if element.find("a", recursive=False):
    article_link = element.find("a", recursive=False).get('href')
    break

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)

    return first_link

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while article_chain[-1] != target_url and len(article_chain) < 25:
    first_link = find_first_link(article_chain[-1])

    print(first_link)

    if first_link in article_chain:
    print("Cycle detected, aborting search")
    break

    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers