Skip to content

Instantly share code, notes, and snippets.

@pmallory
Last active May 10, 2022 00:38
Show Gist options
  • Select an option

  • Save pmallory/66c129389e3abca3dbdbf42ba7e12ad6 to your computer and use it in GitHub Desktop.

Select an option

Save pmallory/66c129389e3abca3dbdbf42ba7e12ad6 to your computer and use it in GitHub Desktop.

Revisions

  1. pmallory revised this gist Dec 9, 2016. 1 changed file with 5 additions and 3 deletions.
    8 changes: 5 additions & 3 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -4,6 +4,10 @@
    import bs4
    import requests


    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"

    def find_first_link(url):
    response = requests.get(url)
    html = response.text
    @@ -16,7 +20,7 @@ def find_first_link(url):
    # links this value will remain None
    article_link = None

    # Find all the direct childern of content_div that are paragraphs
    # Find all the direct children of content_div that are paragraphs
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    # It's important to only look at direct children, because other types
    @@ -48,8 +52,6 @@ def continue_crawl(search_history, target_url, max_steps=25):
    else:
    return True

    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while continue_crawl(article_chain, target_url):
  2. pmallory revised this gist Nov 30, 2016. 1 changed file with 15 additions and 17 deletions.
    32 changes: 15 additions & 17 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -20,15 +20,15 @@ def find_first_link(url):
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    # It's important to only look at direct children, because other types
    # of link, e.g. footnotes and pronunciation, could come before the first
    # link to an article. Those other link types aren't direct children though,
    # they're in divs of various classes.
    # of link, e.g. footnotes and pronunciation, could come before the
    # first link to an article. Those other link types aren't direct
    # children though, they're in divs of various classes.
    if element.find("a", recursive=False):
    article_link = element.find("a", recursive=False).get('href')
    break

    if not article_link:
    return
    return

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
    @@ -48,20 +48,18 @@ def continue_crawl(search_history, target_url, max_steps=25):
    else:
    return True

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    start_url = "https://en.wikipedia.org/wiki/Benazir_Shaikh"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while continue_crawl(article_chain, target_url):
    print(article_chain[-1])
    while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break
    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break

    article_chain.append(first_link)
    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
  3. pmallory revised this gist Nov 28, 2016. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -35,8 +35,8 @@ def find_first_link(url):

    return first_link

    def continue_crawl(search_history, target, max_steps=25):
    if search_history[-1] == target:
    def continue_crawl(search_history, target_url, max_steps=25):
    if search_history[-1] == target_url:
    print("We've found the target article!")
    return False
    elif len(search_history) > max_steps:
  4. pmallory revised this gist Nov 22, 2016. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -35,11 +35,11 @@ def find_first_link(url):

    return first_link

    def search_should_continue(search_history, target):
    def continue_crawl(search_history, target, max_steps=25):
    if search_history[-1] == target:
    print("We've found the target article!")
    return False
    elif len(search_history) > 25:
    elif len(search_history) > max_steps:
    print("The search has gone on suspiciously long, aborting search!")
    return False
    elif search_history[-1] in search_history[:-1]:
    @@ -54,7 +54,7 @@ def search_should_continue(search_history, target):
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while search_should_continue(article_chain, target_url):
    while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    first_link = find_first_link(article_chain[-1])
  5. pmallory revised this gist Nov 15, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -6,7 +6,7 @@

    def find_first_link(url):
    response = requests.get(url)
    html = response.content
    html = response.text
    soup = bs4.BeautifulSoup(html, "html.parser")

    # This div contains the article's body
  6. pmallory revised this gist Nov 15, 2016. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -28,7 +28,7 @@ def find_first_link(url):
    break

    if not article_link:
    return
    return

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
    @@ -55,13 +55,13 @@ def search_should_continue(search_history, target):
    article_chain = [start_url]

    while search_should_continue(article_chain, target_url):
    print(article_chain[-1])

    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break

    print(first_link)

    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
  7. pmallory revised this gist Nov 15, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -28,7 +28,7 @@ def find_first_link(url):
    break

    if not article_link:
    return None
    return

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
  8. pmallory revised this gist Nov 15, 2016. 1 changed file with 11 additions and 0 deletions.
    11 changes: 11 additions & 0 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -12,6 +12,10 @@ def find_first_link(url):
    # This div contains the article's body
    content_div = soup.find(id="mw-content-text")

    # stores the first link found in the article, if the article contains no
    # links this value will remain None
    article_link = None

    # Find all the direct children of content_div that are paragraphs
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    @@ -23,6 +27,9 @@ def find_first_link(url):
    article_link = element.find("a", recursive=False).get('href')
    break

    if not article_link:
    return None

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)

    @@ -43,11 +50,15 @@ def search_should_continue(search_history, target):

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    start_url = "https://en.wikipedia.org/wiki/Benazir_Shaikh"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while search_should_continue(article_chain, target_url):
    first_link = find_first_link(article_chain[-1])
    if not first_link:
    print("We've arrived at an article with no links, aborting search!")
    break

    print(first_link)

  9. pmallory revised this gist Nov 15, 2016. 1 changed file with 14 additions and 5 deletions.
    19 changes: 14 additions & 5 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -28,20 +28,29 @@ def find_first_link(url):

    return first_link

    def search_should_continue(search_history, target):
    if search_history[-1] == target:
    print("We've found the target article!")
    return False
    elif len(search_history) > 25:
    print("The search has gone on suspiciously long, aborting search!")
    return False
    elif search_history[-1] in search_history[:-1]:
    print("We've arrived at an article we've already seen, aborting search!")
    return False
    else:
    return True

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while article_chain[-1] != target_url and len(article_chain) < 25:
    while search_should_continue(article_chain, target_url):
    first_link = find_first_link(article_chain[-1])

    print(first_link)

    if first_link in article_chain:
    print("Cycle detected, aborting search")
    break

    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers
  10. pmallory revised this gist Nov 9, 2016. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,7 @@
    def find_first_link(url):
    response = requests.get(url)
    html = response.content
    soup = bs4.BeautifulSoup(html, "html.parser")O
    soup = bs4.BeautifulSoup(html, "html.parser")

    # This div contains the article's body
    content_div = soup.find(id="mw-content-text")
  11. pmallory created this gist Nov 9, 2016.
    47 changes: 47 additions & 0 deletions WikipediaCrawl.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,47 @@
    import time
    import urllib

    import bs4
    import requests

    def find_first_link(url):
    response = requests.get(url)
    html = response.content
    soup = bs4.BeautifulSoup(html, "html.parser")O

    # This div contains the article's body
    content_div = soup.find(id="mw-content-text")

    # Find all the direct childern of content_div that are paragraphs
    for element in content_div.find_all("p", recursive=False):
    # Find the first anchor tag that's a direct child of a paragraph.
    # It's important to only look at direct children, because other types
    # of link, e.g. footnotes and pronunciation, could come before the first
    # link to an article. Those other link types aren't direct children though,
    # they're in divs of various classes.
    if element.find("a", recursive=False):
    article_link = element.find("a", recursive=False).get('href')
    break

    # Build a full url from the relative article_link url
    first_link = urllib.parse.urljoin('https://en.wikipedia.org/', article_link)

    return first_link

    if __name__ == "__main__":
    start_url = "https://en.wikipedia.org/wiki/Special:Random"
    target_url = "https://en.wikipedia.org/wiki/Philosophy"
    article_chain = [start_url]

    while article_chain[-1] != target_url and len(article_chain) < 25:
    first_link = find_first_link(article_chain[-1])

    print(first_link)

    if first_link in article_chain:
    print("Cycle detected, aborting search")
    break

    article_chain.append(first_link)

    time.sleep(2) # Slow things down so as to not hammer Wikipedia's servers