import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

start_url = "https://en.wikipedia.org/wiki/Special:Random"
target_url = "https://en.wikipedia.org/wiki/Philosophy"


def continue_crawl(search_history, target_url, max_steps=25):
    """Decide whether the crawl should keep going.

    Stops when the target article is reached, the chain exceeds max_steps,
    or the current article has already been visited (a cycle).
    """
    current_url = search_history[-1]
    if current_url == target_url:
        print("We've found the target article!")
        return False
    if len(search_history) > max_steps:
        print("The search has gone on suspiciously long, aborting search!")
        return False
    if current_url in search_history[:-1]:
        print("We've arrived at an article we've already seen, aborting search!")
        print("The article was %s" % current_url)
        return False
    return True


def find_first_link(url):
    """Return the first article link in the body of the page at url, or None."""
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    article_link = None
    # The article body lives inside the "mw-parser-output" div.
    content_div = soup.find(id="mw-content-text").find(class_="mw-parser-output")
    # Scan only top-level paragraphs, and within each only direct anchor tags.
    for element in content_div.find_all('p', recursive=False):
        link = element.find('a', recursive=False)
        if link:
            article_link = link.get('href')
            break

    if not article_link:
        return None

    # hrefs are relative (e.g. "/wiki/Philosophy"), so make them absolute.
    return urllib.parse.urljoin('https://en.wikipedia.org/', article_link)


article_chain = [start_url]
while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    # Download the html of the last article in article_chain
    # and find the first link in that html.
    first_link = find_first_link(article_chain[-1])
    if not first_link:
        print("We've arrived at an article with no links, aborting search!")
        break

    # Add the first link to article_chain.
    article_chain.append(first_link)

    # Delay for about two seconds to be polite to Wikipedia's servers.
    time.sleep(2)
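
# --- A minimal sanity check (a sketch; the URL below is a hypothetical
# stand-in, not from the original script). continue_crawl is pure list logic,
# so its stop conditions can be exercised without any network access. The
# first case prints nothing, so it leaves the script's output unchanged; the
# other two print their success/abort messages and are shown as comments:
assert continue_crawl(["https://en.wikipedia.org/wiki/Example"], target_url)
# continue_crawl(["https://en.wikipedia.org/wiki/Example", target_url],
#                target_url)  # -> False, prints "We've found the target article!"
# continue_crawl(["https://en.wikipedia.org/wiki/Example"] * 27,
#                target_url)  # -> False, prints the max_steps abort message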