# WikiStance. A Wikipedia distance meter.
# Based on the alt text of http://xkcd.org/903/
# Takes a Wikipedia URL and measures the distance from that page to the
# Philosophy article by repeatedly clicking the first link that is neither
# inside parentheses nor in italics.
#
# Author::    Alejandro Fernández  (mailto:antarticonorte@gmail.com)
# Copyright:: Copyright (c) 2011 Alejandro Fernández
# License::   GPL
#
# =Usage=
#
#   require 'wikistance'
#
#   url = 'http://en.wikipedia.org/wiki/Scrubs_%28TV_series%29'
#   ws = WikiStance.new(url)
#   ws.trace          # Go through all the pages until we reach Philosophy
#   ws.distance       # => 22
#   ws.breadcrumbs    # => ["List of characters on Scrubs", "NBC", "United States", ..., "Philosophy"]

require 'rubygems'
require 'mechanize'

class WikiStance
  # Prefix that every accepted URL must start with (http or https).
  # \A is used instead of ^ so the check applies to the whole string,
  # not to any line inside it.
  URL_PATTERN = %r{\Ahttps?://en\.wikipedia\.org/wiki/}

  # Title of the article at which the walk stops.
  TARGET_TITLE = 'Philosophy'

  # Matches either a complete HTML tag or a parenthesised run of text.
  # Tags are listed first so that parentheses inside attributes
  # (e.g. href="/wiki/Foo_(bar)") are consumed as part of the tag.
  TAG_OR_PARENS = /<[^>]*>|\(.*?\)/

  # Matches an italic element together with its content.
  ITALICS = /<i\b[^>]*>.*?<\/i>/

  # Captures the href of the first anchor in a piece of markup.
  FIRST_HREF = /<a\s(?:[^>]*?\s)?href="(.*?)"/

  # title::       title of the page the walk started from
  # breadcrumbs:: titles of every page visited so far, starting page first
  attr_reader :title, :breadcrumbs

  # Builds a new meter and loads the starting page.
  #
  # url:: address of an English Wikipedia article
  #
  # Raises ArgumentError when +url+ is not an English Wikipedia article URL.
  def initialize(url)
    unless url.to_s =~ URL_PATTERN
      raise ArgumentError, "You should use a valid wikipedia link"
    end

    @url = url
    # Wikipedia returns 403 with the default user agent
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari'
    reset
  end

  # Goes back to the starting page and forgets the pages visited so far.
  # Always returns true.
  def reset
    @page = @agent.get(@url)
    @title = page_title
    @breadcrumbs = [@title]
    true
  end

  # Returns the title of the page currently loaded.
  def page_title
    @page.at('#firstHeading').text
  end

  # Follows first links until the Philosophy article is reached, recording
  # every visited title in +breadcrumbs+. Returns true.
  #
  # Raises RuntimeError when a page is visited twice (the walk would never
  # end) or when a page has no usable link.
  def trace
    until page_title == TARGET_TITLE
      click_first_link
      title = page_title
      # Avoid entering in an infinite loop
      if @breadcrumbs.include?(title)
        raise "We are repeating ourselves! We already visited \"#{title}\""
      end
      @breadcrumbs << title
    end
    true
  end

  # Number of clicks between the starting page and the current one.
  def distance
    # Breadcrumbs hold the initial page. If we start in philosophy the distance should be 0
    @breadcrumbs.length - 1
  end

  private

  # Clicks the first eligible link of the current page and makes the
  # resulting page the current one.
  #
  # Raises RuntimeError when the page has no eligible link or when the link
  # found in the markup cannot be located among the page's links.
  def click_first_link
    href = first_link_href
    raise "Oops! seems that \"#{page_title}\" has no links" if href.nil?

    # The href is escaped and anchored so that characters such as "(", "."
    # or "?" are matched literally and a longer href is not picked by mistake.
    link = @page.links_with(:href => /\A#{Regexp.escape(href)}\z/).first
    raise "Oops! could not find the link \"#{href}\" in \"#{page_title}\"" if link.nil?

    @page = link.click
  end

  # Returns the href of the first eligible link in the article body, or nil
  # when there is none.
  #
  # div#bodyContent is where wikipedia shows article's content.
  # The starting text is direct child of div#bodyContent. This way we avoid
  # <p> inside TOCs and other texts.
  # We also avoid Disambiguation and other wikipedia texts (all of which
  # contain links in italics) because they are in <div> instead of <p>.
  # NOTE(review): the selector reflects Wikipedia's markup when this was
  # written; confirm it still matches the current page structure.
  def first_link_href
    @page.search('#bodyContent > p').each do |paragraph|
      found = eligible_markup(paragraph.to_html).match(FIRST_HREF)
      # The serialised markup escapes "&" inside attributes; undo that so the
      # value can be compared with the href of the parsed links.
      return found[1].gsub('&amp;', '&') if found
    end
    nil
  end

  # Returns +html+ without parenthesised text and without italic elements,
  # so that only links that may be clicked remain.
  def eligible_markup(html)
    # Links between parens should not be clicked. Tags are kept untouched so
    # that parentheses inside an href do not truncate the link.
    without_parens = html.gsub(TAG_OR_PARENS) do |chunk|
      chunk.start_with?('<') ? chunk : ''
    end
    without_parens.gsub(ITALICS, '')
  end
end