Created
December 2, 2015 15:55
-
-
Save michaelhasan/2e61783c5c5ffd15a43d to your computer and use it in GitHub Desktop.
Scraping a website with PhantomJS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Global PhantomJS error handler: prints the error message and every stack
 * frame to stderr, then exits the script with status 1.
 *
 * @param {string} message - Error message supplied by PhantomJS.
 * @param {Array} trace - Stack frames; each frame is read for `line`,
 *     `function`, and `file` (falling back to `sourceURL`).
 */
phantom.onError = function(message, trace) {
    console.error('[PHANTOMJS ERROR] ' + message);
    // Tolerate a missing trace so the handler itself cannot throw.
    (trace || []).forEach(function(t) {
        // The fallback must be parenthesized: `+` binds tighter than `||`,
        // so without the parentheses the whole concatenated string becomes
        // the left operand of `||` and `sourceURL` is never used.
        console.error(' >> [' + t.line + '] ' +
            (t.function ? '[' + t.function + '] ' : '') +
            (t.file || t.sourceURL));
    });
    phantom.exit(1);
};
// The single page object that every step drives.
var page = new WebPage();
// Position in `steps` of the next step to run.
var testindex = 0;
// Set by onLoadStarted and cleared by onLoadFinished; the step runner
// does not start a new step while this is true.
var loadInProgress = false;

// Relay console output produced inside the page to this script's console.
page.onConsoleMessage = function(text) {
    console.log(text);
};

// Track page loads so the step runner knows when the page is busy.
page.onLoadStarted = function() {
    loadInProgress = true;
};

page.onLoadFinished = function() {
    loadInProgress = false;
};
// Positional arguments for each step, indexed in parallel with `steps`.
// An index left undefined means the step is called with no arguments.
var args = [];

// Year text scraped from the year page; filled in by queueDownloads.
var currentYear = "";

// Chain records ({name, url}) scraped from the year page; filled in by
// queueDownloads.
var chains = [];

// Ordered queue of step functions. queueDownloads appends further steps
// at runtime once the list of chains is known.
var steps = [
    loadLoginPage,
    enterCredentials,
    submitLoginForm,
    loadYearPage,
    queueDownloads
];
/**
 * Step: start loading the site's login page in the shared page object.
 */
function loadLoginPage() {
    var loginUrl = "http://dontbreakthechain.com/accounts/login";

    console.log("Loading login page");
    page.open(loginUrl);
}
/**
 * Step: locate the login form in the loaded page (method "post", action
 * "/accounts/login") and assign its username and password fields.
 *
 * The callback runs inside the page via page.evaluate, so it only uses
 * names available there. Both field values are empty strings here.
 */
function enterCredentials() {
    console.log("Entering Credentials");
    page.evaluate(function() {
        var forms = document.forms;
        var idx = 0;
        var form;
        while (idx < forms.length) {
            form = forms[idx];
            idx += 1;
            if (form.getAttribute('method') !== "post") {
                continue;
            }
            if (form.getAttribute('action') !== "/accounts/login") {
                continue;
            }
            // Only the first matching form is filled in.
            form.elements["id_username"].value = "";
            form.elements["id_password"].value = "";
            return;
        }
    });
}
/**
 * Step: submit the first form in the loaded page whose method is "post"
 * and whose action is "/accounts/login".
 *
 * The callback runs inside the page via page.evaluate.
 */
function submitLoginForm() {
    console.log("Submitting Login Form");
    page.evaluate(function() {
        var forms = document.forms;
        var idx = 0;
        var form;
        while (idx < forms.length) {
            form = forms[idx];
            idx += 1;
            if (form.getAttribute('method') !== "post") {
                continue;
            }
            if (form.getAttribute('action') !== "/accounts/login") {
                continue;
            }
            form.submit();
            return;
        }
    });
}
/**
 * Step: start loading the year overview page, which queueDownloads later
 * reads for the current year and the list of chains.
 */
function loadYearPage() {
    var yearUrl = "http://dontbreakthechain.com/year";

    console.log("Loading Year Page to get current year and list of chains");
    page.open(yearUrl);
}
/**
 * Step: read the current year and the list of chains from the loaded year
 * page, then append the download steps to the queue.
 *
 * For every chain this queues loadChain(name, url) followed by, for each
 * year from the current year down to 2007, loadSpecificYear(year) and
 * dumpPage(name, year). Arguments are stored in `args` at the same index
 * as the step in `steps`; years are passed as strings.
 *
 * Writes the file-level `currentYear` (trimmed year text, or "" when it
 * cannot be read) and `chains` (array of {name, url}, or [] when none
 * can be read).
 */
function queueDownloads() {
    // get year from title
    currentYear = page.evaluate(function() {
        var title = document.querySelectorAll('.year-title')[0];
        var node = title && title.childNodes[2];
        // Return "" instead of throwing inside the page when the title
        // markup is not what we expect.
        return node ? node.textContent.trim() : "";
    }) || "";
    console.log(" Current Year is " + currentYear);

    // get chains
    chains = page.evaluate(function() {
        var anchors = document.querySelectorAll('#chains a');
        var found = [];
        var i; // declared locally so no global leaks into the page
        for (i = 0; i < anchors.length; i++) {
            found.push({
                name: anchors[i].innerHTML.trim(),
                url: anchors[i].href
            });
        }
        return found;
    }) || []; // fall back to [] if evaluate yields nothing usable
    console.log(" " + chains.length + " chains found");

    // Convert the scraped text once, explicitly, rather than relying on
    // implicit string-to-number coercion in the loop below.
    var newestYear = parseInt(currentYear, 10);
    if (isNaN(newestYear)) {
        console.error(" Could not determine the current year; no year pages queued");
    }

    // queue up the steps for downloading each page
    var c;
    var year;
    for (c = 0; c < chains.length; c++) {
        steps.push(loadChain);
        args[steps.length - 1] = [chains[c].name, chains[c].url];
        for (year = newestYear; year >= 2007; year--) {
            steps.push(loadSpecificYear);
            args[steps.length - 1] = [String(year)];
            steps.push(dumpPage);
            args[steps.length - 1] = [chains[c].name, String(year)];
        }
    }
}
/**
 * Step: start loading a chain's page.
 *
 * @param {string} name - Chain name, used only in the log line.
 * @param {string} url - Address to open in the shared page object.
 */
function loadChain(name, url) {
    var message = "Loading chain " + name + " at url " + url;

    console.log(message);
    page.open(url);
}
/**
 * Step: start loading the year page for one specific year.
 *
 * @param {string} year - Year appended to the URL as the `y` query value.
 */
function loadSpecificYear(year) {
    var target = "http://dontbreakthechain.com/year?y=" + year;

    console.log("Loading year " + year);
    page.open(target);
}
/**
 * Step: convert the currently loaded page to CSV (via pageToCSV, run
 * inside the page) and append it to ./<chain>_<year>.csv.
 *
 * If the current directory is not writable, logs an error, requests
 * exit with status 1 and writes nothing.
 *
 * @param {string} chain - Chain name; used as the first part of the file name.
 * @param {string} year - Year; used as the second part of the file name.
 */
function dumpPage(chain, year) {
    var fs = require('fs');
    if (!fs.isWritable(".")) {
        console.error('current directory is not writable!');
        phantom.exit(1);
        // Return explicitly so nothing is written after exit was requested.
        return;
    }

    var csv = page.evaluate(pageToCSV);

    // The chain name is scraped from the page; a path separator in it would
    // point the write at a different (probably missing) directory.
    var safeChain = String(chain).replace(/[\/\\]/g, '_');
    var path = './' + safeChain + '_' + year + '.csv';

    console.log('Writing file ' + path + ' ...' );
    // Mode 'a' appends, so rerunning the script adds to existing files.
    fs.write(path, csv, 'a');
}
/**
 * Builds CSV text from every `.day` element in the document: one
 * "<id>,<present>\n" row per element, where <present> is "1" when the
 * element's trimmed class attribute is exactly 'day day-hover link' and
 * "0" otherwise.
 *
 * dumpPage passes this function to page.evaluate, so it must stay
 * self-contained and reference only names available inside the page.
 *
 * @returns {string} The CSV rows, or "" when there are no `.day` elements.
 */
function pageToCSV() {
    var days = document.querySelectorAll('.day');
    var csv = "";
    var present;
    var i; // declared locally; an undeclared index would leak into the page
    for (i = 0; i < days.length; i++) {
        present = days[i].className.trim() === 'day day-hover link' ? "1" : "0";
        csv += days[i].id + "," + present + "\n";
    }
    return csv;
}
// Step runner: every 25 ms, if no page load is in progress, run the next
// queued step (with its stored arguments, if any) and advance the index.
// When no step function remains at the current index, stop the timer and
// exit.
var interval = setInterval(function() {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        // Steps without a stored argument list are called with none.
        steps[testindex].apply(null, args[testindex] || []);
        testindex++;
    }
    if (typeof steps[testindex] !== "function") {
        console.log("Download complete");
        // Stop ticking so this completion branch cannot run a second time.
        clearInterval(interval);
        phantom.exit();
    }
}, 25);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment