Skip to content

Instantly share code, notes, and snippets.

@michaelhasan
Created December 2, 2015 15:55
Show Gist options
  • Select an option

  • Save michaelhasan/2e61783c5c5ffd15a43d to your computer and use it in GitHub Desktop.

Select an option

Save michaelhasan/2e61783c5c5ffd15a43d to your computer and use it in GitHub Desktop.
Scraping a website with PhantomJS
/**
 * Global PhantomJS error handler: prints the error message and each stack
 * frame to stderr, then requests exit with status 1.
 *
 * @param {string} message - Error message to report.
 * @param {Array} [trace] - Stack frames; each frame's `line`, `function`,
 *     `file` and `sourceURL` properties are read.
 */
phantom.onError = function(message, trace) {
    console.error('[PHANTOMJS ERROR] ' + message);
    // Tolerate a missing trace so the error handler itself cannot throw.
    (trace || []).forEach(function(t) {
        // The fallback must be parenthesized: `+` binds tighter than `||`,
        // so without the parentheses the whole (always truthy) string is the
        // left operand and `sourceURL` is never used.
        console.error(' >> [' + t.line + '] ' +
            (t.function ? '[' + t.function + '] ' : '') +
            (t.file || t.sourceURL));
    });
    phantom.exit(1);
};
// The single page object that every step drives.
var page = new WebPage();

// Index into `steps` of the next step to run.
var testindex = 0;

// True from the page's onLoadStarted callback until its onLoadFinished
// callback; the step runner does not start a new step while this is set.
var loadInProgress = false;

// Echo console output produced inside the page to this script's console.
page.onConsoleMessage = function (text) {
    console.log(text);
};

// Track whether the page is currently loading.
page.onLoadStarted = function () {
    loadInProgress = true;
};

page.onLoadFinished = function () {
    loadInProgress = false;
};
// Chains found on the year page; filled in by queueDownloads.
var chains = [];

// Year read from the year page's title; filled in by queueDownloads.
var currentYear = "";

// Ordered list of step functions. The step runner executes them one at a
// time; queueDownloads appends the per-chain download steps at run time.
var steps = [
    loadLoginPage,
    enterCredentials,
    submitLoginForm,
    loadYearPage,
    queueDownloads
];

// Argument lists for the steps, indexed in parallel with `steps`. An entry
// left undefined means the step is called with no arguments.
var args = [];
/**
 * Step: announce and start loading the site's login page in `page`.
 */
function loadLoginPage() {
    var loginUrl = "http://dontbreakthechain.com/accounts/login";

    console.log("Loading login page");
    page.open(loginUrl);
}
/**
 * Step: fill in the login form on the currently loaded page.
 *
 * Finds the first form whose `method` attribute is "post" and whose `action`
 * attribute is "/accounts/login", and sets its `id_username` and
 * `id_password` elements. Does nothing if no such form exists.
 *
 * @param {string} [username=""] - Value for the `id_username` field.
 * @param {string} [password=""] - Value for the `id_password` field.
 */
function enterCredentials(username, password) {
    console.log("Entering Credentials");

    // Missing values fall back to "" so a call without arguments behaves as
    // it did when the empty strings were hard-coded.
    var user = (username === undefined || username === null) ? "" : String(username);
    var pass = (password === undefined || password === null) ? "" : String(password);

    // The credentials are handed over as evaluate() arguments rather than
    // captured from the enclosing scope.
    page.evaluate(function(user, pass) {
        var forms = document.forms;
        var i;
        for (i = 0; i < forms.length; i++) {
            if (forms[i].getAttribute('method') === "post" &&
                    forms[i].getAttribute('action') === "/accounts/login") {
                forms[i].elements["id_username"].value = user;
                forms[i].elements["id_password"].value = pass;
                return;
            }
        }
    }, user, pass);
}
/**
 * Step: submit the login form on the currently loaded page.
 *
 * Only the first form whose `method` attribute is "post" and whose `action`
 * attribute is "/accounts/login" is submitted; if there is none, nothing
 * happens.
 */
function submitLoginForm() {
    console.log("Submitting Login Form");
    page.evaluate(function () {
        var forms = document.forms;
        var idx = 0;
        var candidate;
        while (idx < forms.length) {
            candidate = forms[idx];
            if (candidate.getAttribute('method') == "post" &&
                    candidate.getAttribute('action') == "/accounts/login") {
                candidate.submit();
                break;
            }
            idx += 1;
        }
    });
}
/**
 * Step: announce and start loading the year page, which queueDownloads
 * later reads the current year and the chain list from.
 */
function loadYearPage() {
    var yearPageUrl = "http://dontbreakthechain.com/year";

    console.log("Loading Year Page to get current year and list of chains");
    page.open(yearPageUrl);
}
/**
 * Step: read the current year and the list of chains from the loaded year
 * page, then queue the steps that download every chain for every year.
 *
 * Writes the globals `currentYear` and `chains`, and appends to the parallel
 * `steps` / `args` arrays: for each chain one `loadChain` step, followed by
 * a `loadSpecificYear` + `dumpPage` pair for each year from the current year
 * down to 2007.
 */
function queueDownloads() {
    // Oldest year requested.
    // NOTE(review): presumably the first year the site has data for - confirm.
    var firstYear = 2007;

    // get year from title
    currentYear = page.evaluate(function() {
        var title = document.querySelectorAll('.year-title')[0];
        return title.childNodes[2].textContent.trim();
    });
    console.log(" Current Year is " + currentYear);

    // get chains; fall back to an empty list if evaluate() yields nothing so
    // the code below never dereferences null.
    chains = page.evaluate(function() {
        var links = document.querySelectorAll('#chains a');
        var found = [];
        var i; // declared locally so the counter does not leak into the page's globals
        for (i = 0; i < links.length; i++) {
            found.push({
                // textContent rather than innerHTML: the name ends up in
                // file names, so entities and markup must not leak into it.
                name: links[i].textContent.trim(),
                url: links[i].href
            });
        }
        return found;
    }) || [];
    console.log(" " + chains.length + " chains found");

    // The year is scraped as text; convert it once, explicitly, instead of
    // relying on implicit coercion in the loop below.
    var latestYear = parseInt(currentYear, 10);
    if (isNaN(latestYear)) {
        console.error(" Could not parse current year: " + currentYear);
    }

    // Append one step together with its argument list.
    function enqueue(step, stepArgs) {
        var index = steps.length;
        steps[index] = step;
        args[index] = stepArgs;
    }

    // queue up the steps for downloading each page
    var chainIndex;
    var chain;
    var year;
    for (chainIndex = 0; chainIndex < chains.length; chainIndex++) {
        chain = chains[chainIndex];
        enqueue(loadChain, [chain.name, chain.url]);
        for (year = latestYear; year >= firstYear; year--) {
            enqueue(loadSpecificYear, [year.toString()]);
            enqueue(dumpPage, [chain.name, year.toString()]);
        }
    }
}
/**
 * Step: announce and start loading a chain's page.
 *
 * @param {string} name - Chain name; used only in the log message.
 * @param {string} url - Address passed to `page.open`.
 */
function loadChain(name, url) {
    var announcement = "Loading chain " + name + " at url " + url;

    console.log(announcement);
    page.open(url);
}
/**
 * Step: announce and start loading the year page for one specific year.
 *
 * @param {string} year - Year appended as the `y` query parameter.
 */
function loadSpecificYear(year) {
    var yearPageBase = "http://dontbreakthechain.com/year?y=";

    console.log("Loading year " + year);
    page.open(yearPageBase + year);
}
function dumpPage(chain, year) {
var csv = page.evaluate(pageToCSV);
var fs = require('fs');
if (!fs.isWritable(".")) {
console.error('current directory is not writable!');
phantom.exit(1);
}
console.log('Writing file ./' + chain + '_' + year + '.csv ...' );
fs.write('./' + chain + '_' + year + '.csv', csv, 'a');
}
/**
 * Build CSV text from the `.day` elements of the current document.
 *
 * dumpPage passes this function to `page.evaluate`, so it reads only
 * `document` and its own locals.
 *
 * @returns {string} One "<id>,<present>\n" line per `.day` element, where
 *     <present> is "1" when the element's trimmed className is exactly
 *     'day day-hover link' and "0" otherwise; "" when there are no such
 *     elements.
 */
function pageToCSV() {
    var days = document.querySelectorAll('.day');
    var lines = [];
    var i; // declared locally so the counter does not leak into the page's globals
    var present;
    for (i = 0; i < days.length; i++) {
        present = (days[i].className.trim() === 'day day-hover link') ? "1" : "0";
        lines.push(days[i].id + "," + present + "\n");
    }
    return lines.join("");
}
/**
 * Step runner. Every 25 ms, if no page load is in progress, runs the next
 * function in `steps` (applying its `args` entry when one is defined) and
 * advances `testindex`.
 *
 * The run is finished on a later tick where no load is in progress and
 * `steps[testindex]` is not a function; the timer is then cleared so
 * "Download complete" is logged and `phantom.exit()` is requested only once.
 */
var interval = setInterval(function() {
    // A previous step may have started a page load; wait for it to finish
    // before running anything else or declaring completion.
    if (loadInProgress) {
        return;
    }

    if (typeof steps[testindex] === "function") {
        if (typeof args[testindex] === 'undefined') {
            steps[testindex]();
        } else {
            steps[testindex].apply(this, args[testindex]);
        }
        testindex++;
        // Leave the completion check to a later tick so a load started by
        // this step has a chance to set loadInProgress first.
        return;
    }

    clearInterval(interval);
    console.log("Download complete");
    phantom.exit();
}, 25);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment