Skip to content

Instantly share code, notes, and snippets.

@ifthenelse
Forked from n1k0/404checker.js
Created July 25, 2013 10:04
Show Gist options
  • Select an option

  • Save ifthenelse/6078412 to your computer and use it in GitHub Desktop.

Select an option

Save ifthenelse/6078412 to your computer and use it in GitHub Desktop.

Revisions

  1. @n1k0 n1k0 created this gist Jan 11, 2013.
    110 changes: 110 additions & 0 deletions 404checker.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,110 @@
    /**
    * This casper script checks for internal links returning HTTP 404 under a given root url.
    *
    * Usage:
    *
    * $ casperjs 404checker.js http://mysite.tld/
    * $ casperjs 404checker.js http://mysite.tld/ --max-depth=42
    */

    /*global URI*/

    // Shared CasperJS instance; pages are opened with image and plugin
    // loading switched off.
    var casper = require("casper").create({
    pageSettings: {
    loadImages: false,
    loadPlugins: false
    }
    });
    // URLs that crawl() has already opened.
    var checked = [];
    // Index into `links` of the next URL check() will crawl.
    var currentLink = 0;
    // NOTE(review): `fs` is not referenced anywhere in the visible code.
    var fs = require('fs');
    // Upper bound on the number of links crawled (compared against
    // `currentLink` in check()). `~~` coerces the --max-depth option to an
    // integer; a missing, zero or non-numeric value falls back to 100.
    var upTo = ~~casper.cli.get('max-depth') || 100;
    // Root URL: first positional command line argument.
    var url = casper.cli.get(0);
    // Prefix cleanLinks() uses to recognise absolute links that are internal.
    var baseUrl = url;
    // Crawl queue, seeded with the root URL.
    var links = [url];
    var utils = require('utils');
    // Shorthand for utils.format, used to build status messages.
    var f = utils.format;

    /**
     * Resolves a (possibly relative) URL against a base URL using the
     * global URI constructor.
     *
     * @param  {String} url   URL to resolve
     * @param  {String} base  URL the first argument is relative to
     * @return {String}       string form of the resolved URI
     */
    function absPath(url, base) {
        var target = new URI(url);
        var reference = new URI(base);
        var resolved = target.resolve(reference);
        return resolved.toString();
    }

    /**
     * Turns the raw href values collected on a page into the list of
     * absolute, internal URLs that have not been crawled yet.
     *
     * A link is kept when it starts with `baseUrl`, or when it is a relative
     * reference (no URI scheme, not protocol-relative, not fragment-only).
     * Kept links are resolved against `base`, deduplicated, and stripped of
     * anything already present in `checked`.
     *
     * @param  {Array}  urls  raw href attribute values; null/undefined is
     *                        treated as an empty list
     * @param  {String} base  URL of the page the hrefs were found on
     * @return {Array}        absolute URLs still to be crawled
     */
    function cleanLinks(urls, base) {
        // Matches fragment-only links ("#top"), protocol-relative links
        // ("//host/path") and anything carrying a URI scheme ("https:",
        // "mailto:", "tel:", "javascript:", ...). Requiring the trailing ":"
        // keeps relative paths such as "http-status.html" crawlable.
        var nonRelative = /^(#|\/\/|[a-z][a-z0-9+.\-]*:)/i;
        var resolved = (urls || []).filter(function(url) {
            return typeof url === 'string' &&
                (url.indexOf(baseUrl) === 0 || !nonRelative.test(url));
        }).map(function(url) {
            return absPath(url, base);
        });
        // Deduplicate after resolution: "/a" and "a" may name the same page.
        return utils.unique(resolved).filter(function(url) {
            return checked.indexOf(url) === -1;
        });
    }

    /**
     * Queues the steps that open one link, report its HTTP status and
     * append the internal links found on it to the crawl queue.
     * Must be invoked with the casper instance as `this`.
     *
     * @param {String} link  absolute URL to open
     */
    function crawl(link) {
        this.start().then(function() {
            this.echo(link, 'COMMENT');
            this.open(link);
            checked.push(link);
        });
        this.then(function() {
            var status = this.currentHTTPStatus;
            if (status === 404) {
                this.warn(link + ' is missing (HTTP 404)');
            } else if (status === 500) {
                this.warn(link + ' is broken (HTTP 500)');
            } else if (!status || status >= 400) {
                // Any other client/server error, or no HTTP status at all,
                // must not be reported as a healthy page.
                this.warn(link + f(' failed (HTTP %s)', status));
            } else {
                this.echo(link + f(' is okay (HTTP %s)', status));
            }
        });
        this.then(function() {
            // The queue is append-only: check() walks `links` by index, so
            // removing already-checked entries would shift the remaining
            // ones and make the crawler skip links. Only URLs that are not
            // queued yet (and not repeated within this page) are added.
            var newLinks = searchLinks.call(this).filter(function(url, i, found) {
                return links.indexOf(url) === -1 && found.indexOf(url) === i;
            });
            links = links.concat(newLinks);
            this.echo(newLinks.length + " new links found on " + link);
        });
    }

    // Collects the href attribute of every <a href> element on the current
    // page and passes the raw values to cleanLinks(), with the current page
    // URL as the base. Must be invoked with the casper instance as `this`.
    function searchLinks() {
        var hrefs = this.evaluate(function _fetchInternalLinks() {
            // Handed to casper.evaluate(): relies only on __utils__, not on
            // any variable of this script.
            var anchors = __utils__.findAll('a[href]');
            return Array.prototype.map.call(anchors, function(anchor) {
                return anchor.getAttribute('href');
            });
        });
        var pageUrl = this.getCurrentUrl();
        return cleanLinks(hrefs, pageUrl);
    }

    // Run-completion callback driving the crawl: while the queue still has an
    // entry at `currentLink` and fewer than `upTo` links have been started,
    // crawl that entry and schedule itself again; otherwise print the summary
    // and exit. Must be invoked with the casper instance as `this`.
    function check() {
        var next = links[currentLink];
        if (!next || currentLink >= upTo) {
            this.echo("All done, " + checked.length + " links checked.");
            this.exit();
            return;
        }
        crawl.call(this, next);
        currentLink += 1;
        this.run(check);
    }

    // Refuse to run without a root URL.
    // NOTE(review): the statements below are still reached syntactically;
    // confirm that exit() halts the script immediately in the CasperJS
    // version in use.
    if (!url) {
    casper.warn('No url passed, aborting.').exit();
    }

    // Setup run: download the js-uri library source and turn it into the
    // global URI constructor that absPath() relies on.
    // NOTE(review): this executes code fetched at run time through
    // `new Function`; bundling a local copy of URI.js would avoid running
    // remote code. The googlecode.com host may no longer be reachable --
    // TODO confirm.
    casper.start('https://js-uri.googlecode.com/svn/trunk/lib/URI.js', function() {
    // Append a return statement so the generated function hands back URI.
    var scriptCode = this.getPageContent() + '; return URI;';
    window.URI = new Function(scriptCode)();
    if (typeof window.URI === "function") {
    this.echo('URI.js loaded');
    } else {
    this.warn('Could not setup URI.js').exit();
    }
    });

    // Once the setup run has completed, process() starts the actual crawl.
    casper.run(process);

    // Starts the crawl: opens a fresh casper run that announces itself and
    // hands control to check() once that run completes.
    function process() {
        var announce = function() {
            this.echo("Starting");
        };
        var session = casper.start();
        var queued = session.then(announce);
        queued.run(check);
    }