Skip to content

Instantly share code, notes, and snippets.

@nmalkin
Forked from jedp/sample.js
Created June 7, 2012 17:32
Show Gist options
  • Select an option

  • Save nmalkin/2890222 to your computer and use it in GitHub Desktop.

Select an option

Save nmalkin/2890222 to your computer and use it in GitHub Desktop.

Revisions

  1. nmalkin revised this gist Jun 7, 2012. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions sample.js
    Original file line number Diff line number Diff line change
    @@ -108,6 +108,7 @@ function amplifyData(list, times, callback) {
    sample_rate: blob.sample_rate,
    timestamp: blob.timestamp, // already rounded off
    lang: blob.lang,
    number_sites_logged_in: blob.number_sites_logged_in,
    user_agent: blob.user_agent
    });
    }
  2. nmalkin revised this gist Jun 7, 2012. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions sample.js
    Original file line number Diff line number Diff line change
    @@ -133,6 +133,6 @@ var generate = module.exports.generate = function generate(wantCount, callback)
    if (!module.parent) {
    var wantCount = parseInt(process.argv[process.argv.length-1], 10) || 1000;
    generate(wantCount, function(data) {
    process.stdout.write(JSON.stringify(data));
    process.stdout.write(JSON.stringify(data, null, 4));
    });
    }
    }
  3. @jedp jedp revised this gist Jun 7, 2012. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion sample.js
    Original file line number Diff line number Diff line change
    @@ -60,7 +60,8 @@ function filterDataDump(obj, callback) {
    blob = obj[key].value;
    if (typeof blob.event_stream === 'object' &&
    typeof blob.user_agent === 'object' &&
    blob.user_agent.os !== 'Undefined') {
    blob.user_agent.os !== 'Undefined' &&
    blob.number_sites_logged_in) {
    goodData.push(blob);
    }
    });
  4. @jedp jedp created this gist Jun 7, 2012.
    137 changes: 137 additions & 0 deletions sample.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,137 @@
    #!/usr/bin/env node

    /**
    * Create sample interaction data, using valid kpiggybank
    * data as a seed.
    *
    * Can use from the command line, like so:
    *
    * node sample.js 42 # gets 42 data blobs
    *
    * Or from a module:
    *
    * var sampler = require('./sample');
    *
    * // Generate 10000 data points based on kpi data
    * sampler.generate(10000, function(data) { ... });
    *
    * The program will always read all KPI data from kpiggybank,
    * and create extra sample data as necessary by jittering original
    * data blobs. It ensures that each event takes at least about
    * 1/4 second.
    */

    var http = require('http');

    var KPIGGYBANK = 'kpiggybank.hacksign.in';
    var JITTER_FACTOR = 0.2;

    /**
     * Download the raw interaction-data dump from the kpiggybank server.
     *
     * Issues a GET for /wsapi/interaction_data on the KPIGGYBANK host,
     * buffers the whole response body and parses it as JSON.
     *
     * @param {function(*)} callback - called once, with the parsed JSON
     *   body, when the response has ended. There is no error argument:
     *   a failed request or a body that is not valid JSON is not reported
     *   through the callback.
     */
    function getDataDump(callback) {
        var body = '';
        http.get({
            host: KPIGGYBANK,
            path: '/wsapi/interaction_data'
        }, function(res) {
            // Decode at the stream level. Without this every chunk is a
            // Buffer that gets stringified on its own, so a multi-byte
            // UTF-8 character split across two chunks would be turned
            // into replacement characters.
            res.setEncoding('utf8');

            res.on('data', function(chunk) {
                body += chunk;
            });

            res.on('end', function() {
                return callback(JSON.parse(body));
            });
        });
    }

    /**
     * Extract the kpi blob (the `value` property) from each record of the
     * dump, keeping only the blobs that carry the fields we need.
     * (Some may be missing timestamp, user_agent, etc.)
     *
     * A blob is kept when:
     *   - `event_stream` is a non-null object (arrays included),
     *   - `user_agent` is a non-null object, and
     *   - `user_agent.os` is not the literal string 'Undefined'.
     * Records that are null or have no `value` are skipped.
     *
     * @param {Object} obj - dictionary of records; a null/undefined
     *   dictionary is treated as empty.
     * @param {function(Array)} callback - called with the list of blobs
     *   that passed the filter, in key order.
     * @returns whatever `callback` returns.
     */
    function filterDataDump(obj, callback) {
        var goodData = [];
        Object.keys(obj || {}).forEach(function(key) {
            var record = obj[key];
            var blob = record && record.value;
            // `typeof null` is 'object', so nulls must be ruled out
            // explicitly before `blob.user_agent.os` is dereferenced.
            if (blob &&
                blob.event_stream && typeof blob.event_stream === 'object' &&
                blob.user_agent && typeof blob.user_agent === 'object' &&
                blob.user_agent.os !== 'Undefined') {
                goodData.push(blob);
            }
        });
        return callback(goodData);
    }

    /**
     * Produce a jittered copy of an event stream.
     *
     * Each entry is read as a [name, duration] pair. The name is kept and
     * the duration is shifted by a random offset of up to +/- JITTER_FACTOR
     * of its own value, then clamped from below by a randomly chosen
     * minimum between 200 and 299. The input stream is not modified.
     *
     * Note - the minimum makes every event (xhr events included) look
     * like it took roughly a quarter second or more, assuming durations
     * are in milliseconds.
     *
     * @param {Array} eventStream - list of [name, duration] pairs.
     * @returns {Array} a new list of [name, jitteredDuration] pairs.
     */
    function jitterEvents(eventStream) {
        return eventStream.reduce(function(jittered, event) {
            var name = event[0];
            var duration = event[1];
            // Two random draws per event, in this order: offset, then minimum.
            var offset = Math.floor(duration * (Math.random() - 0.5) * (JITTER_FACTOR * 2));
            var minimum = 250 + Math.floor((Math.random() - 0.5) * 100);
            jittered.push([name, Math.max(minimum, duration + offset)]);
            return jittered;
        }, []);
    }

    /**
     * Multiply a list of data blobs.
     *
     * For every blob in `list` the output contains the blob itself,
     * immediately followed by `times` copies of it. Each copy gets an
     * `_id` of the form '<original _id>-<copy number>', a freshly
     * jittered event_stream, and the original's sample_rate, timestamp,
     * lang and user_agent (the user_agent object is shared, not cloned).
     *
     * @param {Array} list - the original blobs.
     * @param {number} times - how many jittered copies to add per blob.
     * @param {function(Array)} callback - called with the expanded list.
     * @returns whatever `callback` returns.
     */
    function amplifyData(list, times, callback) {
        var amplified = [];

        // Build one jittered duplicate of `original`, tagged as a dup.
        function jitteredCopy(original, copyNumber) {
            return {
                _id: original._id + '-' + copyNumber,
                event_stream: jitterEvents(original.event_stream),
                sample_rate: original.sample_rate,
                timestamp: original.timestamp, // already rounded off
                lang: original.lang,
                user_agent: original.user_agent
            };
        }

        list.forEach(function(original) {
            var copyNumber;
            amplified.push(original);
            for (copyNumber = 0; copyNumber < times; copyNumber++) {
                amplified.push(jitteredCopy(original, copyNumber));
            }
        });

        return callback(amplified);
    }

    /**
     * Generate up to `wantCount` sample data blobs.
     *
     * Downloads the full dump with getDataDump, filters it with
     * filterDataDump and, when more blobs are wanted than the filter
     * produced, pads the list with jittered copies via amplifyData.
     * The result is always truncated to at most `wantCount` entries.
     *
     * @param {number} wantCount - number of blobs wanted.
     * @param {function(Array)} callback - called with the resulting list.
     */
    var generate = module.exports.generate = function generate(wantCount, callback) {
        getDataDump(function(obj) {
            filterDataDump(obj, function(data) {
                if (wantCount > data.length) {
                    if (data.length === 0) {
                        // Nothing to use as a seed, so nothing to amplify.
                        return callback([]);
                    }
                    // amplifyData keeps each original and adds `copies`
                    // duplicates after it, so every seed blob yields
                    // (1 + copies) entries. Asking for one round more than
                    // needed would only push further seed blobs past the
                    // slice below and out of the sample.
                    var copies = Math.ceil(wantCount / data.length) - 1;
                    amplifyData(data, copies, function(moreData) {
                        return callback(moreData.slice(0, wantCount));
                    });
                } else {
                    return callback(data.slice(0, wantCount));
                }
            });
        });
    };

    // Command-line entry point: runs only when this file is the script
    // node was started with. `require.main === module` is used instead of
    // `!module.parent` because `module.parent` is deprecated and is also
    // undefined when the file is loaded from a non-CommonJS context, which
    // would wrongly start the CLI path.
    if (require.main === module) {
        // The last command-line argument is the number of blobs wanted;
        // anything that does not parse to a non-zero integer means 1000.
        var wantCount = parseInt(process.argv[process.argv.length-1], 10) || 1000;
        generate(wantCount, function(data) {
            process.stdout.write(JSON.stringify(data));
        });
    }