Skip to content

Instantly share code, notes, and snippets.

@abhinavKeshri07
Created November 25, 2019 16:51
Show Gist options
  • Select an option

  • Save abhinavKeshri07/11a62145222fc091b83875e2c2863f31 to your computer and use it in GitHub Desktop.

Select an option

Save abhinavKeshri07/11a62145222fc091b83875e2c2863f31 to your computer and use it in GitHub Desktop.

Revisions

  1. abhinavKeshri07 created this gist Nov 25, 2019.
    127 changes: 127 additions & 0 deletions scrape.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,127 @@
    const cheerio = require('cheerio');
    const fs = require('fs');
    const readline = require('readline');
    const get_post_data = require('./get_POST_data');
    const get_get_data = require('./get_GET_data');

    // Endpoint that the case-list POST query is sent to.
    const callback_url = 'https://xyz.com';

    // Form fields that make up the query sent with the POST request.
    const form_data = {
        m_hc: '01',
        m_sideflg: 'C',
        m_sr: 'R',
        m_skey: 'AO',
        frmdate: '01-11-2018', // first date of the queried range; change as needed
        todate: '21-11-2018', // last date of the queried range; change as needed
        submit11: 'List By Case Type', // this field is necessary for this particular request
    };

    // Headers required for the POST request to casequery_action.php.
    const headers = {
        Accept: '*/*',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'My post Script',
    };

    // Holds the cheerio document that was loaded most recently.
    let $;

    // Destination file for the scraped "text,link" rows.
    const writeStream = fs.createWriteStream('allLinks.csv');

    // Number of link entries found by the current query.
    let LinkCounter = 0;

    /**
     * Fetches the case list, stores every link in "allLinks.csv" and then
     * starts the detail scrape.
     *
     * Every `font a` anchor of the response becomes one "text,url" row and
     * increments LinkCounter. loadDetailInFile() is only started after the
     * write callback has fired, because it reads the same file back.
     *
     * @returns {Promise<boolean>} resolves to true when the links were fetched
     *   and written, and to false when the request or the write failed.
     */
    const loadDataInFile = function() {
        return get_post_data(callback_url, form_data, headers)
            .then((response) => {
                $ = cheerio.load(response);
                const rows = [];
                $('font a').each((i, ele) => {
                    const item = $(ele).text();
                    const link = $(ele).attr('href');
                    rows.push(`${item},https://xyz.com/${link}\n`);
                    LinkCounter++;
                });
                if (rows.length === 0) {
                    // Nothing to write, so there is nothing to wait for.
                    return undefined;
                }
                // Wait for the write callback so the rows have been written
                // before loadDetailInFile() opens the file for reading.
                return new Promise((resolve, reject) => {
                    writeStream.write(rows.join(''), (err) => {
                        if (err) {
                            reject(err);
                        } else {
                            resolve();
                        }
                    });
                });
            })
            .then(() => {
                console.log("done fetching data and stored it in csv file");
                // all the links to cases are now stored in the "allLinks.csv" file;
                // next, store the case details in the "detailCases.csv" file.
                loadDetailInFile();
                return true;
            })
            .catch((error) => {
                console.log("Error while loadin data form server");
                console.log(error);
                return false;
            });
    };

    /**
     * Reads "allLinks.csv" line by line, requests every linked case page and
     * writes the extracted fields to "detailCases.csv", one JSON object per line.
     *
     * A request that fails with "not 200 statuscode" is treated as an
     * authentication failure: the reader is paused, the link list is refetched
     * through funcH() and reading resumes once funcH() resolves.
     */
    const loadDetailInFile = function() {
        let DetailCounter = 0;
        const readStream = fs.createReadStream('allLinks.csv');
        const detailWriteStream = fs.createWriteStream('detailCases.csv');
        const rl = readline.createInterface({
            input: readStream,
            terminal: false,
            preserveCursor: true
        });

        rl.on('line', function(line) {
            // Every row is "text,url"; rows that carry no url are skipped.
            // NOTE(review): a comma inside the link text would shift the url to a
            // later column - confirm the link text never contains commas.
            const url = line.split(',')[1];
            if (!url) {
                return;
            }

            get_get_data(url)
                .then((response) => {
                    console.log(response);
                    console.log("\n\n\n\n\n");
                    DetailCounter++;
                    // The parsed page and the record stay local to this response
                    // instead of living in shared or implicit global variables.
                    const $page = cheerio.load(response);
                    const detail = {};
                    detail['Petitioner'] = $page('select[name="m_petno"] option').text();
                    detail['Respondent'] = $page('select[name="m_resno"] option').text();
                    detail['Pent.Adv'] = $page('select[name="m_padv"] option').text();
                    // Similarly other details can be extracted.

                    detailWriteStream.write(JSON.stringify(detail) + "\n");
                    console.log(DetailCounter + " \n\n\n");
                    // Stop reading once every known link has been processed.
                    if (DetailCounter >= LinkCounter) {
                        rl.close();
                    }
                })
                .catch((error) => {
                    if (error && error.message === "not 200 statuscode") {
                        // auth failed, so the request for all the urls is made again.
                        console.log("Error geting case details. Again refreshing Links");
                        LinkCounter = 0;
                        rl.pause();
                        // NOTE(review): loadDataInFile() (reached through funcH) also
                        // starts its own loadDetailInFile() pass, so resuming this
                        // reader may process links twice - confirm this is intended.
                        funcH()
                            .then(() => {
                                // resume() is the documented counterpart of pause().
                                rl.resume();
                            })
                            .catch(() => {
                                console.log("error occured while refetching all the links");
                            });
                    } else {
                        console.log(error);
                    }
                });
        });
    };

    /**
     * Refetches the link list and reports the outcome as a promise.
     *
     * loadDataInFile() works asynchronously, so its result is awaited rather
     * than tested synchronously. A plain falsy return value (including
     * undefined) is still reported as a failure.
     *
     * @returns {Promise<Object>} resolves with {} when loadDataInFile() reports
     *   success, rejects with {} otherwise.
     */
    const funcH = function() {
        // Calling loadDataInFile() inside the executor turns a synchronous throw
        // into a rejection, exactly as the promise constructor did before.
        return new Promise((resolve) => resolve(loadDataInFile()))
            .then((succeeded) => {
                if (!succeeded) {
                    return Promise.reject({});
                }
                return {};
            });
    };




    // Entry point: start fetching the link list as soon as the script runs.
    loadDataInFile();