Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save michalbcz/0fd5e3249ad541376412eb63ac39a0e6 to your computer and use it in GitHub Desktop.

Select an option

Save michalbcz/0fd5e3249ad541376412eb63ac39a0e6 to your computer and use it in GitHub Desktop.

Revisions

  1. michalbcz renamed this gist Sep 7, 2019. 1 changed file with 0 additions and 0 deletions.
  2. michalbcz created this gist Sep 7, 2019.
    175 changes: 175 additions & 0 deletions zbranekvalitne-scraper.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,175 @@
    const puppeteer = require('puppeteer')
    const fs = require('fs')

    // Async IIFE: the scraper starts running as soon as this script is loaded.
    // `void` marks the returned promise as intentionally not awaited; failures raised
    // while scraping are reported by the catch block at the bottom.
    void (async () => {
        const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky';

        try {
            console.log("I am scraping questions from " + url);
            const browser = await puppeteer.launch({
                // headless: false, // launch headful mode - good for debugging purposes (you will see what happens inside the browser)
                // slowMo: 50, // slow down puppeteer script so that it's easier to follow visually
            });

            let groups;
            const allQuestions = [];

            try {
                const page = await browser.newPage();

                // Debugging aid: uncomment to forward console output produced inside `page.evaluate`
                // (where the scraping code runs) to the Node.js console.
                // page.on('console', (message) => {
                //     if (message.type() !== 'warning') {
                //         console.log(message.text());
                //     }
                // });

                await page.goto(url);
                console.log("Browser opened, starting to evaluate scraping script...");

                console.log("Scraping groups and its links");
                groups = await extractGroup(page);

                for (const group of groups) {
                    if (groupHasSubgroups(group, groups)) {
                        console.log(
                            `
    Skipping group ${group.groupId} with sub groups. Otherwise we would have duplicated
    questions (from parent group and its subgroups), because parent group link leads to all questions for it's subgroups.
    `
                        );

                        continue; // skip this group
                    }

                    console.log("Extracting questions for group", group);

                    // Click the group's menu item inside the page and wait for the navigation that follows.
                    // Both promises are created before either is awaited so the navigation is not missed.
                    await Promise.all([
                        page.evaluate((groupName) => {
                            const groupItems = document.querySelectorAll('form#setup div.menu div.item');
                            const groupItemElement = Array.from(groupItems)
                                .find((groupItem) => groupItem.textContent.includes(groupName));

                            // Fail with a readable message instead of "cannot read property 'click' of undefined".
                            if (!groupItemElement) {
                                throw new Error(`Menu item for group "${groupName}" not found`);
                            }

                            groupItemElement.click();
                        }, group.name),
                        page.waitForNavigation(),
                    ]);

                    const questions = await page.evaluate(extractQuestions);

                    // Tag every question with the group it was scraped from.
                    for (const question of questions) {
                        allQuestions.push({ ...question, groupId: group.groupId });
                    }
                }
            } finally {
                // Always shut the browser down, even when scraping failed; otherwise the
                // browser process stays alive and keeps this script from exiting.
                await browser.close();
            }

            console.log('Scraping is done. Browser is closed. We scraped', allQuestions.length, 'questions');

            const resultJson = {
                groups: groups,
                questions: allQuestions,
            };

            const fileUri = './questions.json';
            console.log('Writing questions to file:', fileUri);
            fs.writeFile(fileUri, JSON.stringify(resultJson, null, '\t' /* pretty-print */), (err) => {
                if (err) {
                    console.error("Cannot write file questions.json", err);
                    return; // do not claim success after a failed write
                }

                console.info("File saved! Goodbye!");
            });
        } catch (error) {
            console.error(error);
        }
    })();

    /**
     * Collects every question displayed on the current page together with its answers.
     *
     * NOTE: this function is passed to `page.evaluate`, so it executes inside the browser
     * page rather than in Node.js. It must therefore stay self-contained: it may only use
     * browser globals such as `document` and must not reference variables or helpers
     * defined elsewhere in this file.
     *
     * Markup it relies on: every direct child `div` of `div#questions` is one question;
     * the first `div.row` inside it holds the question text and each following `div.row`
     * holds one answer. An answer row whose class name contains `correct-answer` is the
     * correct one.
     *
     * @returns {Array<{order: number, question: {text: string}, answers: Array<{answerText: string, isCorrect: boolean}>}>}
     *          questions in page order; `order` is the zero-based position on the page.
     */
    function extractQuestions() {
        console.log('Extracting questions...');

        const questionElements = document.querySelectorAll('div#questions > div');

        return Array.from(questionElements).map((questionElement, index) => {
            // The first row is the question itself, the remaining rows are its answers.
            const [questionRow, ...answerRows] = questionElement.querySelectorAll('div.row');

            return {
                order: index,
                question: {
                    text: questionRow.innerText.trim(),
                },
                answers: answerRows.map((answerRow) => ({
                    // Trim before stripping the "a)", "b)", ... prefix: the pattern is anchored
                    // at the start of the string, so leading whitespace would otherwise keep
                    // the prefix in the answer text.
                    answerText: answerRow.innerText.trim().replace(/^[a-z]\)/, '').trim(),
                    isCorrect: answerRow.className.includes('correct-answer'),
                })),
            };
        });
    }

    /**
     * Reads the group menu (`form#setup div.menu div.item`) of the given page and
     * returns one descriptor per menu item.
     *
     * A menu item that contains an `i.level` element is treated as a sub-group of the
     * closest preceding item that has no such element. The name is taken from the item's
     * first `span` when it has one, otherwise from the item's own text content.
     *
     * @param {object} page puppeteer page (uses `page.$$` and `page.evaluate`)
     * @returns {Promise<Array<{groupId: number, parentGroupId: (number|null), name: string}>>}
     *          `groupId` is the item's index in the menu; `parentGroupId` is `null` for
     *          top-level groups.
     */
    async function extractGroup(page) {
        const menuItems = await page.$$('form#setup div.menu div.item');

        console.log("Group links size", menuItems.length);

        const readTextContent = (node) => node.textContent;
        const collected = [];
        let currentParentId = null;

        for (const [position, menuItem] of menuItems.entries()) {
            // The very first menu item is never turned into a group.
            // NOTE(review): presumably it is a heading / "all" entry - confirm against the page.
            if (position === 0) {
                continue;
            }

            const subGroupMarker = await menuItem.$("i.level");
            const isTopLevel = !subGroupMarker;

            let rawName;
            if (await menuItem.$("span") != null) {
                rawName = await menuItem.$eval("span", readTextContent);
            } else {
                rawName = await page.evaluate(readTextContent, menuItem);
            }

            collected.push({
                groupId: position,
                parentGroupId: isTopLevel ? null : currentParentId,
                name: rawName.replace(/\n/, '').trim(),
            });

            // Sub-groups that follow belong to the most recent top-level group.
            if (isTopLevel) {
                currentParentId = position;
            }
        }

        return collected;
    }

    /**
     * Tells whether at least one entry of `groups` names `group` as its parent.
     *
     * @param {{groupId: number}} group the candidate parent
     * @param {Array<{parentGroupId: (number|null)}>} groups all known groups
     * @returns {boolean} `true` when some group's `parentGroupId` equals `group.groupId`
     */
    function groupHasSubgroups(group, groups) {
        return groups.some((candidate) => candidate.parentGroupId === group.groupId);
    }