Created
September 7, 2019 09:11
-
-
Save michalbcz/0fd5e3249ad541376412eb63ac39a0e6 to your computer and use it in GitHub Desktop.
Revisions
-
michalbcz renamed this gist
Sep 7, 2019 . 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing
File renamed without changes. -
michalbcz created this gist
Sep 7, 2019 . There are no files selected for viewing
const puppeteer = require('puppeteer');
const fs = require('fs');

// CSS selector of the entries in the setup form's menu. Used from Node.js
// (extractGroup) and passed into the page when a group entry is clicked.
const GROUP_ITEM_SELECTOR = 'form#setup div.menu div.item';

// Entry point: the async IIFE runs immediately; `void` marks the returned
// promise as intentionally not awaited (all errors are handled inside).
void (async () => {
  const url = 'https://zbranekvalitne.cz/zbrojni-prukaz/testove-otazky';

  try {
    console.log("I am scraping questions from " + url);

    const resultJson = await scrapeAllGroups(url);

    console.log('Scraping is done. Browser is closed. We scraped', resultJson.questions.length, 'questions');

    const fileUri = './questions.json';
    console.log('Writing questions to file:', fileUri);

    fs.writeFile(fileUri, JSON.stringify(resultJson, null, '\t' /* pretty-print */), (err) => {
      if (err) {
        console.error("Cannot write file questions.json", err);
        return; // do not claim success after a failed write
      }

      console.info("File saved! \nGoodbye!");
    });
  } catch (error) {
    console.error(error);
  }
})();

/**
 * Opens the page in a fresh browser, walks every group that has no subgroups
 * and collects its questions.
 *
 * The browser is closed in a `finally` block, so it is released even when
 * navigation or scraping throws.
 *
 * @param {string} url - Address of the page listing the question groups.
 * @returns {Promise<{groups: Array<Object>, questions: Array<Object>}>}
 *   All groups found by extractGroup, and every scraped question tagged with
 *   the `groupId` of the group it was scraped from.
 */
async function scrapeAllGroups(url) {
  const browser = await puppeteer.launch({
    // headless: false, // launch headful mode - good for debugging purposes (you will see what happened inside browser)
    // slowMo: 50, // slow down puppeteer script so that it's easier to follow visually
  });

  try {
    const page = await browser.newPage();

    // To see browser-side console output (e.g. from extractQuestions) in the
    // Node.js console while debugging, register a listener here:
    //   page.on('console', (message) => console.log(message.text()));

    await page.goto(url);

    console.log("Browser opened, starting to evaluate scraping script...");
    console.log("Scraping groups and its links");

    const groups = await extractGroup(page);
    const allQuestions = [];

    for (const group of groups) {
      if (groupHasSubgroups(group, groups)) {
        console.log(` Skipping group ${group.groupId} with sub groups. Otherwise we would have duplicated questions (from parent group and its subgroups), because parent group link leads to all questions for it's subgroups. \n`);
        continue; // skip this group
      }

      console.log("Extracting questions for group", group);

      // The click triggers a navigation; both promises are created before
      // awaiting so the navigation cannot finish unobserved.
      await Promise.all([
        page.evaluate((selector, groupName) => {
          const groupItems = document.querySelectorAll(selector);
          const groupItemElement = Array.from(groupItems)
            .find((groupItem) => groupItem.textContent.includes(groupName));
          if (!groupItemElement) {
            throw new Error(`Menu item for group "${groupName}" not found`);
          }
          groupItemElement.click();
        }, GROUP_ITEM_SELECTOR, group.name),
        page.waitForNavigation(),
      ]);

      const questions = await page.evaluate(extractQuestions);
      for (const question of questions) {
        allQuestions.push({ ...question, groupId: group.groupId });
      }
    }

    return { groups, questions: allQuestions };
  } finally {
    await browser.close();
  }
}

/**
 * Reads all questions and their answers from the currently displayed page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page: it must stay self-contained (no references to anything
 * declared in this file) and its `console.log` goes to the page's console.
 *
 * @returns {Array<{order: number, question: {text: string},
 *   answers: Array<{answerText: string, isCorrect: boolean}>}>}
 *   One entry per direct child div of `div#questions`, in document order.
 */
function extractQuestions() {
  console.log('Extracting questions...');

  const questionDivs = document.querySelectorAll('div#questions > div');

  return Array.from(questionDivs).map((el, index) => {
    const rows = el.querySelectorAll('div.row');

    const question = {
      order: index,
      question: {
        text: rows[0].innerText.trim(), // the first row holds the question text
      },
      answers: [],
    };

    // every remaining row is one answer
    for (let i = 1; i < rows.length; i++) {
      const answerRow = rows[i];
      question.answers.push({
        // strip the leading "a)", "b)", ... marker
        answerText: answerRow.innerText.replace(/^[a-z]\)/, "").trim(),
        isCorrect: answerRow.className.includes("correct-answer"),
      });
    }

    return question;
  });
}

/**
 * Builds the list of question groups from the menu of the setup form.
 *
 * A menu item containing an `i.level` element is treated as a subgroup of the
 * closest preceding item without one. `groupId` is the item's index in the
 * menu.
 *
 * @param {Object} page - Puppeteer page that already shows the setup form.
 * @returns {Promise<Array<{groupId: number, parentGroupId: (number|null), name: string}>>}
 */
async function extractGroup(page) {
  const groupLinks = await page.$$(GROUP_ITEM_SELECTOR);

  console.log("Group links size", groupLinks.length);

  let nextParentGroupId = null;
  const groups = [];

  // Iteration starts at index 1, i.e. the first menu item is never reported
  // as a group (presumably it is not a real group entry - TODO confirm).
  for (let i = 1; i < groupLinks.length; i++) {
    const groupLinkElement = groupLinks[i];

    const thisIsSubGroup = await groupLinkElement.$("i.level");
    if (!thisIsSubGroup) {
      nextParentGroupId = null;
    }

    // The first span contains the name; fall back to the whole item's text.
    const nameSpan = await groupLinkElement.$("span");
    const nameSource = nameSpan != null ? nameSpan : groupLinkElement;
    const rawName = await page.evaluate((node) => node.textContent, nameSource);

    const group = {
      groupId: i,
      parentGroupId: nextParentGroupId,
      name: rawName.replace(/\n/, '').trim(),
    };
    groups.push(group);

    // set parentGroupId for following subgroups
    if (!thisIsSubGroup) {
      nextParentGroupId = group.groupId;
    }
  }

  return groups;
}

/**
 * Tells whether any group in `groups` names `group` as its parent.
 *
 * @param {{groupId: number}} group - Group to test.
 * @param {Array<{parentGroupId: (number|null)}>} groups - All known groups.
 * @returns {boolean} True when at least one group has `parentGroupId` equal to `group.groupId`.
 */
function groupHasSubgroups(group, groups) {
  return groups.some((it) => group.groupId === it.parentGroupId);
}