Created
October 15, 2019 09:23
-
-
Save aimuhire/4ba4c656137c33b7319830c081aaf73c to your computer and use it in GitHub Desktop.
Code Sample from An Extension Project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Welcome to the official Scrapper class
 *
 * Author: Arsene I. Muhire
 * email:[email protected]
 *
 * Project Description:
 * I was building a browser extension that would rely on data from a server.
 * Basically, the extension lets users get answer tips that pop up on each question page during their CISCO/CCNA assessment.
 * I made this as a proof of concept... the first browser extension I worked on.
 *
 * Class Details:
 * Used to scrape exam data from https://www.ccna7.com/
 * Because not all answer pages are formatted alike, we use strategies, a.k.a. selectors.
 * We basically try one selector; if it returns a null or unusable object... we try selector NUMBER TWO.
 * This class is used to extract all answers from an answer page.
 * We keep trying selectors until one works (or none does).
 */
const cheerio = require('cheerio') | |
var request = require('request'); | |
/**
 * Scrapes one exam answer page into a plain JavaScript object.
 *
 * Answer pages are not all formatted alike, so each piece of data is looked
 * up through an ordered list of CSS selectors: the first selector that
 * matches at least one element wins.
 */
class Scrapper {
  /**
   * Creates the Scrapper object.
   * @param {string} URL required, e.g. "https://www.ccna7.com/ccna2-v5-03/ccna2-practice-final-exam-v5-03/"
   * @throws {Error} when URL is null or undefined.
   */
  constructor(URL) {
    if (URL == null) {
      throw new Error("URL required!");
    }
    this.pageUrl = URL;

    // Selectors are tried in order; the first one that matches is used.
    this.titleSelectors = ["h3", 'div[class="ai-stem"]>strong', "strong"];
    this.questionsSelectors = [
      'ol[class="wpProQuiz_list"] > li',
      "div.entry-content > ol > li",
    ];
    this.choicesSelectors = [
      'li[class="wpProQuiz_questionListItem"]',
      "ul > li",
    ];
  }

  /**
   * Downloads the page and extracts the exam.
   * @return {Promise<Object>} Resolves to `{ name, pageUrl, version, questions }`.
   *   Each question has `title`, `identifier` (the title with spaces removed)
   *   and either `solution` or `htmlSolution`.
   */
  async getExam() {
    const pageStr = await this.getPageString();
    // Load the HTML string into cheerio for easy data extraction.
    const $ = cheerio.load(pageStr);

    const exam = {
      name: this.prettifyString($("div.entry-content > h2").text()),
      pageUrl: this.pageUrl,
      version: "",
      questions: [],
    };

    const questionsEl = this.getElement($, this.questionsSelectors, $("body"));
    if (!questionsEl) {
      return exam;
    }

    questionsEl.each((questionIndex, questionEl) => {
      // Question title, e.g. "What are the different Routing Protocols?".
      // Anything from "Explanation:" onwards is not part of the title.
      const titleEl = this.getElement($, this.titleSelectors, questionEl);
      const title = titleEl
        ? this.prettifyString(titleEl.text().split("Explanation:")[0])
        : "";

      // Do not save questions without a title.
      if (!title) {
        return;
      }

      const question = { title };

      // A solution is either a list of choices ("CHOICES") or, when no
      // choice list is found, the raw HTML of the question ("HTML_SOLUTION").
      const solutionResult = this.getSolution($, questionEl);
      if (solutionResult.state === "CHOICES") {
        question.solution = {
          choices: solutionResult.choices,
          explanation: solutionResult.explanation,
        };
      } else if (solutionResult.state === "HTML_SOLUTION") {
        question.htmlSolution = solutionResult.htmlSolution;
      } else {
        // getSolution returns {} when no choice is marked as the answer.
        question.solution = solutionResult;
      }

      question.identifier = title.replace(/ /g, "");
      exam.questions.push(question);
    });

    return exam;
  }

  /**
   * Fetches `this.pageUrl` with the request module.
   * @return {Promise<string>} Resolves with the response body; rejects with
   *   the error reported by request.
   */
  getPageString() {
    return new Promise((resolve, reject) => {
      request(this.pageUrl, (error, response, body) => {
        if (error) {
          reject(error);
          return;
        }
        resolve(body);
      });
    });
  }

  /**
   * Retrieves a question solution from a cheerio element.
   * @param {Function} $ cheerio function
   * @param {Object} questionEl the cheerio question element
   * @return {Object} One of:
   *   - `{ choices: [], state: "HTML_SOLUTION", htmlSolution }` when no choice list is found,
   *   - `{ choices, state: "CHOICES", explanation? }` when at least one choice is marked as the answer,
   *   - `{}` when choices exist but none is marked as the answer.
   */
  getSolution($, questionEl) {
    const choicesResult = this.getChoicesElement($, this.choicesSelectors, questionEl);
    if (choicesResult.state !== "CHOICES_ELEMENT") {
      return {
        choices: [],
        state: "HTML_SOLUTION",
        htmlSolution: choicesResult.htmlSolution,
      };
    }

    const explanationMarker = "Explanation:";
    const solution = { choices: [], state: "CHOICES" };
    let hasAnswer = false;

    choicesResult.element.each((choiceIndex, choiceEl) => {
      const choiceText = this.prettifyString($(choiceEl).text());

      // Split the choice text at the explanation marker: the part before it
      // is the choice itself, the part after it is an inline explanation.
      const markerAt = choiceText.indexOf(explanationMarker);
      const name = markerAt === -1 ? choiceText : choiceText.slice(0, markerAt).trim();
      const inlineExplanation =
        markerAt === -1 ? "" : choiceText.slice(markerAt + explanationMarker.length);

      // A dedicated feedback element takes precedence over inline text.
      const explanation = $('div[class="itemfeedback"]', choiceEl).text() || inlineExplanation;
      if (explanation) {
        solution.explanation = this.prettifyString(explanation);
      }

      // The page marks correct choices with a coloured <span>.
      const isAnswer = Boolean($('span[style*="color"]', choiceEl).text());
      if (isAnswer) {
        hasAnswer = true;
      }
      solution.choices.push({ name, isAnswer });
    });

    return hasAnswer ? solution : {};
  }

  /**
   * Removes "Question ID <n>" markers, collapses every run of whitespace
   * (including line breaks) into a single space and trims the result.
   * @param {String} input The string input
   * @return {String} The prettified string, or "" when input is not a string
   */
  prettifyString(input) {
    if (typeof input !== "string") {
      return "";
    }
    return input
      .replace(/Question\sID\s\d+/g, "")
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Returns the first non-empty selection produced by the given selectors.
   * @param {Function} $ The querying function
   * @param {Array} selectors Selectors to try, in order
   * @param {Object} root Element to search within
   * @return {Object|false} The matching selection, or false when no selector matches
   */
  getElement($, selectors, root) {
    let element = null;
    for (const selector of selectors) {
      element = $(selector, root);
      if (element && element.length > 0) {
        return element;
      }
    }
    if (element) {
      console.log("####SelectorError__________", selectors, $(element).html());
    }
    return false;
  }

  /**
   * Finds the choices of a question.
   * @param {Function} $ cheerio function
   * @param {Array} selectors choice selectors, tried in order
   * @param {Object} root root element to select within
   * @return {Object} `{ state: "CHOICES_ELEMENT", element }` when a selector
   *   matches, otherwise `{ state: "HTML_SOLUTION", htmlSolution }` holding
   *   the root element's HTML.
   */
  getChoicesElement($, selectors, root) {
    for (const selector of selectors) {
      const element = $(selector, root);
      if (element && element.length > 0) {
        return { state: "CHOICES_ELEMENT", element };
      }
    }
    /**
     * @todo Extract the images save them to disk. stop relying on their image links.
     */
    return { state: "HTML_SOLUTION", htmlSolution: $(root).html() };
  }
}
module.exports = Scrapper

// Everything below exists for demo purposes only; it is not part of the original class file.
// It can be run with node once the cheerio and request modules are installed.

/**
 * Demo driver: scrapes one practice-exam page and prints the resulting exam
 * object. Failures while retrieving or printing the exam are logged.
 *
 * Expected output:
 *
 * { name: 'CCNA2 Practice Final Exam Answer v5.03 2016',
 *   pageUrl: 'https://www.ccna7.com/ccna2-v5-03/ccna2-practice-final-exam-v5-03/',
 *   version: '',
 *   questions:
 *    [ { title: 'A client is using SLAAC to obtain an IPv6 address for its interface. After an address has been generated and applied to the interface, what must the client do before it can begin to use this IPv6 address?',
 *        solution: [Object],
 *        identifier: 'AclientisusingSLAACtoobtainanIPv6addressforitsinterface.Afteranaddresshasbeengeneratedandappliedtotheinterface,whatmusttheclientdobeforeitcanbegintousethisIPv6address?' }
 *      , //other questions...]}
 */
async function useClass() {
  const scraper = new Scrapper("https://www.ccna7.com/ccna2-v5-03/ccna2-practice-final-exam-v5-03/")
  try {
    const exam = await scraper.getExam()
    console.log(exam)
  } catch (error) {
    console.log("error while retrieving results", error)
  }
}

useClass()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment