Created
November 25, 2019 16:51
-
-
Save abhinavKeshri07/11a62145222fc091b83875e2c2863f31 to your computer and use it in GitHub Desktop.
This file shows you how to scrape data from a website. Thank me later.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const cheerio = require('cheerio'); | |
const fs = require('fs'); | |
const readline = require('readline'); | |
const get_post_data = require('./get_POST_data'); | |
const get_get_data = require('./get_GET_data'); | |
// URL that loadDataInFile passes to get_post_data together with form_data and headers.
const callback_url = 'https://xyz.com';

// This form data object contains the query to be sent.
const form_data = {
    'm_hc': '01',
    'm_sideflg': 'C',
    'm_sr': 'R',
    'm_skey': 'AO',
    'frmdate': '01-11-2018', // Start date of the query; change as needed.
    'todate': '21-11-2018', // End date of the query; change as needed.
    'submit11': 'List By Case Type' // This field is necessary for this particular request.
};

// Necessary headers for the POST request to casequery_action.php.
const headers = {
    'Accept': '*/*',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'My post Script'
};

// Shared cheerio handle; stays undefined until a loader assigns it.
let $;

// Output stream for the "name,url" rows; opening it truncates any existing allLinks.csv.
const writeStream = fs.createWriteStream('allLinks.csv');

// This counter keeps track of how many entries we got in this query.
let LinkCounter = 0;
/**
 * Posts the search form, extracts every `font a` anchor from the response and
 * appends one "name,url" row per anchor to the shared allLinks.csv stream.
 * Once the rows have been handed to the file it starts loadDetailInFile().
 *
 * Rows are written in a single chunk and the detail step is started from the
 * write callback, so the reader never opens the file before the data is there.
 * Anchor text is collapsed to one line without commas because the reader
 * splits each row on ',' and expects the URL in the second field.
 *
 * @returns {Promise<boolean>} resolves to true when the links were stored and
 *   the detail step was started, false when anything failed (the error is
 *   logged, never re-thrown).
 */
const loadDataInFile = function() {
    return get_post_data(callback_url, form_data, headers)
        .then((response) => {
            const page = cheerio.load(response);
            const rows = [];
            page('font a').each((i, ele) => {
                const anchor = page(ele);
                const link = anchor.attr('href');
                if (!link) {
                    // An anchor without href has no case page to visit.
                    return;
                }
                const item = anchor.text().replace(/[\s,]+/g, ' ').trim();
                rows.push(`${item},https://xyz.com/${link}\n`);
            });
            LinkCounter += rows.length;
            if (rows.length === 0) {
                return undefined;
            }
            return new Promise((resolve, reject) => {
                writeStream.write(rows.join(''), (writeError) => {
                    if (writeError) {
                        reject(writeError);
                    } else {
                        resolve();
                    }
                });
            });
        })
        .then(() => {
            console.log("done fetching data and stored it in csv file");
            // All the case links are now in "allLinks.csv";
            // fetch the details of each case next.
            loadDetailInFile();
            return true;
        })
        .catch((error) => {
            console.log("Error while loadin data form server");
            console.log(error);
            return false;
        });
};
/**
 * Reads "allLinks.csv" line by line, requests the URL found in the second
 * comma-separated field of each line and appends one JSON object per fetched
 * page to "detailCases.csv" (opening that file truncates it).
 *
 * Requests are started as lines arrive, so several may be in flight at once.
 * When a request fails with the message "not 200 statuscode" the link list is
 * refreshed through funcH(); only one refresh runs at a time even if many
 * requests fail together. The output stream is ended once the reader has
 * closed and no request is still pending.
 *
 * NOTE(review): a line whose request failed is not retried, and the refresh
 * itself ends up calling this function again — confirm that is intended.
 */
const loadDetailInFile = function() {
    let DetailCounter = 0;
    let pendingRequests = 0;
    let inputFinished = false;
    let outputEnded = false;
    let isRefreshing = false;

    const readStream = fs.createReadStream('allLinks.csv');
    const detailWriteStream = fs.createWriteStream('detailCases.csv');
    const rl = readline.createInterface({
        input: readStream,
        terminal: false,
        preserveCursor: true
    });

    // Ends the output file exactly once, after the last pending request settled.
    const endOutputWhenIdle = () => {
        if (inputFinished && pendingRequests === 0 && !outputEnded) {
            outputEnded = true;
            detailWriteStream.end();
        }
    };

    rl.on('close', () => {
        inputFinished = true;
        endOutputWhenIdle();
    });

    rl.on('line', function(line) {
        // Each line of "allLinks.csv" is "name,url".
        const url = line.split(',')[1];
        if (!url) {
            // Blank or malformed line: nothing to fetch.
            return;
        }
        pendingRequests++;
        get_get_data(url)
            .then((response) => {
                console.log(response);
                console.log("\n\n\n\n\n");
                DetailCounter++;
                const page = cheerio.load(response);
                const detail = {};
                detail['Petitioner'] = page('select[name="m_petno"] option').text();
                detail['Respondent'] = page('select[name="m_resno"] option').text();
                detail['Pent.Adv'] = page('select[name="m_padv"] option').text();
                // Similarly other details can be extracted.
                detailWriteStream.write(JSON.stringify(detail) + "\n");
                console.log(DetailCounter + " \n\n\n");
                if (DetailCounter >= LinkCounter) {
                    rl.close();
                }
            })
            .catch((error) => {
                if (!error || error.message !== "not 200 statuscode") {
                    console.log(error);
                    return undefined;
                }
                // Auth failed, so the request for all the urls has to be made again.
                console.log("Error geting case details. Again refreshing Links");
                if (isRefreshing) {
                    // Another failed request already started the refresh.
                    return undefined;
                }
                isRefreshing = true;
                LinkCounter = 0;
                rl.pause();
                return funcH()
                    .then(() => {
                        if (!inputFinished) {
                            rl.resume();
                        }
                    })
                    .catch((refreshError) => {
                        console.log("error occured while refetching all the links");
                        console.log(refreshError);
                    })
                    .then(() => {
                        isRefreshing = false;
                    });
            })
            .then(() => {
                pendingRequests--;
                endOutputWhenIdle();
            });
    });
};
/**
 * Re-runs loadDataInFile() and reports its outcome as a promise.
 *
 * loadDataInFile() may hand back a plain value or a promise; either way the
 * result is awaited before deciding, so the returned promise settles only
 * after the reload has actually finished.
 *
 * @returns {Promise<Object>} resolves with an empty object when the reload
 *   reported success; rejects with an Error when it reported a falsy result
 *   or threw.
 */
const funcH = function() {
    return new Promise((resolve) => {
        // Resolving with the call result adopts a returned promise and turns
        // a synchronous throw into a rejection.
        resolve(loadDataInFile());
    }).then((succeeded) => {
        if (!succeeded) {
            throw new Error('reloading the link list failed');
        }
        return {};
    });
};
// Entry point: fetch the link list; loadDataInFile itself starts the
// per-case detail scrape once the links have been stored.
loadDataInFile();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment