NodeJS Script for Data Fetch
{
  "name": "coral-livefyre-api-scraper",
  "version": "1.0.0",
  "description": "API scraper to fetch data and generate dump file",
  "main": "index.js",
  "scripts": {
    "build": "rimraf dist/ && babel ./ --out-dir dist/ --ignore ./node_modules,./.babelrc,./package.json,./npm-debug.log --copy-files",
    "start": "babel-node --presets es2015 index.js"
  },
  "keywords": [
    "coral"
  ],
  "author": "Jon Bonso",
  "license": "ISC",
  "dependencies": {
    "axios": "^0.18.0",
    "node-fetch": "^2.2.0",
    "request": "^2.88.0",
    "request-promise": "^4.2.2"
  },
  "devDependencies": {
    "babel-cli": "^6.26.0",
    "babel-preset-es2015": "^6.24.1",
    "rimraf": "^2.6.2"
  }
}
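With this package.json in place, the scraper is meant to be run through the npm scripts above. A minimal usage sketch (assumptions: Node.js and npm are installed, a newline-delimited c_ids.txt sits next to index.js, and the Livefyre site ID is passed as the first argument, e.g. 342837 from the comment in the script):

npm install
npm start -- 342837

The build script transpiles the source into dist/ with Babel; start runs index.js directly through babel-node with the es2015 preset.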
--
import fs from 'fs';
import request from 'request';
import rp from 'request-promise';
import fetch from 'node-fetch';

// Appended to createdAt timestamps to convert seconds to milliseconds for the Date constructor.
const DATE_SUFFIX = "000";

// One collection (article) ID per line; skip blank lines.
let cList = fs.readFileSync('c_ids.txt').toString().split("\n").filter(Boolean);

// The site ID is passed as the first command-line argument.
const siteId = process.argv[2];
console.log('Provided SITE ID: ' + siteId);
/**
 * Fetch data for every collection ID.
 */
const start = async () => {
  await asyncForEach(cList, async (cId) => {
    console.log('START: ' + cId);
    let site_id = siteId; // 342837 for DT
    // The bootstrap API addresses each collection by its base64-encoded article ID.
    let base64cId = Buffer.from(cId).toString('base64');
    let network_name = "ndm";
    const BASE_URL = 'https://' + network_name + '.bootstrap.fyre.co/bs3/v3.1/' + network_name + '.fyre.co/' + site_id + '/' + base64cId;
    const l_ARTICLE_INIT_URL = BASE_URL + '/init';
    await waitFor(cId, l_ARTICLE_INIT_URL);
    console.log('END: ' + cId);
  }).catch(err => {
    console.log(err);
  });
  console.log('Done');
}
/**
 * Loop through all collection IDs sequentially.
 * @param {*} array
 * @param {*} callback
 */
async function asyncForEach(array, callback) {
  console.log('arr len: ' + array.length);
  for (let index = 0; index < array.length; index++) {
    await callback(array[index], index, array);
  }
}
/**
 * Fetch the init document for a collection, then pull every archived comment page,
 * extract comments and authors, and append them to the dump files.
 * @param {*} cId
 * @param {*} url
 */
const waitFor = (cId, url) => new Promise((resolve, reject) => {
  return fetch(url).then(res => res.json())
    .then(async response => {
      try {
        let commentsList = [];
        let allComments = [];
        let allUsers = [];
        let commentsPagesUrls = [];
        let BASE_URL = url.substring(0, url.length - 4); // strip the trailing "init"
        let network_name = "ndm";
        console.log(JSON.stringify(url));
        const pages = response.collectionSettings.archiveInfo.nPages;
        // Populate all the page URLs from archiveInfo.nPages
        for (var i = 0; i < pages; i++) {
          commentsPagesUrls.push(BASE_URL + i + '.json');
        }
        let requests = commentsPagesUrls.map(url => fetch(url));
        /**
         * Fetch all comments
         */
        await Promise.all(requests)
          .then(responses => {
            for (let response of responses) {
              // for debugging
              // console.log(`${response.url}: ${response.status}`); // shows 200 for every url
            }
            return responses;
          })
          // map array of responses into array of response.json() to read their content
          .then(responses => Promise.all(responses.map(r => r.json())))
          .then(pageComments => pageComments.forEach(pageComment => {
            //console.log(pageComment.content.length);
            allComments.push(pageComment.content);
            allUsers.push(pageComment.authors);
          }))
          .then(() => {
            //console.log('Comments total: ' + JSON.stringify(allComments[0].length));
            //console.log('Users total: ' + JSON.stringify(allUsers.length));
            // Fetch Comments
            allComments.forEach(function (comments) {
              comments.forEach(function (comment) {
                if (comment.vis === 1 || comment.vis === 4) {
                  let commentData = {};
                  commentData.body_html = comment.content.bodyHtml;
                  commentData.state = comment.vis;
                  // createdAt is in seconds; append DATE_SUFFIX to get milliseconds,
                  // then drop the ".000Z" suffix from the ISO string.
                  let numCreatedDate = (new Date(parseInt(comment.content.createdAt + DATE_SUFFIX))).toISOString();
                  numCreatedDate = numCreatedDate.substring(0, numCreatedDate.length - 5);
                  commentData.created = numCreatedDate;
                  commentData.id = comment.content.id;
                  let author = comment.content.authorId;
                  let parentId = comment.content.parentId;
                  // Strip the "@<network>.fyre.co" suffix from the author ID.
                  author = author.replace("@" + network_name + ".fyre.co", "");
                  commentData.author_id = author;
                  if (parentId) {
                    commentData.parent_id = parentId;
                  }
                  commentsList.push(commentData);
                }
              });
            });
            /**
             * Fetch Comments metadata details
             */
            let collectionSettings = response.collectionSettings;
            let data = {};
            data.allow_comments = true;
            if (pages > 0) {
              const firstCommentDate = response.collectionSettings.archiveInfo.pageInfo[0].first;
              let numCreatedDate = (new Date(parseInt(firstCommentDate + DATE_SUFFIX))).toISOString();
              numCreatedDate = numCreatedDate.substring(0, numCreatedDate.length - 5);
              data.created = numCreatedDate;
              data.title = collectionSettings.title;
              data.comments = commentsList;
              data.source = collectionSettings.url;
              data.article_identifier = cId;
              //fs.writeFile('site_id_comments.json', body, 'utf8');
              // Append one JSON object per line for this collection.
              fs.appendFile('site_id_comments.json', JSON.stringify(data) + "\n", 'utf8', (err) => {
                if (err) throw err;
                console.log('The file has been saved!');
              });
              var userStream = fs.createWriteStream("users.json", { flags: 'a' });
              // Fetch Users
              allUsers.forEach(function (users) {
                let usersArray = [];
                Object.keys(users).forEach(key => {
                  usersArray.push(users[key]);
                });
                //console.log('userArr: ' + usersArray.length);
                usersArray.forEach(function (user) {
                  let lUser = {};
                  lUser.display_name = user.displayName;
                  let id = user.id;
                  id = id.replace("@" + network_name + ".fyre.co", "");
                  lUser.id = id;
                  lUser.email = user.id;
                  userStream.write(JSON.stringify(lUser) + "\n", 'utf8');
                });
              });
              userStream.end();
            }
          }).catch(error => {
            console.log(error);
          }); // promise
      } catch (error) {
        console.log(error);
      }
      // Resolve
      resolve();
    }, error => {
      reject(new Error(error.message));
    });
});

start();
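Both dump files are written as one JSON object per line, so they can be loaded back with a simple split-and-parse. A minimal sketch of how the output might be consumed (an assumption for illustration, not part of the original script):

const fs = require('fs');

// Each line of site_id_comments.json is one collection with its comments.
const collections = fs.readFileSync('site_id_comments.json', 'utf8')
  .split('\n')
  .filter(Boolean)
  .map(line => JSON.parse(line));

// users.json may repeat authors across collections; dedupe by id.
const users = new Map();
fs.readFileSync('users.json', 'utf8')
  .split('\n')
  .filter(Boolean)
  .map(line => JSON.parse(line))
  .forEach(user => users.set(user.id, user));

console.log(collections.length + ' collections, ' + users.size + ' unique users');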