NodeJS Script for Data Fetch: a Livefyre/Coral API scraper that pulls a site's archived comments and writes them out as JSON dump files.

package.json:
{
  "name": "coral-livefyre-api-scraper",
  "version": "1.0.0",
  "description": "API scraper to fetch data and generate dump file",
  "main": "index.js",
  "scripts": {
    "build": "rimraf dist/ && babel ./ --out-dir dist/ --ignore ./node_modules,./.babelrc,./package.json,./npm-debug.log --copy-files",
    "start": "babel-node --presets es2015 index.js"
  },
  "keywords": [
    "coral"
  ],
  "author": "Jon Bonso",
  "license": "ISC",
  "dependencies": {
    "axios": "^0.18.0",
    "node-fetch": "^2.2.0",
    "request": "^2.88.0",
    "request-promise": "^4.2.2"
  },
  "devDependencies": {
    "babel-cli": "^6.26.0",
    "babel-preset-es2015": "^6.24.1",
    "rimraf": "^2.6.2"
  }
}
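
A minimal usage sketch, assuming a newline-separated c_ids.txt of article IDs sits next to index.js (342837 is the example site ID mentioned in the code comments, not a real default):

  npm install
  npm start -- 342837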

index.js:
import fs from 'fs';
import fetch from 'node-fetch';

// Livefyre timestamps are in seconds; appending "000" converts them to milliseconds.
const DATE_SUFFIX = "000";

// One article (collection) ID per line; drop the empty entry a trailing newline leaves behind.
const cList = fs.readFileSync('c_ids.txt').toString().split("\n").filter(Boolean);

// The Livefyre site ID is passed as the first CLI argument.
const siteId = process.argv[2];
console.log('Provided SITE ID: ' + siteId);
/**
 * Fetch the comment archive for every collection ID.
 */
const start = async () => {
  await asyncForEach(cList, async (cId) => {
    console.log('START: ' + cId);
    const site_id = siteId; // e.g. 342837 for DT
    // The bootstrap API addresses collections by their base64-encoded ID.
    const base64cId = Buffer.from(cId).toString('base64');
    const network_name = "ndm";
    const BASE_URL = 'https://' + network_name + '.bootstrap.fyre.co/bs3/v3.1/' + network_name + '.fyre.co/' + site_id + '/' + base64cId;
    const l_ARTICLE_INIT_URL = BASE_URL + '/init';
    await waitFor(cId, l_ARTICLE_INIT_URL);
    console.log('END: ' + cId);
  }).catch(err => {
    console.log(err);
  });
  console.log('Done');
};
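
For reference, the collection URL simply embeds the base64 of the raw article ID; with a hypothetical ID such as 'my-article-1' (invented for illustration) the encoding works out like this:

// Buffer.from('my-article-1').toString('base64') === 'bXktYXJ0aWNsZS0x'
// → https://ndm.bootstrap.fyre.co/bs3/v3.1/ndm.fyre.co/<site_id>/bXktYXJ0aWNsZS0x/init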
/**
 * Iterate over the collection IDs sequentially, awaiting each callback.
 * @param {*} array
 * @param {*} callback
 */
async function asyncForEach(array, callback) {
  console.log('arr len: ' + array.length);
  for (let index = 0; index < array.length; index++) {
    await callback(array[index], index, array);
  }
}
/**
 * Fetch a collection's /init metadata, then page through its archived comments.
 * @param {*} cId raw article (collection) ID
 * @param {*} url the collection's /init bootstrap URL
 */
const waitFor = (cId, url) => new Promise((resolve, reject) => {
  return fetch(url).then(res => res.json())
    .then(async response => {
      try {
        let commentsList = [];
        let allComments = [];
        let allUsers = [];
        let commentsPagesUrls = [];
        // Strip the trailing 'init' (keeping the slash) so page URLs become <base>/<n>.json.
        let BASE_URL = url.substring(0, url.length - 4);
        let network_name = "ndm";
        console.log(JSON.stringify(url));
        const pages = response.collectionSettings.archiveInfo.nPages;
        // Populate all the page URLs from archiveInfo.nPages
        for (var i = 0; i < pages; i++) {
          commentsPagesUrls.push(BASE_URL + i + '.json');
        }
        let requests = commentsPagesUrls.map(pageUrl => fetch(pageUrl));
        /**
         * Fetch all comments
         */
        await Promise.all(requests)
          .then(responses => {
            // for debugging:
            // responses.forEach(r => console.log(`${r.url}: ${r.status}`)); // shows 200 for every url
            return responses;
          })
          // Map the array of responses into an array of response.json() to read their content.
          .then(responses => Promise.all(responses.map(r => r.json())))
          .then(pageComments => pageComments.forEach(pageComment => {
            allComments.push(pageComment.content);
            allUsers.push(pageComment.authors);
          }))
          .then(() => {
            // Keep only comments with vis state 1 or 4 (treated here as publicly visible).
            allComments.forEach(function (comments) {
              comments.forEach(function (comment) {
                if (comment.vis == 1 || comment.vis == 4) {
                  let commentData = {};
                  commentData.body_html = comment.content.bodyHtml;
                  commentData.state = comment.vis;
                  // createdAt is in seconds; convert to ms, then trim '.000Z' off the ISO string.
                  let numCreatedDate = (new Date(parseInt(comment.content.createdAt + DATE_SUFFIX))).toISOString();
                  numCreatedDate = numCreatedDate.substring(0, numCreatedDate.length - 5);
                  commentData.created = numCreatedDate;
                  commentData.id = comment.content.id;
                  // Author IDs look like '<user>@ndm.fyre.co'; keep only the local part.
                  let author = comment.content.authorId;
                  let parentId = comment.content.parentId;
                  author = author.replace("@" + network_name + ".fyre.co", "");
                  commentData.author_id = author;
                  if (parentId) {
                    commentData.parent_id = parentId;
                  }
                  commentsList.push(commentData);
                }
              });
            });
            /**
             * Collect the collection's metadata details
             */
            let collectionSettings = response.collectionSettings;
            let data = {};
            data.allow_comments = true;
            if (pages > 0) {
              const firstCommentDate = response.collectionSettings.archiveInfo.pageInfo[0].first;
              let numCreatedDate = (new Date(parseInt(firstCommentDate + DATE_SUFFIX))).toISOString();
              numCreatedDate = numCreatedDate.substring(0, numCreatedDate.length - 5);
              data.created = numCreatedDate;
              data.title = collectionSettings.title;
              data.comments = commentsList;
              data.source = collectionSettings.url;
              data.article_identifier = cId;
              // Append one JSON line per article.
              fs.appendFile('site_id_comments.json', JSON.stringify(data) + "\n", 'utf8', (err) => {
                if (err) throw err;
                console.log('The file has been saved!');
              });
              var userStream = fs.createWriteStream("users.json", { flags: 'a' });
              // Write the comment authors out, one JSON line per user.
              allUsers.forEach(function (users) {
                let usersArray = [];
                Object.keys(users).forEach(key => {
                  usersArray.push(users[key]);
                });
                usersArray.forEach(function (user) {
                  let lUser = {};
                  lUser.display_name = user.displayName;
                  // User IDs also carry the '@ndm.fyre.co' suffix; strip it here too.
                  let id = user.id;
                  id = id.replace("@" + network_name + ".fyre.co", "");
                  lUser.id = id;
                  // Note: the raw (unstripped) Livefyre ID is used as the email stand-in.
                  lUser.email = user.id;
                  userStream.write(JSON.stringify(lUser) + "\n", 'utf8');
                });
              });
              userStream.end();
            }
          }).catch(error => {
            console.log(error);
          }); // end of the Promise.all chain
      } catch (error) {
        console.log(error);
      }
      // Resolve once this collection has been processed.
      resolve();
    }, error => {
      reject(new Error(error.message));
    });
});
start();
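
A run leaves behind two newline-delimited JSON dumps: site_id_comments.json (one object per article, carrying its metadata and comment list) and users.json (one object per comment author).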