Skip to content

Instantly share code, notes, and snippets.

@andrefs
Created December 11, 2017 00:58
Show Gist options
  • Save andrefs/820205a58a4c8ffa4e4c0a780a948b2e to your computer and use it in GitHub Desktop.
Save andrefs/820205a58a4c8ffa4e4c0a780a948b2e to your computer and use it in GitHub Desktop.
Script to retrieve news articles from a MongoDB database and dump them to text files
const mongo = require('promised-mongo');
const db = mongo('metacache', ['contents']);
const fs = require('fs-extra');
const htmlToText = require('html-to-text');
const Promise = require('bluebird');
/**
 * Dumps articles from the `contents` collection into `folder`, writing one
 * text file per document (named after the document's `_id`).
 *
 * Each file contains the requested header fields, one per line, followed by
 * the document body converted from HTML to plain text. Header fields missing
 * from a document end up as empty lines (`Array.prototype.join` renders
 * `undefined`/`null` as an empty string).
 *
 * @param {Object} filter - Extra MongoDB query conditions. They are merged
 *   with the fixed conditions below; on a key clash the fixed condition wins.
 * @param {string} folder - Destination directory; created if it is missing.
 * @param {string[]} [fields] - Document fields written before the body text.
 *   Defaults to `Pretitle`, `Title`, `Subtitle` and `Lead`.
 * @returns {Promise<Array>} Resolves, once every file is written, with an
 *   array holding one `fs.writeFile` result per document.
 */
async function dumpDocs(filter, folder, fields){
  console.log(`[${new Date().toISOString()}] Dumping`, filter, `into ${folder}`);

  // Fixed conditions come after the spread so callers cannot override them.
  const query = {
    ...filter,
    'Source.Name': 'Observador',
    Type: "sapo.obj.creativework.article",
    Body: {$exists: true},
    $where: 'this.Body.length > 100'
  };
  // The field list is the same for every document, so resolve it once.
  const headerFields = fields || ['Pretitle','Title','Subtitle', 'Lead'];

  const docs = await db.contents.find(query).toArray();

  // writeFile does not create missing directories; without this the dump
  // fails with ENOENT when the target folder does not exist yet.
  await fs.ensureDir(folder);

  // Write the files strictly one after another.
  const results = [];
  for (const doc of docs) {
    const parts = headerFields.map(f => doc[f]);
    if (doc.Body) {
      parts.push(htmlToText.fromString(doc.Body));
    }
    results.push(await fs.writeFile(`${folder}/${doc._id}`, parts.join('\n')));
  }
  return results;
}
// Dump the two categories one after the other, then terminate explicitly.
// NOTE(review): the explicit process.exit is presumably needed because the
// open MongoDB connection keeps the event loop alive — confirm.
dumpDocs({'CategoryPaths':'Economia'}, './_/economia')
  .then(() => dumpDocs({'CategoryPaths':'Desporto'}, './_/desporto'))
  .then(() => {
    console.log('Finished');
    process.exit(0);
  })
  .catch((err) => {
    // Without a handler a failed query or write would go unreported and
    // neither exit call would run.
    console.error('Dump failed:', err);
    process.exit(1);
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment