Created
December 11, 2017 00:58
-
-
Save andrefs/820205a58a4c8ffa4e4c0a780a948b2e to your computer and use it in GitHub Desktop.
Script to retrieve news articles from a MongoDB database
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const mongo = require('promised-mongo'); | |
const db = mongo('metacache', ['contents']); | |
const fs = require('fs-extra'); | |
const htmlToText = require('html-to-text'); | |
const Promise = require('bluebird'); | |
/**
 * Dump articles from the `contents` collection into `folder`, one plain-text
 * file per document, named after the document's `_id`.
 *
 * Each file contains the selected header fields (one per line, in the order
 * given) followed by the article body converted from HTML to text. A header
 * field that is missing on a document yields an empty line.
 *
 * @param {Object} filter - Extra query criteria. The fixed criteria below are
 *   spread after it, so they win on any key collision.
 * @param {string} folder - Destination directory; created if it does not exist.
 * @param {string[]} [fields] - Document properties written before the body.
 *   Defaults to `['Pretitle', 'Title', 'Subtitle', 'Lead']`.
 * @returns {Promise<Array>} Settles once every file has been written.
 */
async function dumpDocs(filter, folder, fields) {
  console.log(`[${new Date().toISOString()}] Dumping`, filter, `into ${folder}`);

  const query = {
    ...filter,
    'Source.Name': 'Observador',
    Type: "sapo.obj.creativework.article",
    Body: {$exists: true},
    // Evaluated by the database per document; presumably skips near-empty
    // bodies (assumes Body is a string — TODO confirm).
    $where: 'this.Body.length > 100'
  };

  // The default does not depend on the document, so resolve it once.
  const headerFields = fields || ['Pretitle', 'Title', 'Subtitle', 'Lead'];

  const docs = await db.contents.find(query).toArray();

  // writeFile does not create missing parent directories; without this the
  // first write rejects with ENOENT when `folder` has not been created yet.
  await fs.ensureDir(folder);

  // Write the files one at a time, in query order.
  return Promise.mapSeries(docs, (doc) => {
    const parts = headerFields.map((name) => doc[name]);
    if (doc.Body) {
      parts.push(htmlToText.fromString(doc.Body));
    }
    return fs.writeFile(`${folder}/${doc._id}`, parts.join('\n'));
  });
}
// Dump each category in turn, then terminate explicitly (presumably the open
// database connection would otherwise keep the process alive — TODO confirm).
dumpDocs({'CategoryPaths':'Economia'}, './_/economia')
  .then(() => dumpDocs({'CategoryPaths':'Desporto'}, './_/desporto'))
  .then(() => {
    console.log('Finished');
    process.exit(0);
  })
  .catch((err) => {
    // Without a handler, a failed query or write becomes an unhandled
    // rejection and the script never reports a failing exit status.
    console.error(err);
    process.exit(1);
  });
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment