Created
March 16, 2019 18:34
-
-
Save rcdilorenzo/8c2004ed29b06fd73a4d3ddb1c3e0a50 to your computer and use it in GitHub Desktop.
A Puppeteer script to download all discussions for a given WorldClass topic (assumes the WORLDCLASS_DOMAIN environment variable is set)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const read = require('read'); | |
const htmlToText = require('html-to-text').fromString; | |
const R = require('ramda'); | |
const Promise = require('bluebird'); | |
const fs = require('fs'); | |
const download = require('@jinphen/download2'); | |
const { CookieJar } = require('tough-cookie'); | |
// Data-last variant of Bluebird's `Promise.mapSeries`: the iterator comes
// first and the collection second, and the function is curried so it can be
// partially applied inside a `.then(...)` chain.
const mapSeries = R.curry((iterator, items) => Promise.mapSeries(items, iterator));
/**
 * Collects every discussion post listed on the currently loaded topic page.
 *
 * This function is handed to `page.evaluate`, so it has to remain
 * self-contained and may only rely on DOM globals.
 *
 * @returns {Array<Object>} One record per post with `title`, `author`,
 *   `link`, `postedAt`, `html` and `attachments` (list of hrefs).
 */
const extractPosts = () => {
  const sections = Array.from(
    document.getElementsByClassName('d2l-datalist-item d2l-datalist-simpleitem')
  );
  const posts = [];

  for (const section of sections) {
    const heading = section.querySelector('.d2l-linkheading-link');
    const link = heading.href;
    const title = heading.title;

    // The secondary text block reads "<author> posted <date>".
    const byline = section.querySelector('.d2l-textblock-secondary').innerText;
    const [author, postedAt] = byline.split(' posted ');

    const html = section.querySelector('.d2l-htmlblock').innerHTML;

    const attachments = [];
    for (const node of Array.from(section.querySelectorAll('.d2l-filelink-text'))) {
      attachments.push(node.href);
    }

    posts.push({ title, author, link, postedAt, html, attachments });
  }

  return posts;
};
/**
 * Reads every entry of the first `.d2l-datalist` element on the current page.
 *
 * This function is handed to `page.evaluate`, so it has to remain
 * self-contained and may only rely on DOM globals.
 *
 * @returns {Array<Object>} One record per child element with `author`,
 *   `postedAt` and `html`.
 */
const extractComments = () => {
  const list = document.querySelector('.d2l-datalist');

  const toComment = section => {
    const author = section.querySelector('.d2l-heading').innerText;
    const postedAt = section.querySelector('.d2l-fuzzydate').innerText;
    const html = section.querySelector('.d2l-htmlblock').innerHTML;
    return { author, postedAt, html };
  };

  return Array.from(list.children, section => toComment(section));
};
/**
 * Prompts on the terminal and resolves with the text that was entered.
 *
 * @param {Object} options - Passed unchanged to `read`; this script supplies
 *   `prompt` and, for the password, `silent`.
 * @returns {Promise<string>} Resolves with the entered value. Rejects with the
 *   error reported by `read`, or with an `Error` when the answer is empty.
 */
const readVariable = options => {
  return new Promise((resolve, reject) => {
    read(options, (error, value) => {
      if (error) {
        // Hand the original error back so callers can see why the prompt failed.
        reject(error);
        return;
      }
      if (value === '') {
        const label = options && options.prompt
          ? ` for "${String(options.prompt).trim()}"`
          : '';
        reject(new Error(`A non-empty value is required${label}`));
        return;
      }
      resolve(value);
    });
  });
};
/**
 * Swaps a record's raw `html` field for a plain-text `text` field.
 *
 * @param {Object} post - Any record carrying an `html` string.
 * @returns {Object} A copy of `post` without `html` and with `text` holding
 *   the HTML rendered as text, wrapped at 100 columns.
 */
const interpretHTML = post => {
  const text = htmlToText(post.html, { wordwrap: 100 });
  const withText = R.assoc('text', text, post);
  return R.omit(['html'], withText);
};
/**
 * Downloads one attachment into `<outputFolder>/attachments/<author-slug>`.
 *
 * Curried, so it can be partially applied with everything but the URL and
 * then mapped over a list of attachment URLs.
 *
 * @param {string} outputFolder - Base folder for this export.
 * @param {Array<{name: string, value: string}>} cookies - Cookies sent along
 *   as a `Cookie` header (presumably the logged-in browser session).
 * @param {string} author - Post author; used to name the sub-folder.
 * @param {string} url - Attachment URL to fetch.
 * @returns {Promise<string>} Path of the saved file.
 */
const downloadAttachment = R.curry((outputFolder, cookies, author, url) => {
  const cookieHeader = cookies
    .map(({ name, value }) => `${name}=${value}`)
    .join('; ');

  // A string pattern in `replace` only touches the first match, which left
  // spaces in the folder name for authors with more than two name parts.
  const authorSlug = author.trim().toLowerCase().replace(/\s+/g, '-');
  const folder = `${outputFolder}/attachments/${authorSlug}`;

  return download(url, folder, { headers: { 'Cookie': cookieHeader } })
    .then(({ filename }) => `${folder}/${filename}`)
    .then(R.tap(savedPath => console.log(`Downloaded ${savedPath}`)));
});
// Entry point: prompts for credentials and topic details, logs in with a
// headless browser, scrapes every post (plus its attachments and comments)
// and writes the result to `<folder>/<topicName>-topic.json`.
(async () => {
  // Fail before prompting instead of navigating to "https://undefined".
  const domain = process.env.WORLDCLASS_DOMAIN;
  if (!domain) {
    throw new Error('The WORLDCLASS_DOMAIN environment variable must be set');
  }

  const username = await readVariable({ prompt: 'Username: ' });
  const password = await readVariable({
    prompt: 'Password [not shown]: ',
    silent: true
  });
  const topicURL = await readVariable({ prompt: 'Topic URL: ' });
  const topicName = await readVariable({ prompt: 'Topic Name: ' });
  const folder = await readVariable({ prompt: 'Folder Name: ' });

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    console.log('Logging in...');
    await page.goto(`https://${domain}`, { waitUntil: 'networkidle2' });
    await page.type('#username', username);
    await page.type('#password', password);
    await page.click('[name="Login"]');

    console.log('Extracting posts...');
    // Wait for an element that the script treats as the sign that login finished.
    await page.waitForSelector('.course-image-container');
    await page.goto(topicURL, { waitUntil: 'networkidle2' });
    const summaries = R.map(interpretHTML, await page.evaluate(extractPosts));

    console.log('Extracting comments...');
    // Strictly sequential: every post navigates the single shared page.
    const posts = await mapSeries(async (post) => {
      // Download attachments
      const cookies = await page.cookies();
      const attachments = await mapSeries(
        downloadAttachment(folder, cookies, post.author),
        post.attachments
      );

      // Download comments
      console.log(`Getting comments for "${post.title}"...`);
      await page.goto(post.link, { waitUntil: 'networkidle2' });
      const comments = R.map(interpretHTML, await page.evaluate(extractComments));

      return { ...post, comments, attachments };
    }, summaries);

    // Create the output folder here so the write does not depend on an
    // attachment download having created it.
    fs.mkdirSync(folder, { recursive: true });
    const filename = `${folder}/${topicName}-topic.json`;
    fs.writeFileSync(filename, JSON.stringify(posts, null, 2));
    console.log(`Topic saved to ${filename}`);
  } finally {
    // Always shut the browser down, otherwise a failure leaves it running.
    await browser.close();
  }
})().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment