Usage: node crawl <keyword>
Created
September 16, 2019 11:24
-
-
Save emsifa/a6ffb458e75fa8549984c3b83cc79610 to your computer and use it in GitHub Desktop.
Puppeteer Crawling Kompas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Third-party and built-in modules used by the crawler.
const puppeteer = require('puppeteer');
const request = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');

// Search keyword: the first command-line argument after the script path.
const keyword = process.argv[2];

// Size, in pixels, used for both the browser window and the page viewport.
const width = 1366;
const height = 768;

// Echo the keyword so the operator can see what is about to be searched.
console.log({keyword});
/**
 * Downloads the page behind every search result and extracts its article text.
 *
 * Pages are fetched one at a time, in input order. The text is taken from the
 * elements matching `.read__content` and trimmed.
 *
 * The returned array is index-aligned with `searchResults`: an entry that has
 * no `url`, or whose download/parsing fails, yields an empty string instead of
 * aborting the whole crawl.
 *
 * @param {Array<{url?: string}>} searchResults - Results to fetch; only `url` is read.
 * @returns {Promise<string[]>} One content string per input entry.
 */
async function crawlArticles(searchResults) {
  const articles = [];

  for (const { url } of searchResults) {
    if (!url) {
      // Nothing to download; keep the slot so indexes still line up.
      console.error('Skip search result without url');
      articles.push('');
      continue;
    }

    console.log('Grab html from: ' + url);
    try {
      const html = await request(url);
      const $ = cheerio.load(html);
      articles.push($('.read__content').text().trim());
    } catch (err) {
      // One broken article must not discard everything crawled so far.
      console.error('Failed to grab html from: ' + url, err);
      articles.push('');
    }
  }

  return articles;
}
/**
 * Scrapes the search results currently shown on `page` and attaches the
 * article text of each one.
 *
 * @param {object} page - Puppeteer page showing a search result listing.
 * @returns {Promise<Array<{title: string, url: string, content: string}>>}
 *   One entry per `a.gs-title` link found inside `.gsc-resultsbox-visible`.
 */
async function crawlSearchResults(page) {
  console.log('Scraping search results');

  // This callback is handed to page.evaluate, so `$` here is whatever the
  // loaded page defines (assumes the site exposes jQuery as `$` — confirm).
  const searchResults = await page.evaluate(() => {
    const articles = [];
    $('.gsc-resultsbox-visible').find('a.gs-title').each(function () {
      articles.push({
        title: $(this).text(),
        url: $(this).attr('href'),
      });
    });
    return articles;
  });

  console.log('Grab contents from search results');
  const contents = await crawlArticles(searchResults);

  // Build new objects instead of mutating the scraped ones inside map().
  return searchResults.map((article, i) => ({ ...article, content: contents[i] }));
}
/**
 * Tells whether the pagination bar of the current page offers page `n`.
 *
 * @param {object} page - Puppeteer page showing a search result listing.
 * @param {number} n - Page number to look for.
 * @returns {Promise<boolean>} True when at least one element matches
 *   `.gsc-cursor-page[aria-label="Page <n>"]`.
 */
async function hasPage(page, n) {
  // `n` is passed as an argument because the callback is handed to
  // page.evaluate and cannot close over variables of this function.
  return page.evaluate((pageNumber) => {
    return $(`.gsc-cursor-page[aria-label="Page ${pageNumber}"]`).length > 0;
  }, n);
}
/**
 * Returns a promise that resolves (with no value) after `duration` milliseconds.
 *
 * @param {number} duration - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
function delay(duration) {
  // Already returns a Promise, so the function does not need to be `async`.
  return new Promise((resolve) => {
    setTimeout(resolve, duration);
  });
}
/**
 * Clicks the pagination link for page `n` and then waits a fixed time.
 *
 * @param {object} page - Puppeteer page showing a search result listing.
 * @param {number} n - Page number whose link should be clicked.
 * @param {number} [waitMs=2000] - Milliseconds to wait after the click
 *   (presumably to let the new results render — no load event is awaited).
 * @returns {Promise<void>} Resolves once the wait has elapsed.
 */
async function openPage(page, n, waitMs = 2000) {
  await page.evaluate((pageNumber) => {
    $(`.gsc-cursor-page[aria-label="Page ${pageNumber}"]`).click();
  }, n);
  return delay(waitMs);
}
// Entry point: search for `keyword`, walk every result page, and write the
// accumulated results to `results-<keyword>.json`.
(async () => {
  if (!keyword) {
    // Without this guard the script would search for the literal "undefined".
    console.error('Usage: node <script> <keyword>');
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch({
    headless: false, // The browser is visible
    ignoreHTTPSErrors: true,
    args: [`--window-size=${width},${height}`], // Match the window to the viewport
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width, height });

    // Encode the keyword so spaces, `&`, `#`, ... cannot corrupt the query string.
    const query = encodeURIComponent(keyword);
    await page.goto(`https://search.kompas.com/search/?q=${query}&submit=Submit+Query`);

    const results = [];
    let p = 1;
    while (true) {
      console.log('Crawling Page ' + p);
      const pageResults = await crawlSearchResults(page);
      results.push(...pageResults);

      // Rewrite the file after every page so progress survives a later failure.
      fs.writeFileSync(`results-${keyword}.json`, JSON.stringify(results, null, 4));

      p++;
      const hasNext = await hasPage(page, p);
      if (!hasNext) {
        break;
      }
      await openPage(page, p);
    }
  } finally {
    // Always release the browser process, even when a step above throws.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "crawl-kompas-search",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^1.0.0-rc.3",
    "puppeteer": "^1.20.0",
    "request": "^2.88.0",
    "request-promise": "^4.2.4"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment