Created
April 17, 2018 02:21
-
-
Save elog08/b3e0a96cf0628bbbf9d8acdc74662ce7 to your computer and use it in GitHub Desktop.
ScrapeInfiniteList.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module.exports = function() { | |
return new Promise((resolve, reject) => { | |
// Class for Individual Thread | |
const C_THREAD = '.pagedlist_item:not(.pagedlist_hidden)'; | |
// Class for threads marked for deletion on subsequent loop | |
const C_THREAD_TO_REMOVE = '.pagedlist_item:not(.pagedlist_hidden) .TO_REMOVE'; | |
// Class for Title | |
const C_THREAD_TITLE = '.title'; | |
// Class for Description | |
const C_THREAD_DESCRIPTION = '.search_result_snippet .search_result_snippet .rendered_qtext '; | |
// Class for ID | |
const C_THREAD_ID = '.question_link'; | |
// DOM attribute for link | |
const A_THREAD_URL = 'href'; | |
// DOM attribute for ID | |
const A_THREAD_ID = 'id'; | |
const _log = console.info, | |
_warn = console.warn, | |
_error = console.error, | |
_time = console.time, | |
_timeEnd = console.timeEnd; | |
_time("Scrape"); | |
let page = 1; | |
// Global Set to store all entries | |
let threads = new Set(); // Eliminates dupes | |
// Pause between pagination | |
const PAUSE = 4000; | |
// Accepts a parent DOM element and extracts the title and URL | |
function scrapeSingleThread(elThread) { | |
try { | |
const elTitle = elThread.querySelector(C_THREAD_TITLE), | |
elLink = elThread.querySelector(C_THREAD_ID), | |
elDescription = elThread.querySelector(C_THREAD_DESCRIPTION); | |
if (elTitle) { | |
const title = elTitle.innerText.trim(), | |
description = elDescription.innerText.trim(), | |
id = elLink.getAttribute(A_THREAD_ID), | |
url = elLink.getAttribute(A_THREAD_URL); | |
threads.add({ | |
title, | |
description, | |
url, | |
id | |
}); | |
} | |
} catch (e) { | |
_error("Error capturing individual thread", e); | |
} | |
} | |
// Get all threads in the visible context | |
function scrapeThreads() { | |
_log("Scraping page %d", page); | |
const visibleThreads = document.querySelectorAll(C_THREAD); | |
if (visibleThreads.length > 0) { | |
_log("Scraping page %d... found %d threads", page, visibleThreads.length); | |
Array.from(visibleThreads).forEach(scrapeSingleThread); | |
} else { | |
_warn("Scraping page %d... found no threads", page); | |
} | |
// Return master list of threads; | |
return visibleThreads.length; | |
} | |
// Clears the list between pagination to preserve memory | |
// Otherwise, browser starts to lag after about 1000 threads | |
function clearList() { | |
_log("Clearing list page %d", page); | |
const toRemove = `${C_THREAD_TO_REMOVE}_${(page-1)}`, | |
toMark = `${C_THREAD_TO_REMOVE}_${(page)}`; | |
try { | |
// Remove threads previously marked for removal | |
document.querySelectorAll(toRemove) | |
.forEach(e => e.parentNode.removeChild(e)); | |
// // Mark visible threads for removal on next iteration | |
document.querySelectorAll(C_THREAD) | |
.forEach(e => e.className = toMark.replace(/\./g, '')); | |
} catch (e) { | |
_error("Unable to remove elements", e.message) | |
} | |
} | |
// Scrolls to the bottom of the viewport | |
function loadMore() { | |
_log("Load more... page %d", page); | |
window.scrollTo(0, document.body.scrollHeight); | |
} | |
// Recursive loop that ends when there are no more threads | |
function loop() { | |
_log("Looping... %d entries added", threads.size); | |
if (scrapeThreads()) { | |
try { | |
clearList(); | |
loadMore(); | |
page++; | |
setTimeout(loop, PAUSE) | |
} catch (e) { | |
reject(e); | |
} | |
} else { | |
_timeEnd("Scrape"); | |
resolve(Array.from(threads)); | |
} | |
} | |
loop(); | |
}); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment