Created
April 24, 2023 05:51
-
-
Save jazzyjackson/66ff45fb32224b60c82ecee558cf6d15 to your computer and use it in GitHub Desktop.
scrape chatorg.ai
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// MAIN FUNCTION
// Scrapes the conversations from the chatorg website and downloads them as
// conversationHistory.json.
//
// Steps: expand the sidebar and grab all conversation anchors, then for each
// anchor (starting at index `min`) click it, wait for the page to load, and
// pull the current chat history out of #chatlog.
//
// min     - index of the first anchor to scrape (default 0); earlier anchors
//           are skipped and do not appear in the output
// returns - a Promise of the scraped array: one {title, messages} object per
//           conversation, where messages is a list of {name, html} entries
async function main(min = 0) {
  // click all the empty chakra links so every conversation anchor is present
  const anchors = await clickEmptyChakraLinks();
  // declared locally: assigning to an undeclared name throws in strict/module code
  const conversations = [];

  // the conversations have to be visited one at a time because each click
  // replaces the chat history shown on the page
  for (let i = min; i < anchors.length; i++) {
    const anchor = anchors[i];
    anchor.click();
    const title = anchor.innerText;

    // fixed delay to give the page time to load the clicked conversation
    await wait(3500);

    const conversation = { title, messages: [] };
    const container = document.querySelector('#chatlog');

    if (container === null) {
      // keep crawling instead of aborting the whole run on one missing chat log
      console.warn("No #chatlog found for conversation", title);
    } else {
      // for each child of the container, get the name from the aria-label of
      // the avatar and the content from the div.chat-message-wrapper
      for (let j = 0; j < container.children.length - 1; j++) { // length - 1 because last message isn't real
        const child = container.children[j];
        try {
          console.log("querying ", j);
          const name = child.querySelector('.chakra-avatar div').getAttribute('aria-label');
          const html = child.querySelector('.chat-message-wrapper').innerHTML;
          conversation.messages.push({ name, html });
        } catch (e) {
          // best effort: a child without an avatar or message wrapper is skipped
          console.log("Failed to get name or html", j);
          console.warn(e);
        }
      }
    }

    // push (rather than assign at index i) so that starting at min > 0 does
    // not leave holes that serialise as null in the JSON output
    conversations.push(conversation);
    console.log("PUSHED CONVERSATION", title);
  }

  // download the conversation history
  downloadStringAsFile(JSON.stringify(conversations), 'conversationHistory.json');
  return conversations;
}
// Delay helper used to throw some pauses into the crawl.
//
// ms      - number of milliseconds to wait
// returns - a Promise that resolves (with no value) once the timer fires
function wait(ms) {
  // setTimeout is callback based, so it is adapted to a Promise here
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Function definitions BEGIN
// Repeatedly clicks every a.chakra-link that has no href until none remain,
// pausing 100 ms after each round so the page can react to the clicks.
//
// returns - a Promise of the NodeList of a.chakra-link anchors whose href
//           attribute starts with "/chat". If a DOM call throws during any
//           round the Promise rejects, so the caller is never left waiting
//           on a promise that cannot settle.
async function clickEmptyChakraLinks() {
  for (;;) {
    // grab all the chakra links, keep only the a tags with no href
    const emptyChakraLinks = Array.from(document.querySelectorAll('a.chakra-link'))
      .filter((a) => !a.href);

    // nothing left to click: hand back the conversation anchors
    if (emptyChakraLinks.length === 0) {
      return document.querySelectorAll('a.chakra-link[href^="/chat"]');
    }

    // click all of the empty chakra links found in this round
    emptyChakraLinks.forEach((a) => a.click());

    // wait for the page to update before looking for more
    await wait(100);
  }
}
// Clicks a conversation link and waits for its chat history to be loaded.
//
// Clicking a link takes a moment to load the next conversation. What happens
// when a history is clicked is that the #chatlog is first removed and then
// later re-appended, so a mutation observer on the container sets a flag once
// there is no longer a #chatlog, and the next time a #chatlog exists the
// promise is resolved with it.
//
// element   - the link to click; its innerText is logged
// container - the node that is observed (including its subtree) and queried
//             for #chatlog
// returns   - a Promise of the #chatlog element found after the removal.
//             There is no timeout: if the chatlog is never removed and
//             re-added, the Promise stays pending.
function loadHistory(element, container) {
  console.log("LOADING HISTORY", element.innerText);
  let chatlogRemoved = false;

  return new Promise((resolve) => {
    const observer = new MutationObserver(() => {
      const chatlog = container.querySelector('#chatlog');
      if (!chatlog) {
        // the chat history has been removed: remember that
        chatlogRemoved = true;
        console.log('!!! chatlog removed');
      } else if (chatlogRemoved) {
        // a chatlog exists again after the removal: stop observing and resolve
        observer.disconnect();
        console.log('!!! resolving');
        resolve(chatlog);
      }
    });

    // start observing BEFORE clicking, so a removal triggered while the click
    // is being handled cannot happen unobserved
    observer.observe(container, { childList: true, subtree: true });
    element.click();
  });
}
// Triggers a browser download of a string as a text file.
//
// str      - the text to put in the file
// filename - the suggested name for the download (default 'download.txt')
function downloadStringAsFile(str, filename = 'download.txt') {
  // Create a Blob object from the input string and a temporary URL for it
  const blob = new Blob([str], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);

  // Create a hidden anchor element with the download attribute
  const link = document.createElement('a');
  link.href = url;
  link.download = filename;
  link.style.display = 'none';
  document.body.appendChild(link);

  try {
    // Click the anchor programmatically to start the download
    link.click();
  } finally {
    link.remove();
    // Release the object URL so the blob is not kept alive for the lifetime
    // of the page; deferred so the download that was just started can still
    // read from it.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment