Skip to content

Instantly share code, notes, and snippets.

@rajvermacas
Last active October 20, 2024 11:34
Show Gist options
  • Save rajvermacas/36a6cfa5406eb3b218170f0dd1119e40 to your computer and use it in GitHub Desktop.
Save rajvermacas/36a6cfa5406eb3b218170f0dd1119e40 to your computer and use it in GitHub Desktop.
stock export
/**
 * Scrapes the screener table on the current page into column-oriented data.
 *
 * Uses the first `<table>` that is a direct child of the element with id
 * `screener-table`. For every `<th>` in that table it collects the header
 * label and the text of the matching cells in the table's first `<tbody>`.
 *
 * @returns {{column: string, rows: string[]}[]} One entry per header cell, in
 *   DOM order:
 *   {
 *     column: "Column Name",            // "#" when the header has no label element
 *     rows: ["value1", "value2", ...]   // cell texts with every comma removed
 *   }
 * @throws {Error} If no table matches `#screener-table > table`, or if the
 *   table has header cells but no `<tbody>`.
 */
const scrapeData = () => {
  // 1. Find the table (first <table> directly under #screener-table).
  const table = document.querySelectorAll("#screener-table > table")[0];
  if (!table) {
    throw new Error('No table found matching "#screener-table > table"');
  }

  // 2. Get all table headers and the body that holds the data rows.
  //    The body is looked up once here instead of once per header.
  const headingElements = table.querySelectorAll("th");
  const body = table.querySelectorAll("tbody")[0];
  if (!body && headingElements.length > 0) {
    throw new Error("Screener table has header cells but no <tbody> to read rows from");
  }

  // 3. Transform every header into { column, rows }.
  return Array.from(headingElements).map((elm) => {
    // 4. Derive the class that marks this column's cells:
    //    header id "name" -> "data-col", any other id -> "<id>-col".
    const headerId = elm.getAttribute("id");
    const columnClass = headerId === "name" ? "data-col" : headerId + "-col";

    // 5. Find the column's cells.
    //    [class~="..."] is used instead of ".<class>" because ids such as
    //    "26wpct" start with a digit, and ".26wpct-col" is not a valid CSS
    //    selector (querySelectorAll throws a SyntaxError on it).
    //    Quotes and backslashes are escaped so an unusual id cannot break the
    //    quoted attribute value.
    const quotedClass = columnClass.replace(/["\\]/g, "\\$&");
    const cellElements = body.querySelectorAll(
      `tbody [class~="${quotedClass}"] .ellipsis .desktop--only`
    );

    // 6. The label is the ".desktop--only" element nested inside ".data-cell";
    //    headers without one are reported as "#".
    const labelElement = elm.querySelector(".data-cell .desktop--only");

    return {
      column: labelElement ? labelElement.textContent : "#",
      // Commas are stripped from each cell's text (e.g. "1,234.5" -> "1234.5").
      rows: Array.from(cellElements).map((cell) => cell.textContent.replaceAll(",", "")),
    };
  });
};
/**
 * Converts column-oriented scraped data into CSV text.
 *
 * The first and last entries of `scrapedData` are dropped before conversion
 * (presumably the "#" index column and a trailing non-data column — confirm
 * against the page being scraped). The input array is not modified.
 *
 * Output format:
 *   - line 1: column names joined by ","; a name is wrapped in double quotes
 *     only when it contains a comma, a double quote or a line break;
 *   - following lines: one per row, every value wrapped in double quotes with
 *     embedded double quotes doubled;
 *   - lines are joined with "\n" and there is no trailing newline.
 *
 * The number of rows is taken from the first remaining column; a column that
 * has no value at a given row contributes an empty field ("").
 *
 * @param {{column: string, rows: string[]}[]} scrapedData Columns in the
 *   shape `{ column, rows }`.
 * @returns {string} The CSV text, or "" when no columns remain after dropping
 *   the first and last entries.
 */
const generateCSVData = scrapedData => {
  const columns = scrapedData.slice(1, -1);
  if (columns.length === 0) {
    return "";
  }

  // Always-quoted field; doubling embedded quotes keeps the field well-formed.
  const quoteField = (value) => `"${String(value ?? "").replaceAll('"', '""')}"`;
  // Header names stay bare unless quoting is required to keep the line parseable.
  const formatHeader = (name) => {
    const text = String(name);
    return /[",\r\n]/.test(text) ? quoteField(text) : text;
  };

  // Rows are assembled positionally from the column arrays rather than through
  // an object keyed by column name: an object would merge columns that share a
  // name and would move integer-like names (e.g. "2023") ahead of the others.
  const headerLine = columns.map(({ column }) => formatHeader(column)).join(",");
  const rowCount = columns[0].rows.length;
  const rowLines = Array.from({ length: rowCount }, (_, rowIndex) =>
    columns.map(({ rows }) => quoteField(rows[rowIndex])).join(",")
  );

  return [headerLine, ...rowLines].join("\n");
};
/**
 * Starts a browser download of the given CSV text.
 *
 * Creates a detached `<a>` element whose href is a `data:text/csv` URL
 * carrying the CSV, sets its `download` attribute to the page title, and
 * clicks it programmatically.
 *
 * @param {string} csvData CSV text to download.
 * @returns {void}
 */
const downloadFile = csvData => {
  const anchor = document.createElement('a');
  // encodeURIComponent, not encodeURI: encodeURI leaves "#" unescaped, and in
  // a data: URL everything after the first "#" is parsed as a fragment, which
  // would cut the CSV off at that character.
  anchor.href = `data:text/csv;charset=utf-8,${encodeURIComponent(csvData)}`;
  anchor.target = '_blank';
  // The suggested file name is the current page title.
  anchor.download = `${document.title}`;
  anchor.click();
};
// Run the export: scrape the table, turn it into CSV text, then download it.
// The block scope keeps the intermediate names out of the top-level scope.
{
  const scrapedColumns = scrapeData();
  const csvText = generateCSVData(scrapedColumns);
  downloadFile(csvText);
}
// ----- Debug variant of scrapeData with verbose logging (kept commented out) -----
// const scrapeData = () => {
// try {
// // Log initial execution
// console.log("Starting scrapeData function");
// const sections = document.querySelectorAll("#screener-table > table");
// console.log("Found tables:", sections.length);
// if (sections.length === 0) {
// throw new Error("No table found with ID 'screener-table'");
// }
// const headingElements = sections[0].querySelectorAll("th");
// console.log("Found heading elements:", headingElements.length);
// if (headingElements.length === 0) {
// throw new Error("No header elements (th) found in table");
// }
// const scrapedData = Array.from(headingElements).map((elm, index) => {
// try {
// // Log header element details
// console.log(`\nProcessing header ${index}:`, elm);
// console.log("Header ID:", elm.getAttribute("id"));
// const selector = elm.getAttribute("id") === "name" ?
// "data-col" : elm.getAttribute("id") + "-col";
// console.log("Generated selector:", selector);
// // Validate selector before using it
// if (selector.includes(".") || selector.includes("#") || /^\d/.test(selector)) {
// console.warn(`Invalid selector detected: ${selector}`);
// // Clean the selector - remove problematic characters or handle numerics
// const cleanSelector = selector.replace(/^\d+/, 'n$&').replace(/[.#]/g, '_');
// console.log("Cleaned selector:", cleanSelector);
// }
// // Build selector parts separately for debugging
// const fullSelector = `tbody [class~="${selector}"] .ellipsis .desktop--only`;
// console.log("Full selector being used:", fullSelector);
// const rowElements = sections[0].querySelectorAll(fullSelector);
// console.log("Found row elements:", rowElements.length);
// return {
// column: (() => {
// try {
// const cellElement = elm.querySelector(".data-cell .desktop--only");
// return cellElement ? cellElement.textContent.trim() : "#";
// } catch (cellError) {
// console.error("Error getting column name:", cellError);
// return "#";
// }
// })(),
// rows: Array.from(rowElements).map((el, rowIndex) => {
// try {
// return el.textContent.replaceAll(",", "").trim();
// } catch (rowError) {
// console.error(`Error processing row ${rowIndex}:`, rowError);
// return "";
// }
// })
// };
// } catch (elementError) {
// console.error(`Error processing header element ${index}:`, elementError);
// return {
// column: `Error_Column_${index}`,
// rows: []
// };
// }
// });
// console.log("Final scraped data:", scrapedData);
// return scrapedData;
// } catch (error) {
// console.error("Error in scrapeData function:", error);
// console.error("Error stack:", error.stack);
// // Return empty array or error indicator
// return [{
// column: "Error",
// rows: [`Error scraping data: ${error.message}`]
// }];
// }
// };
// // Test the function with error catching
// try {
// const result = scrapeData();
// console.log("Function completed. Result:", result);
// } catch (error) {
// console.error("Error executing scrapeData:", error);
// }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment