import puppeteer, { Page } from "puppeteer"
// import { analyzeWithChatGPT } from "./analyser"
import { saveToFile } from "./fileHandler"

const baseUrl = "https://www.pathofexile.com"
const startUrl = `${baseUrl}/forum/view-thread/3645467`
// Utility: strip line breaks, quoted blocks, and surrounding whitespace
const cleanContent = (content: string): string =>
  content
    .replace(/\n/g, "") // Remove line breaks
    .replace(/\[quote.*?\].*?\[\/quote\]/gi, "") // Strip [quote]...[/quote] blocks
    .trim() // Trim leading/trailing whitespace
const fetchHtml = async (url: string, page: Page): Promise<Page> => {
  await page.goto(url, { waitUntil: "domcontentloaded" })
  return page
}
const extractContent = async (
  page: Page,
  isFirstPage: boolean
): Promise<string[]> => {
  const rows = await page.evaluate((isFirstPage) => {
    const allRows = Array.from(document.querySelectorAll("table.forumTable tr"))
    // On the first page, skip the first table row
    const filteredRows = isFirstPage ? allRows.slice(1) : allRows
    return filteredRows.map(
      (row) => row.querySelector("div.content")?.textContent || ""
    )
  }, isFirstPage)
  return rows
    .map(cleanContent) // Clean the content first...
    .filter((content) => content.length > 0) // ...then drop rows left empty after cleaning
}
const getNextPageUrl = async (page: Page): Promise<string | null> => {
  return await page.evaluate(() => {
    // Get all links in the pagination bar
    const paginationLinks = Array.from(
      document.querySelectorAll("div.pagination a")
    )
    // Find the "Next" button; its href resolves to an absolute URL
    const nextButton = paginationLinks.find(
      (link) => link.textContent?.trim() === "Next"
    )
    return nextButton ? (nextButton as HTMLAnchorElement).href : null
  })
}
const crawlPages = async (
  url: string | null,
  page: Page,
  contentList: string[] = [],
  isFirstPage = true
): Promise<string[]> => {
  if (!url) return contentList

  await fetchHtml(url, page)
  const pageContent = await extractContent(page, isFirstPage)
  const nextPageUrl = await getNextPageUrl(page)

  // Recursively process the next page
  return await crawlPages(
    nextPageUrl,
    page,
    contentList.concat(pageContent),
    false
  )
}
const startCrawling = async (): Promise<void> => {
  const browser = await puppeteer.launch({ headless: true })
  const page = await browser.newPage()

  try {
    const contentList = await crawlPages(startUrl, page)
    console.log("Collected content:", contentList)

    // Save crawled content to a file
    const fileName = "crawled_content.txt"
    await saveToFile(fileName, contentList)

    // const summary = await analyzeWithChatGPT(contentList)
    // console.log("Weighted Summary:", summary)
  } catch (error) {
    console.error("Error during crawling:", error)
  } finally {
    await browser.close()
  }
}

// Start the crawler
startCrawling()
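
The gist imports saveToFile from ./fileHandler but does not include that module. As a minimal sketch, assuming it simply writes one crawled post per line via Node's fs/promises (the real implementation is not shown in the gist):

// fileHandler.ts: hypothetical sketch, not part of the original gist
import { writeFile } from "fs/promises"

export const saveToFile = async (
  fileName: string,
  lines: string[]
): Promise<void> => {
  // One forum post per line; "utf-8" keeps non-ASCII post content intact
  await writeFile(fileName, lines.join("\n"), "utf-8")
}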
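
The commented-out ./analyser module is likewise absent. A sketch of one way analyzeWithChatGPT could work, assuming the official openai npm package with an assumed model and prompt (none of which appear in the gist):

// analyser.ts: hypothetical sketch; model and prompt are assumptions
import OpenAI from "openai"

const client = new OpenAI() // reads OPENAI_API_KEY from the environment

export const analyzeWithChatGPT = async (
  contents: string[]
): Promise<string> => {
  const response = await client.chat.completions.create({
    model: "gpt-4o-mini", // assumed model; the gist names none
    messages: [
      { role: "system", content: "Summarize these forum posts." },
      { role: "user", content: contents.join("\n\n") },
    ],
  })
  return response.choices[0]?.message?.content ?? ""
}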