Skip to content

Instantly share code, notes, and snippets.

@devAgam
Created August 26, 2024 05:14
Show Gist options
  • Save devAgam/1fa1c31a6c1849889965ede4f6436029 to your computer and use it in GitHub Desktop.
Save devAgam/1fa1c31a6c1849889965ede4f6436029 to your computer and use it in GitHub Desktop.
// Intended upper bound on how many stories are handled at the same time.
// NOTE(review): confirm this value is actually enforced where batches are dispatched.
const MAX_CONCURRENT_TASKS = 5;
/**
 * Fetches the list of stories that still have no scraped content from the
 * local backend.
 *
 * @returns {Promise<any>} The parsed JSON body of the response.
 * @throws {Error} When the backend answers with a non-2xx status, so that an
 *   error payload is never mistaken for a (possibly empty) story list.
 */
async function fetchStories() {
  const response = await fetch(
    "http://localhost:8000/get-where-no-story-content"
  );
  if (!response.ok) {
    throw new Error(
      `Failed to fetch stories: ${response.status} ${response.statusText}`
    );
  }
  return response.json();
}
/**
 * Reports the scraping outcome for one story to the local backend.
 *
 * An empty (or missing) content value is reported as a scraping failure:
 * `gotStoryContent` is false and `errorScraping` is true.
 *
 * @param {string} _id - Identifier of the story; interpolated into the URL.
 * @param {string|null|undefined} content - Scraped HTML. `null`/`undefined`
 *   are treated like an empty string instead of raising a TypeError.
 * @returns {Promise<any>} The parsed JSON body of the backend response.
 */
async function updateStoryContent(_id, content) {
  // Callers pass null when scraping failed; normalise so `.length` is safe.
  const sourceBody = content ?? "";
  const hasContent = sourceBody.length > 0;

  const response = await fetch(
    `http://localhost:8000/update-story-content/${_id}`,
    {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        sourceBody,
        gotStoryContent: hasContent,
        errorScraping: !hasContent,
      }),
    }
  );
  return response.json();
}
/**
 * Injects a small function into the given tab that reads the inner HTML of
 * the first `.story-content` element, and returns that HTML.
 *
 * @param {number} tabId - Tab to inject the extraction function into.
 * @param {string} url - Not used by this function; kept for its callers.
 * @returns {Promise<string>} Resolves with the extracted HTML.
 *   Rejects with `chrome.runtime.lastError` when the injection reports one,
 *   or with an Error when no non-empty content was returned.
 */
async function scrapeContent(tabId, url) {
  const injection = {
    target: { tabId },
    func: () => document.querySelector(".story-content")?.innerHTML || null,
  };

  return new Promise((resolve, reject) => {
    const onInjected = (results) => {
      if (chrome.runtime.lastError) {
        reject(chrome.runtime.lastError);
        return;
      }

      const html = results?.[0]?.result;
      if (!html) {
        reject(new Error("No story content found"));
        return;
      }

      resolve(html);
    };

    chrome.scripting.executeScript(injection, onInjected);
  });
}
/**
 * Processes one story: opens its link in a background tab, scrapes the story
 * content unless the link redirects, reports the outcome to the backend and
 * closes the tab again.
 *
 * Every failure (tab creation, redirect probe, scraping) is logged and
 * reported as empty content, which the backend update marks as a scraping
 * error. A failing backend update is logged as well. The returned promise
 * therefore always resolves, so one bad story cannot stall a whole batch.
 *
 * @param {{_id: string, link: string}} story - Story record to process.
 * @returns {Promise<void>} Resolves once the story has been handled.
 */
async function processSingleStory(story) {
  const { _id, link } = story;
  return new Promise((resolve) => {
    chrome.tabs.create({ url: link, active: false }, async (tab) => {
      // Start with an empty string rather than null: it is the value already
      // used for "nothing scraped" and is safe to pass to the update call.
      let content = "";
      try {
        if (chrome.runtime.lastError || !tab) {
          throw new Error(
            chrome.runtime.lastError?.message ?? "Tab could not be created"
          );
        }

        const response = await fetch(link, { redirect: "manual" });
        // With redirect: "manual" a redirect is surfaced as an
        // "opaqueredirect" response; the URL comparison is kept as well.
        const wasRedirected =
          response.type === "opaqueredirect" || response.url !== link;
        if (wasRedirected) {
          console.log(
            "Redirect detected:",
            link,
            "redirected to",
            response.url
          );
        } else {
          content = await scrapeContent(tab.id, link);
        }
      } catch (error) {
        console.error("Error scraping content:", error);
      }

      try {
        await updateStoryContent(_id, content);
      } catch (error) {
        console.error("Error updating story content:", error);
      } finally {
        if (tab) {
          // Reading lastError in the callback keeps a failed close (e.g. the
          // tab was already closed) from surfacing as an unchecked error.
          chrome.tabs.remove(tab.id, () => void chrome.runtime.lastError);
        }
        resolve(); // Indicate that this story has been processed
      }
    });
  });
}
/**
 * Processes the given stories, then keeps fetching and processing further
 * batches until the backend returns no more stories.
 *
 * Within a batch at most MAX_CONCURRENT_TASKS stories are processed at the
 * same time. The returned promise settles only after every batch has been
 * handled, so callers can await completion and observe failures.
 *
 * NOTE(review): if the backend keeps returning the same stories (for example
 * because updates fail), this loop will not terminate on its own.
 *
 * @param {Array<object>} stories - First batch of stories to process. A
 *   non-array value is treated as an empty batch.
 * @returns {Promise<void>} Resolves when no stories are left.
 */
async function processStoriesInParallel(stories) {
  let batch = Array.isArray(stories) ? stories : [];

  do {
    // Work through the batch in slices so the number of simultaneously open
    // tabs stays bounded.
    for (let start = 0; start < batch.length; start += MAX_CONCURRENT_TASKS) {
      const slice = batch.slice(start, start + MAX_CONCURRENT_TASKS);
      await Promise.all(slice.map((story) => processSingleStory(story)));
    }

    // Check if there are more stories to process
    batch = await fetchStories();
  } while (Array.isArray(batch) && batch.length > 0);

  console.log("All stories processed.");
}
// True from the moment a "startScraping" run is kicked off until the promise
// returned by processStoriesInParallel settles; used to ignore repeat requests.
let isScraping = false;

// Starts a scraping run when a { action: "startScraping" } message arrives.
chrome.runtime.onMessage.addListener((message) => {
  if (message?.action !== "startScraping" || isScraping) {
    return;
  }

  isScraping = true;
  fetchStories()
    .then(processStoriesInParallel)
    // Without a handler a failed fetch would be an unhandled rejection.
    .catch((error) => {
      console.error("Scraping run failed:", error);
    })
    .finally(() => {
      isScraping = false;
    });
});
// Not Needed Here
{
"manifest_version": 3,
"name": "ISS Web Scraper Extension",
"version": "1.0",
"permissions": ["activeTab", "scripting", "storage"],
"background": {
"service_worker": "background.js"
},
"host_permissions": ["https://www.[website you want to scrape]/*"],
"action": {
"default_popup": "popup.html"
}
}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Web Scraper Extension</title>
</head>
<body>
<!-- The popup's only control; its id is what the click handler looks up. -->
<button id="start-scraping">Start Scraping</button>
<!-- Placed after the button so the element already exists when the script runs. -->
<script src="popup.js"></script>
</body>
</html>
// Popup wiring: a click on the button sends the "startScraping" request to
// the rest of the extension.
const startButton = document.getElementById("start-scraping");

function requestScrapingStart() {
  chrome.runtime.sendMessage({ action: "startScraping" });
}

startButton.addEventListener("click", requestScrapingStart);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment