Skip to content

Instantly share code, notes, and snippets.

@devAgam
Created August 26, 2024 05:12
Show Gist options
  • Save devAgam/05cc9056a6d9c5d51985f6dd88a516e1 to your computer and use it in GitHub Desktop.
Save devAgam/05cc9056a6d9c5d51985f6dd88a516e1 to your computer and use it in GitHub Desktop.
// Page number the scraper is currently on. Starts at 1 and is incremented by
// the message listener below after each batch of stories has been posted.
// NOTE(review): this is plain in-memory state. If this file runs as the MV3
// background service worker, the value presumably resets to 1 whenever the
// worker is restarted — confirm whether that matters for long scrapes.
let currentPage = 1;
// Last page number to scrape. The listener only navigates onward while
// currentPage is strictly less than this value.
const lastPage = 5933;
/**
 * Extracts the page number from a URL whose path ends in `/page/<n>/`
 * (the shape that goToNextPage navigates to).
 *
 * @param {string|undefined} url - Absolute URL of the sender tab, if known.
 * @returns {number|null} The page number, or null when the URL is missing,
 *   unparseable, or does not end in a `/page/<n>/` segment.
 */
function pageNumberFromUrl(url) {
  if (!url) {
    return null;
  }
  try {
    const match = /\/page\/(\d+)\/?$/.exec(new URL(url).pathname);
    return match ? Number.parseInt(match[1], 10) : null;
  } catch {
    // `new URL` throws on strings that are not absolute URLs.
    return null;
  }
}

// Listen for messages from other parts of the Chrome extension (e.g., content scripts).
// A message carrying a `stories` property is forwarded to the local API; once the
// API has accepted it, the sender's tab is navigated to the next page.
chrome.runtime.onMessage.addListener(async (message, sender, sendResponse) => {
  // Ignore messages that do not carry scraped data.
  if (!message?.stories) {
    return;
  }

  try {
    // Send the scraped data to an API endpoint on the local server.
    const response = await fetch("http://localhost:8000/api/stories", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ stories: message.stories }),
    });

    // fetch() only rejects on network failure. Without this check an HTTP
    // error with a JSON body would be treated as success and the scraper
    // would move on, silently losing this page's data.
    if (!response.ok) {
      throw new Error(`API responded with HTTP ${response.status}`);
    }

    const result = await response.json();
    console.log("Data sent to API:", result);
  } catch (error) {
    console.error("Error sending data to API:", error);
    return; // Do not advance past a page whose data was not stored.
  }

  // Messages that do not originate from a tab (e.g. another extension page)
  // have no `sender.tab`, so there is nothing to navigate.
  const tabId = sender.tab?.id;
  if (tabId === undefined) {
    return;
  }

  // Prefer the page number visible in the tab's URL over the in-memory
  // counter: the counter starts again at 1 if this script is reloaded,
  // whereas the URL always reflects the page that was just scraped.
  // Falls back to the counter when the URL is unavailable or has no
  // `/page/<n>/` suffix (e.g. the first page).
  const pageInTab = pageNumberFromUrl(sender.tab.url);
  if (pageInTab !== null) {
    currentPage = pageInTab;
  }

  // Check if there are more pages to scrape.
  if (currentPage < lastPage) {
    currentPage++;
    try {
      // Execute goToNextPage (defined below) in the sender's tab. Awaited so
      // that an injection failure is reported instead of becoming an
      // unhandled promise rejection.
      await chrome.scripting.executeScript({
        target: { tabId },
        func: goToNextPage,
        args: [currentPage],
      });
    } catch (error) {
      console.error("Error navigating to next page:", error);
    }
  }
});
/**
 * Navigates the current window to the listing page with the given number.
 *
 * The target is the root-relative path `/page/<page>/`, so the browser
 * resolves it against the origin of the document this runs in.
 *
 * @param {number} page - Page number to navigate to.
 */
function goToNextPage(page) {
  const { location } = window;
  location.href = `/page/${page}/`;
}
// Content script: collects every story listed on the current page and sends
// the batch to the background script as `{ stories }`.
(async function () {
  const stories = [];

  /**
   * Converts the human-readable date text into a Date.
   *
   * `new Date(text)` never returns a falsy value — an unparseable string
   * yields an "Invalid Date" object — so validity has to be checked through
   * its timestamp rather than with `||`.
   *
   * @param {string} text - Date text with the leading "On " already removed.
   * @returns {Date} The parsed date, or the current date if parsing failed.
   */
  function parseStoryDate(text) {
    const parsed = new Date(text);
    return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
  }

  /**
   * Builds one story record from an <article> element.
   *
   * @param {Element} element - The <article> element to read.
   * @returns {{title: string, link: string, category: (string|undefined),
   *   tags: string[], date: Date, excerpt: string}|null} The story, or null
   *   when the article has no heading or no link and so cannot be identified.
   */
  function extractStory(element) {
    const title = element.querySelector("h2")?.innerText;
    const link = element.querySelector("a")?.href;
    if (!title || !link) {
      return null;
    }

    // Category is optional: stays undefined when the markup is absent.
    const category = element
      .querySelector(".meta-category")
      ?.querySelector("a")?.innerText;

    // Strip the "Tags: " label and split the comma-separated list;
    // a missing or empty tag line produces an empty array.
    const tagsText = element.querySelector(".meta-tags")?.innerText ?? "";
    const tags = tagsText ? tagsText.replace("Tags: ", "").split(", ") : [];

    // Strip the "On " prefix before parsing; a missing date element ends up
    // as the current date via parseStoryDate's fallback.
    const dateText =
      element.querySelector(".meta-date")?.innerText.replace("On ", "") ?? "";
    const date = parseStoryDate(dateText);

    // NOTE(review): the original selector was ".exceprt", which looks like a
    // misspelling of ".excerpt". Both are queried so either markup works —
    // confirm against the target site and drop the unused one.
    const excerpt =
      element.querySelector(".excerpt, .exceprt")?.innerText ?? "";

    return { title, link, category, tags, date, excerpt };
  }

  // Function to extract data from the current page
  function extractStories() {
    const storyElements = document.querySelectorAll("article"); // Replace with actual class names
    for (const element of storyElements) {
      const story = extractStory(element);
      if (story === null) {
        // Skip the malformed article instead of throwing, so one bad entry
        // does not prevent the rest of the page from being sent.
        console.warn("Skipping article without title or link:", element);
        continue;
      }
      stories.push(story);
    }
  }

  // Extract stories on the current page
  extractStories();

  // Send data to the background script
  try {
    await chrome.runtime.sendMessage({ stories });
  } catch (error) {
    console.error("Error sending stories to background script:", error);
  }
})();
{
"manifest_version": 3,
"name": "Web Scraper Extension",
"version": "1.0",
"permissions": ["activeTab", "scripting", "storage", "tabs"],
"background": {
"service_worker": "background.js"
},
"content_scripts": [
{
"matches": ["https://www.[website you want to scrape]/*"],
"js": ["content.js"]
}
],
"host_permissions": ["https://www.[website you want to scrape]/*"]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment