Created
August 26, 2024 05:12
-
-
Save devAgam/05cc9056a6d9c5d51985f6dd88a516e1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Page number the crawl is currently on; scraping starts at page 1.
// NOTE(review): this counter lives only in the background script's memory. If
// the browser restarts the service worker mid-crawl it would reset to 1 --
// consider persisting it (e.g. chrome.storage) if that turns out to happen.
let currentPage = 1;
// Last page number to scrape.
const lastPage = 5933;

/**
 * Uploads one page of scraped stories to the local API and, once the upload
 * has succeeded, navigates the originating tab to the next page.
 *
 * @param {Array<object>} stories - Scraped story records received from the content script.
 * @param {number|undefined} tabId - Id of the tab that sent the stories, if any.
 * @returns {Promise<void>} Resolves when the upload (and navigation, if any) has been issued.
 * @throws {Error} If the request fails, the API answers with a non-2xx status,
 *   the response body is not JSON, or the navigation script cannot be injected.
 */
async function uploadStoriesAndAdvance(stories, tabId) {
  const response = await fetch("http://localhost:8000/api/stories", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ stories }),
  });

  // fetch() only rejects on network failure; an HTTP error status must be
  // checked explicitly, otherwise a failed upload would still advance the crawl.
  if (!response.ok) {
    throw new Error(`API responded with HTTP ${response.status}`);
  }

  const result = await response.json();
  console.log("Data sent to API:", result);

  // Nothing left to scrape once the last page has been uploaded.
  if (currentPage >= lastPage) {
    return;
  }

  // Messages that do not originate from a tab carry no sender.tab, so there is
  // nowhere to navigate; leave the page counter untouched in that case.
  if (tabId === undefined) {
    console.warn("Stories received from a sender without a tab; not advancing.");
    return;
  }

  currentPage++;
  // Awaited so that an injection failure is reported by the caller's handler
  // instead of becoming an unhandled rejection.
  await chrome.scripting.executeScript({
    target: { tabId },
    func: goToNextPage,
    args: [currentPage],
  });
}

// Listen for messages from other parts of the extension (e.g. content scripts).
// The listener itself is synchronous and sends no response; the asynchronous
// work is delegated so that every failure ends up in a single error handler.
chrome.runtime.onMessage.addListener((message, sender) => {
  // Only messages carrying a 'stories' property (the scraped data) are handled.
  if (!message?.stories) {
    return;
  }

  uploadStoriesAndAdvance(message.stories, sender.tab?.id).catch((error) => {
    console.error("Error sending data to API:", error);
  });
});
/**
 * Navigates the current document to the listing page with the given number.
 *
 * This function is passed as `func` to chrome.scripting.executeScript, so it
 * runs inside the scraped page and must stay self-contained: it may only use
 * its argument and page globals, never variables from the background script.
 *
 * @param {number} page - Page number to navigate to.
 * @returns {void}
 */
function goToNextPage(page) {
  // Root-relative URL, resolved against the origin of the page being scraped.
  const nextPageUrl = `/page/${page}/`;
  window.location.href = nextPageUrl;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(async function () {
  /**
   * Returns the innerText of the first descendant of `root` matching
   * `selector`, or an empty string when there is no such element.
   */
  function textOf(root, selector) {
    return root.querySelector(selector)?.innerText ?? "";
  }

  /**
   * Parses a human-readable date string, falling back to the current date
   * when the string cannot be parsed.
   */
  function parseDate(text) {
    const parsed = new Date(text);
    // An invalid Date is still a truthy object, so `new Date(x) || fallback`
    // never falls back; the timestamp has to be checked instead.
    return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
  }

  /**
   * Builds a story record from one <article> element.
   * Returns null when the article has no title or no link.
   */
  function extractStory(element) {
    const title = element.querySelector("h2")?.innerText;
    const link = element.querySelector("a")?.href;
    if (!title || !link) {
      return null;
    }

    const category = element
      .querySelector(".meta-category")
      ?.querySelector("a")?.innerText;

    // Strip the "Tags: " label and split the remainder into individual tags,
    // dropping empty entries (e.g. when the label is present but no tags are).
    const tags = textOf(element, ".meta-tags")
      .replace("Tags: ", "")
      .split(", ")
      .map((tag) => tag.trim())
      .filter((tag) => tag !== "");

    // Strip the leading "On " before parsing. Parsing of non-ISO date strings
    // is implementation-defined, hence the fallback inside parseDate.
    const date = parseDate(textOf(element, ".meta-date").replace("On ", ""));

    // NOTE(review): ".exceprt" looks like a misspelling of ".excerpt" -- confirm
    // against the target site's markup before changing it.
    const excerpt = textOf(element, ".exceprt");

    return { title, link, category, tags, date, excerpt };
  }

  /**
   * Extracts every story on the current page and sends the list to the
   * background script. A message is sent even when no story was found.
   */
  async function extractStories() {
    const stories = [];
    // Replace the selectors with the actual ones used by the target site.
    for (const element of document.querySelectorAll("article")) {
      const story = extractStory(element);
      if (story === null) {
        // One malformed article must not abort the whole page.
        console.warn("Skipping article without title or link:", element);
        continue;
      }
      stories.push(story);
    }

    try {
      // Send data to the background script.
      await chrome.runtime.sendMessage({ stories });
    } catch (error) {
      console.error("Failed to send stories to the background script:", error);
    }
  }

  // Extract stories on the current page.
  await extractStories();
})();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"manifest_version": 3, | |
"name": "Web Scraper Extension", | |
"version": "1.0", | |
"permissions": ["activeTab", "scripting", "storage", "tabs"], | |
"background": { | |
"service_worker": "background.js" | |
}, | |
"content_scripts": [ | |
{ | |
"matches": ["https://www.[website you want to scrape]/*"], | |
"js": ["content.js"] | |
} | |
], | |
"host_permissions": ["https://www.[website you want to scrape]/*"] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment