Created
February 8, 2025 15:56
-
-
Save alexcg1/ef9a50ffe9a34ce717afb26c36118ef3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import argparse | |
def scrape_urls(source_url, output_file, timeout=30):
    """Save the link targets found in a page's ``story_list`` elements.

    Fetches ``source_url``, parses the response as HTML, collects the
    ``href`` of every ``<a>`` tag located inside an element whose class is
    ``story_list``, and writes those hrefs to ``output_file``, one per line,
    in the order they were found. Hrefs are written exactly as they appear
    in the markup; relative links are not resolved against ``source_url``.

    Args:
        source_url: URL of the page to fetch.
        output_file: Path of the text file to write; an existing file is
            overwritten.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If ``output_file`` cannot be opened for writing.
    """
    # Without an explicit timeout, requests will wait forever on a server
    # that accepts the connection but never responds.
    response = requests.get(source_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    urls = []
    for story in soup.find_all(class_="story_list"):
        # href=True skips anchors that have no href attribute at all.
        urls.extend(link["href"] for link in story.find_all("a", href=True))

    # Pin the encoding so non-ASCII URLs are written the same way on every
    # platform instead of depending on the locale default.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(f"{url}\n" for url in urls)
def main():
    """Command-line entry point.

    Reads two positional arguments, ``source_url`` and ``output_file``,
    from the command line and hands them to ``scrape_urls``.
    """
    cli = argparse.ArgumentParser(description="Scrape URLs from a webpage")

    # Positional arguments, registered in the order they must be supplied.
    positionals = (
        ("source_url", "URL of the webpage to scrape"),
        ("output_file", "File to store the extracted URLs"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    scrape_urls(options.source_url, options.output_file)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment