Scraping with Python and Selenium
import os
import re
import random

import markdownify
from bs4 import BeautifulSoup
from selenium import webdriver

client = webdriver.Firefox()

OUTPUT_DIR = 'output/'
TARGET_URL = 'https://huggingface.co/posts'

visited = []
all_urls = []
page_name = TARGET_URL.split('/')[-1]             # 'posts'
base_name = '/'.join(TARGET_URL.split('/')[:-1])  # 'https://huggingface.co'

os.makedirs(OUTPUT_DIR, exist_ok=True)


def scrape_shit(url):
    global all_urls
    client.get(url)
    page_content = client.page_source
    visited.append(url)

    # Convert each <article> on the page to Markdown and write it out.
    # NOTE: random file names can collide; a counter or URL-derived slug is safer.
    for article in BeautifulSoup(page_content, 'html.parser').find_all('article'):
        text_content = markdownify.markdownify(article.decode_contents())
        with open(OUTPUT_DIR + str(random.randint(1, 9999)) + '.txt', 'w', encoding='utf-8') as f:
            f.write(text_content)

    # Collect href targets, keep only links within the same section,
    # make them absolute, then drop anything already visited.
    all_urls += re.findall(r'(?<=href=").*?(?=")', page_content)
    all_urls = [(u if u.startswith('http') else base_name + u) for u in all_urls if page_name in u]
    all_urls = [u for u in all_urls if u not in visited]

    for target_url in all_urls:
        if target_url not in visited:  # may have been scraped by a deeper recursive call
            scrape_shit(target_url)


scrape_shit(TARGET_URL)
client.quit()
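
For unattended runs, the same script can drive Firefox headlessly. A minimal sketch, assuming Selenium 4 and geckodriver on PATH; only the driver construction changes:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('-headless')  # run Firefox without opening a window
client = webdriver.Firefox(options=options)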