Skip to content

Instantly share code, notes, and snippets.

@dpaluy
Created October 17, 2024 18:19
Show Gist options
  • Select an option

  • Save dpaluy/619eaa56ae34f93d7fc35cd884f9c614 to your computer and use it in GitHub Desktop.

Select an option

Save dpaluy/619eaa56ae34f93d7fc35cd884f9c614 to your computer and use it in GitHub Desktop.
Scraping with Python and Selenium
import re
import random
import markdownify
from bs4 import BeautifulSoup
from selenium import webdriver
# Launch a real Firefox browser session; pages are rendered before scraping.
# NOTE(review): the driver is never quit — presumably fine for a one-shot script.
client = webdriver.Firefox()
# Directory where extracted article markdown is written (must already exist).
OUTPUT_DIR = 'output/'
# Crawl entry point; also used below to derive the URL filter.
TARGET_URL = 'https://huggingface.co/posts'
# URLs already fetched (list, so membership tests are O(n)).
visited = []
# Work queue of discovered hrefs, rewritten/filtered by scrape_shit().
all_urls = []
# Last path segment ('posts') — only links containing this are followed.
page_name = TARGET_URL.split('/')[-1]
# Scheme+host+path prefix used to absolutize relative hrefs.
base_name = '/'.join(TARGET_URL.split('/')[:-1])
def scrape_shit(url):
    """Fetch *url* with the Selenium-driven browser, dump each <article> as
    markdown into OUTPUT_DIR, then recursively crawl every discovered link
    that contains ``page_name``.

    Side effects: mutates the module globals ``visited`` and ``all_urls``,
    and writes one .txt file per article. Relies on the module-level
    ``client`` webdriver session.

    NOTE(review): recursion depth grows with the number of crawlable pages —
    a deep site could still hit Python's recursion limit.
    """
    global all_urls, visited, client
    # Guard against re-fetching a page we already processed (the original
    # only filtered the queue, so the entry URL could be crawled twice).
    if url in visited:
        return
    client.get(url)
    page_content = client.page_source
    visited.append(url)
    for article in BeautifulSoup(page_content, 'html.parser').find_all('article'):
        text_content = markdownify.markdownify(article.decode_contents())
        # uuid4 gives a collision-free name; the original random.randint(1, 9999)
        # could silently overwrite earlier articles. 'with' closes the handle
        # (the original leaked it and relied on GC to flush).
        import uuid
        with open(OUTPUT_DIR + uuid.uuid4().hex + '.txt', 'w', encoding='utf-8') as out:
            out.write(text_content)
    # Harvest every href value from the raw page source (raw string for the regex).
    all_urls += re.findall(r'(?<=href=").*?(?=")', page_content)
    all_urls = [u for u in all_urls if u not in visited]
    # Keep only links related to the target page; absolutize relative paths.
    all_urls = [(base_name + u if 'http' not in u else u) for u in all_urls if page_name in u]
    # Iterate a snapshot: recursive calls rebind the global all_urls.
    for target_url in list(all_urls):
        scrape_shit(target_url)
# Start the crawl at the configured entry point. Using TARGET_URL instead of
# repeating the literal keeps the start URL and the link filter (page_name /
# base_name, derived from TARGET_URL) in sync if the target ever changes.
scrape_shit(TARGET_URL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment