Scraping with Python and Selenium
import os
import re
import random

import markdownify
from bs4 import BeautifulSoup
from selenium import webdriver

client = webdriver.Firefox()

OUTPUT_DIR = 'output/'
TARGET_URL = 'https://huggingface.co/posts'

visited = []
all_urls = []
page_name = TARGET_URL.split('/')[-1]             # 'posts'
base_name = '/'.join(TARGET_URL.split('/')[:-1])  # 'https://huggingface.co'

os.makedirs(OUTPUT_DIR, exist_ok=True)


def scrape_shit(url):
    global all_urls
    client.get(url)
    page_content = client.page_source
    visited.append(url)

    # Convert each <article> on the page to Markdown and write it out.
    # NOTE: random file names can collide; a counter or URL-derived slug is safer.
    for article in BeautifulSoup(page_content, 'html.parser').find_all('article'):
        text_content = markdownify.markdownify(article.decode_contents())
        with open(OUTPUT_DIR + str(random.randint(1, 9999)) + '.txt', 'w', encoding='utf-8') as f:
            f.write(text_content)

    # Collect href targets, keep only links within the same section,
    # make them absolute, then drop anything already visited.
    all_urls += re.findall(r'(?<=href=").*?(?=")', page_content)
    all_urls = [(u if u.startswith('http') else base_name + u) for u in all_urls if page_name in u]
    all_urls = [u for u in all_urls if u not in visited]

    for target_url in all_urls:
        if target_url not in visited:  # may have been scraped by a deeper recursive call
            scrape_shit(target_url)


scrape_shit(TARGET_URL)
client.quit()
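
For unattended runs, the same script can drive Firefox headlessly. A minimal sketch, assuming Selenium 4 and geckodriver on PATH; only the driver construction changes:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('-headless')  # run Firefox without opening a window
client = webdriver.Firefox(options=options)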