@kyleavery
Created September 28, 2024 19:22
URLs to Markdown (Jina AI)
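
Reads URLs from url_list.txt, fetches each one through the Jina AI Reader (https://r.jina.ai/), and appends the parsed title, URL, and Markdown text to html_content.jsonl; URLs that repeatedly fail are logged to failed_urls.txt.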

import json
import re
import time

import requests

INFILE = "url_list.txt"         # input: one URL per line
OUTFILE = "html_content.jsonl"  # output: one JSON object per line
LOGFILE = "failed_urls.txt"     # URLs that could not be fetched
DELAY = 5        # seconds before the first retry; grows by DELAY each attempt
TIMEOUT = 10     # per-request timeout in seconds
JINA_API = None  # optional Jina AI API key, sent as a Bearer token if set

def parse_markdown(content):
    """Split a Jina Reader response into its metadata header and Markdown body."""
    title_pattern = r"^Title:\s*(.*)$"
    url_pattern = r"^URL Source:\s*(.*)$"
    published_time_pattern = r"^Published Time:\s*(.*)$"
    markdown_content_pattern = r"(?:\r?\n){2,}(.*)"

    title_match = re.search(title_pattern, content, re.MULTILINE)
    url_match = re.search(url_pattern, content, re.MULTILINE)
    published_time_match = re.search(published_time_pattern, content, re.MULTILINE)
    markdown_content_match = re.search(
        markdown_content_pattern, content, re.MULTILINE | re.DOTALL
    )

    title = title_match.group(1) if title_match else None
    url = url_match.group(1) if url_match else None
    # Extracted but not included in the output record below.
    published_time_str = published_time_match.group(1) if published_time_match else None
    markdown_content = (
        markdown_content_match.group(1) if markdown_content_match else None
    )

    return {
        "title": title,
        "url": url,
        "text": markdown_content,
    }
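
# For reference, a Jina Reader response that parse_markdown expects looks
# roughly like the illustrative (made-up) example below: a short metadata
# header, a blank line, then the page body converted to Markdown.
#
#   Title: Example Page
#   URL Source: https://example.com/post
#   Published Time: 2024-09-28T00:00:00Z
#
#   Example Page
#   ============
#   Body of the page, converted to Markdown...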

def get_html_content(url):
    """Fetch a URL through the Jina Reader, retrying with an increasing delay."""
    api_url = f"https://r.jina.ai/{url}"
    headers = {"X-Timeout": str(TIMEOUT)}
    if JINA_API:
        headers["Authorization"] = f"Bearer {JINA_API}"

    delay = DELAY
    while True:
        # Give up after three failed attempts (delay grows by DELAY each retry).
        if delay > DELAY * 3:
            print(f"[!] Unable to fetch HTML content from {api_url}. Skipping...")
            return None
        try:
            response = requests.get(api_url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"[!] Unable to fetch HTML content from {api_url}. Error: {e}")
            time.sleep(delay)
            delay += DELAY

with open(INFILE, "r") as f:
    urls = [line.strip() for line in f]

for url in urls:
    html_content = get_html_content(url)
    if html_content is None:
        # Record the failure so the URL can be retried later.
        with open(LOGFILE, "a") as f:
            f.write(url + "\n")
        continue

    data = parse_markdown(html_content)
    with open(OUTFILE, "a") as f:
        f.write(json.dumps(data) + "\n")
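
A minimal way to run this sketch, assuming Python 3 with the requests package installed: put one URL per line in url_list.txt, optionally set JINA_API to a Jina AI API key, and run the script. Because both output files are opened in append mode, results accumulate in html_content.jsonl and failures in failed_urls.txt across runs, so remove or rotate those files if you want a fresh start.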