@kyleavery
Created September 28, 2024 19:22
URLs to Markdown (Jina AI)
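
Reads URLs from url_list.txt, fetches each one through the Jina AI Reader (https://r.jina.ai/), and appends the parsed title, URL, and Markdown text to html_content.jsonl; URLs that repeatedly fail are logged to failed_urls.txt.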

import json
import re
import time

import requests

INFILE = "url_list.txt"         # input: one URL per line
OUTFILE = "html_content.jsonl"  # output: one JSON object per line
LOGFILE = "failed_urls.txt"     # URLs that could not be fetched
DELAY = 5        # seconds before the first retry; grows by DELAY each attempt
TIMEOUT = 10     # per-request timeout in seconds
JINA_API = None  # optional Jina AI API key, sent as a Bearer token if set

def parse_markdown(content):
    """Split a Jina Reader response into its metadata header and Markdown body."""
    title_pattern = r"^Title:\s*(.*)$"
    url_pattern = r"^URL Source:\s*(.*)$"
    published_time_pattern = r"^Published Time:\s*(.*)$"
    markdown_content_pattern = r"(?:\r?\n){2,}(.*)"

    title_match = re.search(title_pattern, content, re.MULTILINE)
    url_match = re.search(url_pattern, content, re.MULTILINE)
    published_time_match = re.search(published_time_pattern, content, re.MULTILINE)
    markdown_content_match = re.search(
        markdown_content_pattern, content, re.MULTILINE | re.DOTALL
    )

    title = title_match.group(1) if title_match else None
    url = url_match.group(1) if url_match else None
    # Extracted but not included in the output record below.
    published_time_str = published_time_match.group(1) if published_time_match else None
    markdown_content = (
        markdown_content_match.group(1) if markdown_content_match else None
    )

    return {
        "title": title,
        "url": url,
        "text": markdown_content,
    }
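
# For reference, a Jina Reader response that parse_markdown expects looks
# roughly like the illustrative (made-up) example below: a short metadata
# header, a blank line, then the page body converted to Markdown.
#
#   Title: Example Page
#   URL Source: https://example.com/post
#   Published Time: 2024-09-28T00:00:00Z
#
#   Example Page
#   ============
#   Body of the page, converted to Markdown...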

def get_html_content(url):
    """Fetch a URL through the Jina Reader, retrying with an increasing delay."""
    api_url = f"https://r.jina.ai/{url}"
    headers = {"X-Timeout": str(TIMEOUT)}
    if JINA_API:
        headers["Authorization"] = f"Bearer {JINA_API}"

    delay = DELAY
    while True:
        # Give up after three failed attempts (delay grows by DELAY each retry).
        if delay > DELAY * 3:
            print(f"[!] Unable to fetch HTML content from {api_url}. Skipping...")
            return None
        try:
            response = requests.get(api_url, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"[!] Unable to fetch HTML content from {api_url}. Error: {e}")
            time.sleep(delay)
            delay += DELAY

with open(INFILE, "r") as f:
    urls = [line.strip() for line in f]

for url in urls:
    html_content = get_html_content(url)
    if html_content is None:
        # Record the failure so the URL can be retried later.
        with open(LOGFILE, "a") as f:
            f.write(url + "\n")
        continue

    data = parse_markdown(html_content)
    with open(OUTFILE, "a") as f:
        f.write(json.dumps(data) + "\n")
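
A minimal way to run this sketch, assuming Python 3 with the requests package installed: put one URL per line in url_list.txt, optionally set JINA_API to a Jina AI API key, and run the script. Because both output files are opened in append mode, results accumulate in html_content.jsonl and failures in failed_urls.txt across runs, so remove or rotate those files if you want a fresh start.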