Skip to content

Instantly share code, notes, and snippets.

@0thernet
Created December 3, 2024 04:28
Show Gist options
  • Save 0thernet/c70ddb7db6bdabb4d35f2f627c08af16 to your computer and use it in GitHub Desktop.
Save 0thernet/c70ddb7db6bdabb4d35f2f627c08af16 to your computer and use it in GitHub Desktop.
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import json
from urllib.parse import quote
def scrape_github_folders():
    """Scrape the OWID dataset folder listing from GitHub into ``datasets.json``.

    Opens the ``datasets`` directory page of the ``owid/owid-datasets``
    repository in a Playwright-driven Chromium browser, waits for the
    directory rows to render, and collects every unique folder name found
    in the page. For each name it builds raw-content URLs for the folder's
    CSV, README and datapackage files, writes the list of entries to
    ``datasets.json`` in the current working directory, and reads the file
    back as a sanity check. Progress information is printed to stdout.

    Returns:
        None. The result is the ``datasets.json`` file plus console output.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto("https://github.com/owid/owid-datasets/tree/master/datasets")
            # Wait for directory content to load (the listing is rendered
            # client-side, so the rows are not in the initial response).
            page.wait_for_selector(".react-directory-row")
            html_content = page.content()
        finally:
            # Always release the browser, even if navigation or the
            # selector wait raises (e.g. on a timeout).
            browser.close()

    soup = BeautifulSoup(html_content, "html.parser")
    # Find all folder rows; duplicates are filtered below via `seen`.
    # NOTE(review): only rows present in the rendered page are seen --
    # confirm GitHub does not truncate the listing for this directory.
    folders = soup.find_all(
        "td", class_="react-directory-row-name-cell-large-screen"
    )
    # Debug print
    print(f"Found {len(folders)} folders")

    dataset_names = []
    seen = set()
    for folder in folders:
        link = folder.find("a", class_="Link--primary")
        if not link:
            continue
        name = link.text.strip()
        if name in seen:
            continue
        seen.add(name)

        # Percent-encode once; folder names may contain spaces and other
        # characters that are not valid in a URL path.
        quoted = quote(name)
        base_url = (
            "https://raw.githubusercontent.com/owid/owid-datasets/"
            f"master/datasets/{quoted}"
        )
        dataset_names.append(
            {
                "name": name,
                "csv": f"{base_url}/{quoted}.csv",
                "readme": f"{base_url}/README.md",
                "datapackage": f"{base_url}/datapackage.json",
            }
        )
        # Debug print first few
        if len(dataset_names) <= 3:
            print(f"Added: {dataset_names[-1]}")

    # Write to JSON file
    print("\nFirst 3 entries before writing to JSON:")
    for entry in dataset_names[:3]:
        print(entry)
    with open("datasets.json", "w", encoding="utf-8") as f:
        json.dump(dataset_names, f, indent=2)

    # Verify the written file
    print("\nReading back the JSON file:")
    with open("datasets.json", "r", encoding="utf-8") as f:
        written_data = json.load(f)
    # Guard against an empty result so a page-markup change does not
    # surface as an unrelated IndexError here.
    if written_data:
        print(written_data[0])
    else:
        print("(no entries)")
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    scrape_github_folders()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment