Created
December 3, 2024 04:28
-
-
Save 0thernet/c70ddb7db6bdabb4d35f2f627c08af16 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
from urllib.parse import quote

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
def _dataset_entry(name):
    """Build the record for one dataset folder: its name plus raw-file URLs."""
    # Percent-encode once and reuse; the encoded name appears in every URL.
    encoded = quote(name)
    base = (
        "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/"
        f"{encoded}"
    )
    return {
        "name": name,
        "csv": f"{base}/{encoded}.csv",
        "readme": f"{base}/README.md",
        "datapackage": f"{base}/datapackage.json",
    }


def _extract_dataset_names(html_content):
    """Return the unique folder names, in page order, found in the listing HTML."""
    soup = BeautifulSoup(html_content, "html.parser")
    folders = soup.find_all(
        "td", class_="react-directory-row-name-cell-large-screen"
    )
    # Debug print
    print(f"Found {len(folders)} folders")

    names = []
    seen = set()
    for folder in folders:
        link = folder.find("a", class_="Link--primary")
        if not link:
            # Cell without a primary link: nothing to record.
            continue
        name = link.text.strip()
        if name in seen:
            continue
        seen.add(name)
        names.append(name)
    return names


def scrape_github_folders():
    """Scrape the owid-datasets folder listing and write it to ``datasets.json``.

    Loads the GitHub directory page in headless Chromium, collects the unique
    folder names from the rendered rows, and writes a JSON list of records
    (``name``, ``csv``, ``readme``, ``datapackage``) to ``datasets.json`` in
    the current working directory. Progress is printed to stdout, and the file
    is read back after writing as a sanity check.

    Returns:
        None. The result is the written file and the printed output.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto("https://github.com/owid/owid-datasets/tree/master/datasets")
            # Wait for directory content to load
            page.wait_for_selector(".react-directory-row")
            html_content = page.content()
        finally:
            # Close the browser even if navigation or the selector wait fails.
            browser.close()

    dataset_names = []
    for name in _extract_dataset_names(html_content):
        dataset_names.append(_dataset_entry(name))
        # Debug print first few
        if len(dataset_names) <= 3:
            print(f"Added: {dataset_names[-1]}")

    # Write to JSON file
    print("\nFirst 3 entries before writing to JSON:")
    for entry in dataset_names[:3]:
        print(entry)
    with open("datasets.json", "w", encoding="utf-8") as f:
        json.dump(dataset_names, f, indent=2)

    # Verify the written file
    print("\nReading back the JSON file:")
    with open("datasets.json", "r", encoding="utf-8") as f:
        written_data = json.load(f)
    if written_data:
        print(written_data[0])
    else:
        # An empty scrape used to crash here with IndexError on written_data[0].
        print("No datasets were found; datasets.json contains an empty list.")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    scrape_github_folders()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment