Last active
April 8, 2025 13:24
-
-
Save JosephRedfern/d60bdc584d84b1451cc6052e955b755c to your computer and use it in GitHub Desktop.
Scrapes the YouTube video IDs for the YouTube-8M dataset. Probably buggy. Could be threaded.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import defaultdict

import requests

# Base URL under which the vertical index and the per-block files are fetched.
csv_prefix = "https://research.google.com/youtube8m/csv"

# Seconds to wait on any single HTTP request. Without a timeout,
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _extract_ids(payload):
    """Return the video IDs contained in the text of one block file.

    The ID list is taken from the fourth double-quote-delimited field of
    *payload* and split on ';'. Fragments of three characters or fewer are
    discarded.

    Raises:
        IndexError: if *payload* has fewer than four quote-delimited fields.
    """
    id_field = payload.split("\"")[3]
    return [n for n in id_field.split(";") if len(n) > 3]


# Fetch the index mapping each category to the list of its block URLs.
r = requests.get("{0}/verticals.json".format(csv_prefix), timeout=REQUEST_TIMEOUT)
r.raise_for_status()
verticals = r.json()

# Category name -> list of block-file URLs, plus the total number of block
# files (used only for the progress percentage).
block_urls = defaultdict(list)
count = 0
for cat, urls in verticals.items():
    for url in urls:
        # Only the last path component of the listed URL is reused.
        jsurl = "{0}/j/{1}.js".format(csv_prefix, url.split("/")[-1])
        # cat[1:] drops the first character of the key (presumably a leading
        # "/" -- confirm against verticals.json).
        block_urls[cat[1:]].append(jsurl)
        count += 1

ids_by_cat = defaultdict(list)
downloaded = 0
for cat_name, block_file_urls in block_urls.items():
    for block_file_url in block_file_urls:
        print("[{0}%] Downloading block file: {1} {2}".format((100.0*downloaded/count), block_file_url, cat_name))
        try:
            r = requests.get(block_file_url, timeout=REQUEST_TIMEOUT)
            # Turn HTTP error statuses into exceptions so an error page is
            # never parsed as if it were a block file.
            r.raise_for_status()
            # r.text is str; splitting r.content (bytes) with a str separator
            # raises TypeError on Python 3.
            ids_by_cat[cat_name] += _extract_ids(r.text)
        except (IndexError, IOError):
            # The tuple form is required: "except IndexError, IOError" is
            # Python 2 syntax that catches only IndexError. requests'
            # exceptions derive from IOError, so network, timeout and HTTP
            # status failures all land here. This is best-effort: report the
            # failure and carry on with the next block.
            print("Failed to download or process block at {0}".format(block_file_url))
        downloaded += 1  # Increment even if we've failed.

    # One output file per category, written once all its blocks are attempted.
    with open("{0}.txt".format(cat_name), "w") as idfile:
        print("Writing ids to {0}.txt".format(cat_name))
        for vid in ids_by_cat[cat_name]:
            idfile.write("{0}\n".format(vid))

print("Done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Glad I could help!