Created
July 10, 2023 15:05
-
-
Save rebane2001/a2fed42c17b22c04c63cc0128440151f to your computer and use it in GitHub Desktop.
Testing various ways of reading YouTube API JSON from a JSONL file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import json | |
import orjson | |
import msgspec | |
class Snippet(msgspec.Struct):
    """Typed view of a video's ``snippet`` object, declaring only the title."""

    # Video title; the only snippet field this benchmark reads.
    title: str
class Video(msgspec.Struct):
    """Typed view of one video record, declaring only the fields the benchmark needs."""

    # Video identifier; used as the key of the collected title mapping.
    id: str
    # Nested snippet object that carries the title.
    snippet: Snippet
def benchmark(path="2022-07-01.jsonl"):
    """Time three JSON decoders building an ``id -> title`` map from a JSONL file.

    The file is read once per decoder (``json``, ``orjson``, then ``msgspec``),
    one record per line. For each decoder the elapsed time and the number of
    collected entries are printed.

    Args:
        path: JSONL file to read. Defaults to the file name the script was
            originally written against.
    """
    decoder_names = ("json", "orjson", "msgspec")
    for method, decoder_name in enumerate(decoder_names):
        video_titles = {}
        print(f"Benchmarking with {decoder_name}...")
        # perf_counter() is monotonic and high resolution; time.time() follows
        # the wall clock and can jump while a measurement is in progress.
        start = time.perf_counter()
        # Explicit encoding so the run does not depend on the locale default.
        with open(path, encoding="utf-8") as f:
            for line in f:
                if method < 2:
                    # Untyped decoders: the whole record becomes nested dicts.
                    j = json.loads(line) if method == 0 else orjson.loads(line)
                    video_titles[j['id']] = j['snippet']['title']
                else:
                    # Typed decoder: decode straight into the Video struct.
                    j = msgspec.json.decode(line, type=Video)
                    video_titles[j.id] = j.snippet.title
        end = time.perf_counter()
        # NOTE: the count is the number of distinct ids collected, which equals
        # the number of lines only when every id in the file is unique.
        print(f"Took {end - start:.2f} seconds ({len(video_titles)} lines processed)")
# Run the whole comparison three times so that run-to-run variation is visible
# in the printed timings.
for round_number in range(3):
    print(f"Round {round_number}")
    benchmark()
""" | |
Example jsonl line (test file had 568k lines): | |
{"kind": "youtube#video", "etag": "PNtiHJh3jkcT176qpKQ0rd_MsDM", "id": "9bZkp7q19f0", "snippet": {"publishedAt": "2012-07-15T07:46:32Z", "channelId": "UCrDkAvwZum-UTjHmzDI2iIw", "title": "PSY - GANGNAM STYLE(\uac15\ub0a8\uc2a4\ud0c0\uc77c) M/V", "description": "PSY - \u2018I LUV IT\u2019 M/V @ https://youtu.be/Xvjnoagk6GU\nPSY - \u2018New Face\u2019 M/V @https://youtu.be/OwJPPaEyqhI\n\nPSY - 8TH ALBUM '4X2=8' on iTunes @\nhttps://smarturl.it/PSY_8thAlbum\n\nPSY - GANGNAM STYLE(\uac15\ub0a8\uc2a4\ud0c0\uc77c) on iTunes @ http://smarturl.it/PsyGangnam\n\n#PSY #\uc2f8\uc774 #GANGNAMSTYLE #\uac15\ub0a8\uc2a4\ud0c0\uc77c\n\nMore about PSY@\nhttp://www.youtube.com/officialpsy\nhttp://www.facebook.com/officialpsy\nhttp://twitter.com/psy_oppa\nhttps://www.instagram.com/42psy42\nhttp://iTunes.com/PSY\nhttp://sptfy.com/PSY\nhttp://weibo.com/psyoppa", "thumbnails": {"default": {"url": "https://i.ytimg.com/vi/9bZkp7q19f0/default.jpg", "width": 120, "height": 90}, "medium": {"url": "https://i.ytimg.com/vi/9bZkp7q19f0/mqdefault.jpg", "width": 320, "height": 180}, "high": {"url": "https://i.ytimg.com/vi/9bZkp7q19f0/hqdefault.jpg", "width": 480, "height": 360}, "standard": {"url": "https://i.ytimg.com/vi/9bZkp7q19f0/sddefault.jpg", "width": 640, "height": 480}, "maxres": {"url": "https://i.ytimg.com/vi/9bZkp7q19f0/maxresdefault.jpg", "width": 1280, "height": 720}}, "channelTitle": "officialpsy", "tags": ["PSY", "\uc2f8\uc774", "\uac15\ub0a8\uc2a4\ud0c0\uc77c", "\ubba4\uc9c1\ube44\ub514\uc624", "Music Video", "Gangnam Style", "KOREAN SINGER", "KPOP", "KOERAN WAVE", "PSY 6\u7532", "6th Studio Album", "\uc2f8\uc7746\uc9d1", "\uc721\uac11"], "categoryId": "10", "liveBroadcastContent": "none", "localized": {"title": "PSY - GANGNAM STYLE(\uac15\ub0a8\uc2a4\ud0c0\uc77c) M/V", "description": "PSY - \u2018I LUV IT\u2019 M/V @ https://youtu.be/Xvjnoagk6GU\nPSY - \u2018New Face\u2019 M/V @https://youtu.be/OwJPPaEyqhI\n\nPSY - 8TH ALBUM '4X2=8' on iTunes 
@\nhttps://smarturl.it/PSY_8thAlbum\n\nPSY - GANGNAM STYLE(\uac15\ub0a8\uc2a4\ud0c0\uc77c) on iTunes @ http://smarturl.it/PsyGangnam\n\n#PSY #\uc2f8\uc774 #GANGNAMSTYLE #\uac15\ub0a8\uc2a4\ud0c0\uc77c\n\nMore about PSY@\nhttp://www.youtube.com/officialpsy\nhttp://www.facebook.com/officialpsy\nhttp://twitter.com/psy_oppa\nhttps://www.instagram.com/42psy42\nhttp://iTunes.com/PSY\nhttp://sptfy.com/PSY\nhttp://weibo.com/psyoppa"}}, "contentDetails": {"duration": "PT4M13S", "dimension": "2d", "definition": "hd", "caption": "false", "licensedContent": false, "contentRating": {}, "projection": "rectangular"}, "status": {"uploadStatus": "processed", "privacyStatus": "public", "license": "youtube", "embeddable": true, "publicStatsViewable": true, "madeForKids": false}, "statistics": {"viewCount": "4495845443", "likeCount": "25255825", "favoriteCount": "0", "commentCount": "5331801"}, "player": {"embedHtml": "<iframe width=\"480\" height=\"270\" src=\"//www.youtube.com/embed/9bZkp7q19f0\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen></iframe>"}, "topicDetails": {"topicCategories": ["https://en.wikipedia.org/wiki/Electronic_music", "https://en.wikipedia.org/wiki/Music", "https://en.wikipedia.org/wiki/Music_of_Asia", "https://en.wikipedia.org/wiki/Pop_music"]}, "recordingDetails": {}} | |
Output: | |
Round 0 | |
Benchmarking with json... | |
Took 16.67 seconds (568403 lines processed) | |
Benchmarking with orjson... | |
Took 14.27 seconds (568403 lines processed) | |
Benchmarking with msgspec... | |
Took 5.22 seconds (568403 lines processed) | |
Round 1 | |
Benchmarking with json... | |
Took 16.15 seconds (568403 lines processed) | |
Benchmarking with orjson... | |
Took 13.65 seconds (568403 lines processed) | |
Benchmarking with msgspec... | |
Took 4.72 seconds (568403 lines processed) | |
Round 2 | |
Benchmarking with json... | |
Took 14.51 seconds (568403 lines processed) | |
Benchmarking with orjson... | |
Took 12.79 seconds (568403 lines processed) | |
Benchmarking with msgspec... | |
Took 4.93 seconds (568403 lines processed) | |
[Finished in 103.5s] | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment