Created
October 25, 2021 18:47
-
-
Save eevmanu/0821df7b2f49e011162672055dfbb321 to your computer and use it in GitHub Desktop.
YouTube playlist stats scraper - gets the number of videos, the total views and the last-updated date of a YouTube playlist
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Scrape basic statistics from a YouTube playlist page.

Usage: pass the playlist URL as the single positional argument ``ytpl_url``.

The script downloads the page, locates the JSON value that follows the first
``"stats":`` key embedded in the HTML, prints that raw JSON, and then prints
the number of videos, the total views and the last-update date (ISO format).

Example of the ``"stats"`` value (minified):

    [{"runs":[{"text":"7"},{"text":" videos"}]},{"simpleText":"14,340 vistas"},{"runs":[{"text":"Se actualizó por última vez el "},{"text":"11 mar. 2020"}]}]

The same value, beautified:

    [
      {
        "runs": [
          {"text": "7"},
          {"text": " videos"}
        ]
      },
      {
        "simpleText": "14,340 vistas"
      },
      {
        "runs": [
          {"text": "Se actualizó por última vez el "},
          {"text": "11 mar. 2020"}
        ]
      }
    ]
"""
import sys
import time  # kept from the original import list; no longer used below
import json
import argparse
import locale
import datetime
import urllib.request
from typing import Any, Optional, Tuple

STATS_KEY = '"stats"'
OPEN_OPERATORS = ('[', '{')


def fetch_html(url: str) -> str:
    """Download *url* and return the response body decoded as UTF-8."""
    # To avoid hitting the URL too frequently while developing, test first
    # with a saved HTML file instead:
    #     from pathlib import Path
    #     content = Path("/home/eevmanu/test.html").read_text()
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def extract_stats(content: str) -> Optional[Tuple[str, Any]]:
    """Locate and decode the JSON value that follows the first ``"stats":``.

    Args:
        content: Full page text to search.

    Returns:
        ``(raw_json_text, decoded_value)`` when the key is found, is
        immediately followed by ``:`` and then by a well-formed JSON array or
        object; ``None`` otherwise (key missing, unexpected character after
        the key, or truncated/invalid JSON).
    """
    key_at = content.find(STATS_KEY)
    if key_at == -1:
        return None

    colon_at = key_at + len(STATS_KEY)
    # startswith/slicing never raise, even when the key is the last token.
    if not content.startswith(':', colon_at):
        return None

    start = colon_at + 1
    if content[start:start + 1] not in OPEN_OPERATORS:
        return None

    # raw_decode parses exactly one JSON value starting at `start` and reports
    # where it ends. Unlike manual bracket counting, it is not confused by
    # '[', ']', '{' or '}' appearing inside string keys or values.
    try:
        value, end = json.JSONDecoder().raw_decode(content, start)
    except json.JSONDecodeError:
        return None
    return content[start:end], value


def parse_stats(stats: Any) -> Tuple[int, int, str]:
    """Pull the individual fields out of the decoded ``"stats"`` value.

    Args:
        stats: Decoded value shaped like the example in the module docstring.

    Returns:
        ``(videos_count, views, last_update_text)`` where ``last_update_text``
        is the still-localized date string, e.g. ``"11 mar. 2020"``.

    Raises:
        KeyError, IndexError, ValueError: if the value does not have the
            expected shape or the numbers cannot be converted.
    """
    videos_count = int(stats[0]['runs'][0]['text'])
    # e.g. "14,340 vistas" -> 14340
    views = int(stats[1]['simpleText'].split(' ')[0].replace(',', ''))
    last_update_text = stats[2]['runs'][1]['text']
    return videos_count, views, last_update_text


def parse_spanish_date(text: str, locale_name: str = "es_ES.utf8") -> str:
    """Convert a date such as ``"11 mar. 2020"`` to ISO format.

    The ``LC_TIME`` locale is switched to *locale_name* only for the duration
    of the parse and is always restored afterwards, even if parsing fails.

    Args:
        text: Date text in ``'%d %b. %Y'`` form using the locale's abbreviated
            month names.
        locale_name: Locale to use; it must be installed (see ``locale -a``).

    Returns:
        The date as ``YYYY-MM-DD``.

    Raises:
        locale.Error: if *locale_name* is not available on this system.
        ValueError: if *text* does not match the expected format.
    """
    # Changing the whole locale is not recommended:
    #   https://stackoverflow.com/a/32785195/3889948
    # Couldn't get it to work with babel:
    #   https://github.com/python-babel/babel/blob/cc36c84a83dd447bf48a6af3eb03c97bf299e8cb/babel/dates.py#L1162-L1163
    # Couldn't get it to work with arrow's get() method:
    #   https://stackoverflow.com/q/26502470
    # So save the initial locale to roll back later:
    #   https://docs.python.org/3/library/locale.html?highlight=locale#:~:text=returned%20by%20localeconv().-,Example,-%3A
    # $ locale -a
    # es_ES.utf8
    initial_locale = locale.getlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, locale_name)
    try:
        # The literal dot in the format matches the dot after the month name:
        # ene. | feb. | mar. | abr. | may. | jun. | jul. | ago. | set. | oct. | nov. | dic.
        parsed = datetime.datetime.strptime(text, '%d %b. %Y')
    finally:
        locale.setlocale(locale.LC_TIME, initial_locale)
    return parsed.date().isoformat()


def main(argv=None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('ytpl_url', type=str)
    args = parser.parse_args(argv)

    content = fetch_html(args.ytpl_url)

    found = extract_stats(content)
    if found is None:
        # Exit with a message and a non-zero status instead of silently.
        sys.exit('could not find a valid JSON "stats" value in the page')
    raw_stats, stats = found
    print(raw_stats)

    pl_videos_count, pl_views, last_update_text = parse_stats(stats)
    pl_last_update = parse_spanish_date(last_update_text)

    print(f"{pl_videos_count=!r}")
    print(f"{pl_views=!r}")
    print(f"{pl_last_update=!r}")


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment