Skip to content

Instantly share code, notes, and snippets.

@eevmanu
Created October 25, 2021 18:47
Show Gist options
  • Select an option

  • Save eevmanu/0821df7b2f49e011162672055dfbb321 to your computer and use it in GitHub Desktop.

Select an option

Save eevmanu/0821df7b2f49e011162672055dfbb321 to your computer and use it in GitHub Desktop.
YouTube playlist stats scraper - get the number of videos, total views, and last-updated date of a YouTube playlist
#!/usr/bin/env python
import sys
import time
import json
import argparse
import locale
import datetime
import urllib.request
# Key that precedes the playlist statistics inside the page's embedded JSON.
STATS_KEY = '"stats"'

# Locale used to parse the Spanish abbreviated month names in the "last
# updated" text. Check availability with:  $ locale -a   (expects es_ES.utf8)
SPANISH_TIME_LOCALE = "es_ES.utf8"


def fetch_playlist_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    While developing, read a saved HTML file instead of calling this
    function, to avoid hitting the URL so frequently.
    """
    with urllib.request.urlopen(url) as response:
        return str(response.read(), 'utf-8')


def extract_stats(content, key=STATS_KEY):
    """Locate the JSON value that follows the first ``key:`` in *content*.

    The value must start immediately after the colon and must be a JSON
    array or object.  Its end is found by the JSON decoder itself, so
    brackets and braces that appear inside string values cannot confuse the
    extraction, and truncated input is reported instead of running off the
    end of the text.

    Args:
        content: Text to search (the playlist page HTML).
        key: The quoted key to look for, including its double quotes.

    Returns:
        A ``(raw_json, value)`` tuple: the exact slice of *content* holding
        the JSON value, and the decoded Python object.

    Raises:
        ValueError: If the key is missing, is not directly followed by
            ``:`` and ``[`` / ``{``, or the value is not valid JSON.
    """
    key_pos = content.find(key)
    if key_pos == -1:
        raise ValueError(f"{key} not found in page content")

    colon_pos = key_pos + len(key)
    value_pos = colon_pos + 1
    # Slices (rather than indexing) yield '' past the end of the text, so a
    # key sitting at the very end is rejected here instead of raising
    # IndexError.
    if content[colon_pos:value_pos] != ':':
        raise ValueError(f"{key} is not immediately followed by ':'")
    if content[value_pos:value_pos + 1] not in ('[', '{'):
        raise ValueError(f"value of {key} is not a JSON array or object")

    try:
        # raw_decode parses exactly one JSON value starting at value_pos and
        # returns the index just past it, ignoring whatever follows.
        value, end_pos = json.JSONDecoder().raw_decode(content, value_pos)
    except json.JSONDecodeError as err:
        raise ValueError(f"value of {key} is not valid JSON: {err}") from err
    return content[value_pos:end_pos], value


def parse_stats(stats):
    """Pull the three playlist figures out of the decoded ``stats`` list.

    Expected shape (see the example at the bottom of this file)::

        [{"runs": [{"text": "7"}, {"text": " videos"}]},
         {"simpleText": "14,340 vistas"},
         {"runs": [{"text": "..."}, {"text": "11 mar. 2020"}]}]

    Returns:
        ``(videos_count, views, last_update_text)`` where the first two are
        ints (comma thousands separators removed) and the last is the date
        text exactly as it appears in the page.

    Raises:
        KeyError, IndexError, ValueError: If *stats* does not have the
            expected shape or the numbers cannot be parsed.
    """
    videos_count = int(stats[0]['runs'][0]['text'].replace(',', ''))
    views = int(stats[1]['simpleText'].split(' ')[0].replace(',', ''))
    last_update_text = stats[2]['runs'][1]['text']
    return videos_count, views, last_update_text


def parse_spanish_date(text, time_locale=SPANISH_TIME_LOCALE):
    """Convert a date such as ``'11 mar. 2020'`` to ISO format.

    Changing the process-wide locale is not recommended
    (https://stackoverflow.com/a/32785195/3889948), but the alternatives
    did not work out for the original author:
      - babel: https://github.com/python-babel/babel/blob/cc36c84a83dd447bf48a6af3eb03c97bf299e8cb/babel/dates.py#L1162-L1163
      - arrow get(): https://stackoverflow.com/q/26502470
    So LC_TIME is switched temporarily and always restored afterwards, even
    when parsing fails.

    Raises:
        locale.Error: If *time_locale* is not installed on this system.
        ValueError: If *text* does not match ``'%d %b. %Y'``.
    """
    # setlocale() without a value returns the current setting as a string,
    # which can be handed straight back to restore it.
    saved_locale = locale.setlocale(locale.LC_TIME)
    locale.setlocale(locale.LC_TIME, time_locale)
    try:
        # The literal dot in the format matches the one that follows the
        # abbreviated month name in the page text:
        # ene. | feb. | mar. | abr. | may. | jun. | jul. | ago. | set. | oct. | nov. | dic.
        parsed = datetime.datetime.strptime(text, '%d %b. %Y')
    finally:
        locale.setlocale(locale.LC_TIME, saved_locale)
    return parsed.date().isoformat()


def main(argv=None):
    """Print the video count, total views and last update of a playlist.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]``.

    Exits with a non-zero status and a message on stderr when the stats
    cannot be located in the downloaded page.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('ytpl_url', type=str)
    args = parser.parse_args(argv)

    content = fetch_playlist_html(args.ytpl_url)
    try:
        raw_stats, stats = extract_stats(content)
    except ValueError as err:
        sys.exit(f"error: {err}")
    print(raw_stats)

    pl_videos_count, pl_views, last_update_text = parse_stats(stats)
    pl_last_update = parse_spanish_date(last_update_text)

    print(f"{pl_videos_count=!r}")
    print(f"{pl_views=!r}")
    print(f"{pl_last_update=!r}")


if __name__ == '__main__':
    main()
# example
# minify
# [{"runs":[{"text":"7"},{"text":" videos"}]},{"simpleText":"14,340 vistas"},{"runs":[{"text":"Se actualizó por última vez el "},{"text":"11 mar. 2020"}]}]
# beautify
# [
# {
# "runs": [
# {
# "text":"7"
# },
# {
# "text":" videos"
# }
# ]
# },
# {
# "simpleText":"14,340 vistas"
# },
# {
# "runs":[
# {
# "text":"Se actualizó por última vez el "
# },
# {
# "text":"11 mar. 2020"
# }
# ]
# }
# ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment