Python 2, requires requests. Put both files in the same directory, follow the cookie instructions from https://github.com/patrickyeon/youtube-history, and run archiver.sh.
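For reference, the `-j cookiefile` argument expects Netscape cookies.txt format (what cookielib.MozillaCookieJar reads): one tab-separated cookie per line with domain, subdomain flag, path, secure flag, expiry (unix time), name, value. A minimal sketch with placeholder values; the actual cookie names to export come from the youtube-history instructions:

# Netscape HTTP Cookie File
.youtube.com	TRUE	/	TRUE	1893456000	SID	PLACEHOLDER-SID-VALUE
.youtube.com	TRUE	/	TRUE	1893456000	HSID	PLACEHOLDER-HSID-VALUE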
#!/bin/bash
# archiver.sh -- poll the watch history hourly and archive anything new
while true; do
    echo "Getting History..."
    python2 ./ythistory.py -j cookiefile --since since.txt > videolist.txt
    echo "Downloading Videos..."
    while read -r line; do
        youtube-dl --write-all-thumbnails --all-subs --write-description --write-annotations --write-info-json -o "%(uploader)s/%(title)s/%(id)s.%(ext)s" "$line"
    done < videolist.txt
    echo "Updating Cache..."
    # the last non-empty line of rawfile.txt is the newest video id
    SINCE=$(awk 'NF{p=$0}END{print p}' rawfile.txt)
    if [ -n "$SINCE" ]; then
        rm -f since.txt
        echo "$SINCE" > since.txt
    fi
    echo "Archive Updated! Waiting..."
    sleep 1h
done
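For orientation, the loop's intermediate files look roughly like this (the ids below are placeholders). videolist.txt is ythistory.py's stdout, one watch URL per line oldest-first, and since.txt ends up holding the last (newest) id from rawfile.txt:

videolist.txt:
https://youtube.com/watch?v=aaaaaaaaaaa
https://youtube.com/watch?v=bbbbbbbbbbb

since.txt:
bbbbbbbbbbb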
#!/usr/bin/python
# ythistory.py -- based on https://github.com/patrickyeon/youtube-history,
# which is based on youtube-dl
# Python 2, requires requests
import argparse
import cookielib
import re
import time

import requests

id_exp = r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})'
more_exp = r'data-uix-load-more-href="/?(?P<more>[^"]+)"'
class mincookie(object):
    # ugly and I don't care
    def __init__(self, name, value):
        self.domain = '.youtube.com'
        self.path = '/'
        self.secure = False
        self.expires = int(time.time()) + 7*24*60*60
        self.is_expired = lambda t: False
        self.port = None
        self.version = 0
        self.name = name
        self.value = value
        self.discard = False
def extract(response):
    try:
        # continuation pages come back as JSON
        data = response.json()
        ids = re.findall(id_exp, data['content_html'])
        mobj = re.search(more_exp, data['load_more_widget_html'])
    except ValueError:
        # the first page is plain HTML, so .json() raises ValueError
        ids = re.findall(id_exp, response.text)
        mobj = re.search(more_exp, response.text)
    if mobj is not None:
        mobj = 'https://youtube.com/{}'.format(mobj.group('more'))
    ids_even = [ids[i] for i in range(0, len(ids), 2)]
    ids_odd = [ids[i] for i in range(1, len(ids), 2)]
    # As of 2018-11-27, each video id is listed twice. Bail out if that changes
    assert ids_even == ids_odd
    return ids_even, mobj
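# Illustrative example of extract()'s contract (not in the original): if a
# page's markup yields ids ['id1', 'id1', 'id2', 'id2'] and has a load-more
# link, extract(resp) returns (['id1', 'id2'],
# 'https://youtube.com/<continuation-path>'); on the last page the second
# element is None.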
def find_overlap(haystack, needle):
    # naive subsequence search: index where needle starts in haystack, or -1
    for i in range(len(haystack) - len(needle) + 1):
        if haystack[i : i + len(needle)] == needle:
            return i
    return -1
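# Quick sanity check of find_overlap (illustrative, not in the original):
#   find_overlap(['a', 'b', 'c', 'd'], ['b', 'c']) == 1   # needle starts at 1
#   find_overlap(['a', 'b', 'c'], ['z']) == -1            # not present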
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--cookie-jar',
                        help='cookie file (Netscape format cookies.txt)')
    parser.add_argument('-c', '--cookies',
                        help='cookies ("key1=val1; key2=val2")')
    parser.add_argument('--max', type=int, default=-1,
                        help='maximum count to list (negative means no limit)')
    parser.add_argument('--since',
                        help='file of comma-delimited list of ids, stop when reached')
    args = parser.parse_args()

    if args.cookie_jar:
        cookies = cookielib.MozillaCookieJar(args.cookie_jar)
        try:
            cookies.load()
        except IOError:
            # if it doesn't exist, that's ok. probably just want to write to it
            pass
    else:
        cookies = cookielib.CookieJar()
    if args.cookies:
        for kv in args.cookies.split('; '):
            k, v = kv.split('=', 1)
            cookies.set_cookie(mincookie(k, v))

    ids = []
    url = 'https://youtube.com/feed/history'
    sess = requests.Session()
    sincelist = ""
    if args.since:
        try:
            sincefile = open(args.since, "rb")
            sincelist = sincefile.read().strip()
            sincefile.close()
        except IOError:
            # no since-file yet (first run): fetch the whole history
            pass
    while (args.max < 0 or len(ids) < args.max) and url is not None:
        resp = sess.get(url, cookies=cookies)
        resp.raise_for_status()
        newids, url = extract(resp)
        ids.extend(newids)
        if sincelist != "":
            idx = find_overlap(ids, sincelist.split(','))
            if idx >= 0:
                # everything from idx onward was already archived last run
                ids = ids[:idx]
                break
    # the feed is newest-first; reverse so output is chronological
    ids.reverse()
    rawfile = open("rawfile.txt", "wb")
    for vid in ids:
        print "https://youtube.com/watch?v=" + vid
        rawfile.write(vid + "\n")
    rawfile.close()
    if args.cookie_jar:
        cookies.save()
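Run on its own (outside archiver.sh), ythistory.py just prints watch URLs to stdout; for example, assuming a cookie file named cookiefile in the current directory:

python2 ./ythistory.py -j cookiefile --max 50 > videolist.txt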