stefansundin · April 16, 2016 01:42
diff --git a/guess-ytlive-url.py b/guess-ytlive-url.py
 # https://gist.github.com/stefansundin/6261442
 # Python 3.x
 # Binary search for the correct clen parameter for YouTube live streams, which is useful when a segment is no longer listed in the playlist file (check EXT-X-EARLIEST-MEDIA-SEQUENCE).
 # It's quick and dirty. clen is always a multiple of 188 (since it is TS). lmt and dur do not seem to have any impact.
 # Usually finds the correct url in about 15 tries.
 # See http://stefansundin.com/blog/452#comments
 # TODO: add threading to process multiple sequence numbers at the same time.

 """
 Regexps to extract clen and dur from a playlist:

 Search: #.+\n
 Replace with empty string

 Search : .+clen=(\d+).+
 Replace: \1

 Search : .+clen=(\d+).+dur=(\d).+
 Replace: \1\t\2


 Some dur variations:

 dur=8.333: 30 fps, 249 frames, 396800 audio samples
 same for dur=8.334

 dur=6.007: 30 fps, 179 frames, 284800 audio samples
 dur=3.003: 30 fps,  89 frames, 140800 audio samples

 dur=6: 25 fps, 148 frames, 284160 audio samples
 dur=4: 25 fps,  73 frames, 140160 audio samples

 audio is always 48000 Hz
 """

 import re, urllib.request, sys, time

 def verb(text):
 	"""Print text, but only while the module-level verbose flag is truthy."""
 	if not verbose:
 		return
 	print(text)


 # I get '403 Forbidden' for some streams, but adding a cookie fixes that. Happens randomly.

 # Opener used for every segment request made further down.
 opener = urllib.request.build_opener()
 # Replace the opener's header list with a single Cookie header.
 opener.addheaders = [("Cookie", "VISITOR_INFO1_LIVE=ktfPrjH8eoc;")]


 # Template segment url. The script only rewrites the /sq/<n>/ path component and
 # the clen=<n> query parameter; everything else is sent as written here.
 url = "http://www.youtube.com/videoplayback/id/cE00Me8FOo0.1/itag/94/source/yt_live_broadcast/sq/4760/file/seg.ts?ratebypass=yes&cmbypass=yes&newshard=yes&hls_chunk_host=www.youtube.com&gir=yes&dg_shard=cE00Me8FOo0.1_94&playlist_type=DVR&maudio=1&pmbypass=yes&cp=U0hWS1hQT19HUENONl9QSlNKOm1xZVN0bkk3am1l&upn=aaTidjy5_gs&fexp=900064,910825,906000,909546,906397,929117,929121,929906,929907,929922,929127,929129,929131,929930,925720,925722,925718,925714,929917,929919,929933,912521,932306,913428,904830,919373,930803,908536,904122,938701,911423,909549,900816,912711,935802,904494&sver=3&cpn=uD8wf5_9faz8nSDl&ip=130.240.207.250&ipbits=8&expire=1376851926&sparams=ip,ipbits,expire,id,itag,source,ratebypass,live,cmbypass,newshard,hls_chunk_host,gir,dg_shard,playlist_type,maudio,pmbypass,cp&signature=4DF70AF061A666E47201AA68C65CD1313CEEE54C.979D6A004FE7B53E28C2E1FF271ACFE86ED02FEC&key=dg_yt0&live=1&lmt=1376829143883697&clen=647848&dur=6.007"

 # Segments start .. start+num-1 are fetched, one after the other.
 start = 1100
 num = 5



 # Note: you can't change the itag value since the signature will be incorrect, get a new url

 # Report which quality the url asks for. The itag is only read here, never rewritten.
 # Raw string: "\d" in a plain literal is an invalid escape sequence.
 itag_match = re.search(r"itag/(\d+)/", url)
 if itag_match is None:
 	# Keep the name bound, as before, when the url carries no itag.
 	itag = None
 	print("Warning: Could not find itag value (quality)")
 else:
 	itag = itag_match.group(1)
 	# Known itag values and the quality label printed for each.
 	itags = {"92":"240p", "93":"360p", "94":"480p", "95":"720p", "96":"1080p"}
 	if itag in itags:
 		print("Note: itag="+itag+" ("+itags[itag]+")")
 	else:
 		print("Warning: Unknown itag="+itag)


 # Debug-output switch read by verb(). It never changes, so it is set once
 # here instead of on every pass through the loop.
 verbose = True

 # For each requested segment, binary-search the clen value the server accepts,
 # then save the matching download as <segid>.ts.
 for segid in range(start,start+num):
 	# Point the url at this segment's sequence number.
 	url = re.sub(r"/sq/\d+/", "/sq/"+str(segid)+"/", url)


 	# Search bounds, in units of 188 bytes (they are multiplied by 188 to get clen).
 	guess_min = 10
 	guess_max = 22310 # for full HD a segment can be about 4 MB


 	# sanity checks (disabled)
 	# these checks do not always work for some reason (especially in HD)
 	#
 	# clen = 188*guess_min
 	# verb("sanity check, guess clen="+str(clen))
 	# url = re.sub(r"clen=\d+", "clen="+str(clen), url)
 	# f = opener.open(url)
 	# data = f.read(100)
 	# f.close()
 	# if len(data) == 0:
 	# 	print("segid="+str(segid)+": clen="+str(clen)+" did not work, this segment is probably no longer accessible.")
 	# 	sys.exit(1)
 	#
 	# clen = 188*guess_max
 	# verb("sanity check, guess clen="+str(clen))
 	# url = re.sub(r"clen=\d+", "clen="+str(clen), url)
 	# f = opener.open(url)
 	# data = f.read(100)
 	# f.close()
 	# if len(data) != 0:
 	# 	print("segid="+str(segid)+": clen="+str(clen)+" worked, please increase guess_max.")
 	# 	sys.exit(1)


 	# Payload of the most recent successful guess; None until one guess succeeds.
 	chunks = None
 	for i in range(100):
 		verb("guess_min: "+str(guess_min))
 		verb("guess_max: "+str(guess_max))

 		# Bounds are adjacent: guess_min is the largest value that worked.
 		if guess_min+1 == guess_max:
 			clen = 188*guess_min
 			break

 		# Midpoint of the bounds; floor division keeps everything in integers.
 		clen = 188*(guess_min+(guess_max-guess_min)//2)
 		verb("guess clen="+str(clen))

 		url = re.sub(r"clen=\d+", "clen="+str(clen), url)
 		print(url)
 		newchunks = []
 		recvsize = 0
 		# The with-block closes the response even if a read raises.
 		with opener.open(url) as f:
 			while True:
 				data = f.read(100000)
 				if not data:
 					break
 				newchunks.append(data)
 				recvsize += len(data)

 		# A guess counts as a success when exactly clen bytes came back.
 		if recvsize == clen:
 			verb("success, clen might be larger")
 			guess_min = clen//188
 			chunks = newchunks
 		else:
 			verb("fail, clen is smaller")
 			guess_max = clen//188
 		verb("")
 	else:
 		# The loop ran out of iterations without the bounds becoming adjacent.
 		print("segid="+str(segid)+": Sorry, couldn't find it after "+str(i+1)+" tries. Something is wrong.")
 		sys.exit(1)

 	verb("")
 	if chunks is None:
 		print("Could not find a valid url, none worked.")
 		sys.exit(1)

 	verb("clen="+str(clen))
 	verb("Found the correct url after "+str(i+1)+" tries:")
 	url = re.sub(r"clen=\d+", "clen="+str(clen), url)
 	print("wget -O "+str(segid)+".ts \""+url+"\"")
 	verb("")

 	# write file
 	with open(str(segid)+".ts", "wb") as f:
 		f.writelines(chunks)
	# https://gist.github.com/stefansundin/6261442
	# Python 3.x
	# Binary search for the correct clen parameter for YouTube live streams, which is useful when a segment is no longer listed in the playlist file (check EXT-X-EARLIEST-MEDIA-SEQUENCE).
	# It's quick and dirty. clen is always a multiple of 188 (since it is TS). lmt and dur do not seem to have any impact.
	# Usually finds the correct url in about 15 tries.
	# See http://stefansundin.com/blog/452#comments
	# TODO: add threading to process multiple sequence numbers at the same time.

	"""
	Regexps to extract clen and dur from a playlist:

	Search: #.+\n
	Replace with empty string

	Search : .+clen=(\d+).+
	Replace: \1

	Search : .+clen=(\d+).+dur=(\d).+
	Replace: \1\t\2


	Some dur variations:

	dur=8.333: 30 fps, 249 frames, 396800 audio samples
	same for dur=8.334

	dur=6.007: 30 fps, 179 frames, 284800 audio samples
	dur=3.003: 30 fps, 89 frames, 140800 audio samples

	dur=6: 25 fps, 148 frames, 284160 audio samples
	dur=4: 25 fps, 73 frames, 140160 audio samples

	audio is always 48000 Hz
	"""

	import re, urllib.request, sys, time

	def verb(text):
		"""Print text when the module-level verbose flag is truthy."""
		if verbose:
			print(text)


	# I get '403 Forbidden' for some streams, but adding a cookie fixes that. Happens randomly.

	# Opener used for every segment request made further down.
	opener = urllib.request.build_opener()
	# Replace the opener's header list with a single Cookie header.
	opener.addheaders = [("Cookie", "VISITOR_INFO1_LIVE=ktfPrjH8eoc;")]


	# Template segment url. The script only rewrites the /sq/<n>/ path component and
	# the clen=<n> query parameter; everything else is sent as written here.
	url = "http://www.youtube.com/videoplayback/id/cE00Me8FOo0.1/itag/94/source/yt_live_broadcast/sq/4760/file/seg.ts?ratebypass=yes&cmbypass=yes&newshard=yes&hls_chunk_host=www.youtube.com&gir=yes&dg_shard=cE00Me8FOo0.1_94&playlist_type=DVR&maudio=1&pmbypass=yes&cp=U0hWS1hQT19HUENONl9QSlNKOm1xZVN0bkk3am1l&upn=aaTidjy5_gs&fexp=900064,910825,906000,909546,906397,929117,929121,929906,929907,929922,929127,929129,929131,929930,925720,925722,925718,925714,929917,929919,929933,912521,932306,913428,904830,919373,930803,908536,904122,938701,911423,909549,900816,912711,935802,904494&sver=3&cpn=uD8wf5_9faz8nSDl&ip=130.240.207.250&ipbits=8&expire=1376851926&sparams=ip,ipbits,expire,id,itag,source,ratebypass,live,cmbypass,newshard,hls_chunk_host,gir,dg_shard,playlist_type,maudio,pmbypass,cp&signature=4DF70AF061A666E47201AA68C65CD1313CEEE54C.979D6A004FE7B53E28C2E1FF271ACFE86ED02FEC&key=dg_yt0&live=1&lmt=1376829143883697&clen=647848&dur=6.007"

	# Segments start .. start+num-1 are fetched, one after the other.
	start = 1100
	num = 5



	# Note: you can't change the itag value since the signature will be incorrect, get a new url

	# Report which quality the url asks for. The itag is only read here, never rewritten.
	# Raw string: "\d" in a plain literal is an invalid escape sequence.
	itag_match = re.search(r"itag/(\d+)/", url)
	if itag_match is None:
		# Keep the name bound, as before, when the url carries no itag.
		itag = None
		print("Warning: Could not find itag value (quality)")
	else:
		itag = itag_match.group(1)
		# Known itag values and the quality label printed for each.
		itags = {"92":"240p", "93":"360p", "94":"480p", "95":"720p", "96":"1080p"}
		if itag in itags:
			print("Note: itag="+itag+" ("+itags[itag]+")")
		else:
			print("Warning: Unknown itag="+itag)


	# Debug-output switch read by verb(). It never changes, so it is set once
	# here instead of on every pass through the loop.
	verbose = True

	# For each requested segment, binary-search the clen value the server accepts,
	# then save the matching download as <segid>.ts.
	for segid in range(start,start+num):
		# Point the url at this segment's sequence number.
		url = re.sub(r"/sq/\d+/", "/sq/"+str(segid)+"/", url)


		# Search bounds, in units of 188 bytes (they are multiplied by 188 to get clen).
		guess_min = 10
		guess_max = 22310 # for full HD a segment can be about 4 MB


		# sanity checks (disabled)
		# these checks do not always work for some reason (especially in HD)
		#
		# clen = 188*guess_min
		# verb("sanity check, guess clen="+str(clen))
		# url = re.sub(r"clen=\d+", "clen="+str(clen), url)
		# f = opener.open(url)
		# data = f.read(100)
		# f.close()
		# if len(data) == 0:
		# 	print("segid="+str(segid)+": clen="+str(clen)+" did not work, this segment is probably no longer accessible.")
		# 	sys.exit(1)
		#
		# clen = 188*guess_max
		# verb("sanity check, guess clen="+str(clen))
		# url = re.sub(r"clen=\d+", "clen="+str(clen), url)
		# f = opener.open(url)
		# data = f.read(100)
		# f.close()
		# if len(data) != 0:
		# 	print("segid="+str(segid)+": clen="+str(clen)+" worked, please increase guess_max.")
		# 	sys.exit(1)


		# Payload of the most recent successful guess; None until one guess succeeds.
		chunks = None
		for i in range(100):
			verb("guess_min: "+str(guess_min))
			verb("guess_max: "+str(guess_max))

			# Bounds are adjacent: guess_min is the largest value that worked.
			if guess_min+1 == guess_max:
				clen = 188*guess_min
				break

			# Midpoint of the bounds; floor division keeps everything in integers.
			clen = 188*(guess_min+(guess_max-guess_min)//2)
			verb("guess clen="+str(clen))

			url = re.sub(r"clen=\d+", "clen="+str(clen), url)
			print(url)
			newchunks = []
			recvsize = 0
			# The with-block closes the response even if a read raises.
			with opener.open(url) as f:
				while True:
					data = f.read(100000)
					if not data:
						break
					newchunks.append(data)
					recvsize += len(data)

			# A guess counts as a success when exactly clen bytes came back.
			if recvsize == clen:
				verb("success, clen might be larger")
				guess_min = clen//188
				chunks = newchunks
			else:
				verb("fail, clen is smaller")
				guess_max = clen//188
			verb("")
		else:
			# The loop ran out of iterations without the bounds becoming adjacent.
			print("segid="+str(segid)+": Sorry, couldn't find it after "+str(i+1)+" tries. Something is wrong.")
			sys.exit(1)

		verb("")
		if chunks is None:
			print("Could not find a valid url, none worked.")
			sys.exit(1)

		verb("clen="+str(clen))
		verb("Found the correct url after "+str(i+1)+" tries:")
		url = re.sub(r"clen=\d+", "clen="+str(clen), url)
		print("wget -O "+str(segid)+".ts \""+url+"\"")
		verb("")

		# write file
		with open(str(segid)+".ts", "wb") as f:
			f.writelines(chunks)