guanix · March 8, 2011 19:03
diff --git a/timestamp.py b/timestamp.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Author: Ricardo Garcia Gonzalez
 # Author: Danny Colligan
 # Author: Benjamin Johnson
 # Author: Vasyl' Vavrychuk
 # Author: Witold Baryluk
 # Author: Paweł Paprota
 # Author: Guan Yang
 # Author: David Triendl
 # License: Public domain code

 # Target duration, in seconds, of one fetch-one-timestamp cycle.
 TIME = 1.1

 import datetime
 import email.utils
 import htmlentitydefs
 import os, sys, time
 import timeit
 import re
 import socket
 import urllib
 import urllib2
 import tempfile
 import httplib

 class YahooException(Exception):
    """Error raised when Yahoo! Video data cannot be fetched or parsed.

    The value passed to the constructor (normally a message string) is kept
    in ``parameter``; ``str()`` of the exception is the ``repr()`` of it.
    """

    def __init__(self, value):
        # Initialise the base class as well so that ``args`` is populated
        # like it is for any other exception.
        super(YahooException, self).__init__(value)
        self.parameter = value

    def __str__(self):
        return repr(self.parameter)

 class HeadRequest(urllib2.Request):
    """A ``urllib2.Request`` that is always sent with the HEAD method."""

    def get_method(self):
        """Return the HTTP verb for this request, which is always HEAD."""
        verb = "HEAD"
        return verb

 def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity body,
    i.e. the text between '&' and ';'.

    Named entities (amp, eacute, ...) and numeric references, both decimal
    (#233) and hexadecimal (#xe9, #XE9), are decoded. Anything that is not
    a valid entity is returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The hexadecimal branch must accept the
    # digits a-f, and the whole entity body has to match so that something
    # like '#12abc' is not mistaken for code point 12.
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        if hexstr is not None:
            codepoint = long(hexstr, 16)
        else:
            codepoint = long(decstr, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through and keep the
            # text as it was instead of aborting the whole substitution.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

 def timeconvert(timestr):
    """Turn an RFC 2822 date string into seconds since the Unix epoch.

    Returns None when the string cannot be parsed as a date.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)

 def get_cdn_url(video_id):
    """Returns the CDN URL for a given video ID.

    Requests the playlist of *video_id* (a string) from the Yahoo! Video
    playlist service and builds the video URL from the APP and FULLPATH
    attributes of its <STREAM> element. The approach is taken from
    youtube-dl.

    Raises YahooException if the playlist cannot be downloaded or if no
    stream entry can be found in it.
    """
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'
    yv_bitrate = '700'
    yv_video_height = '200'
    yv_video_width = '200'
    request = urllib2.Request(
        'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
        + '&tech=flash&mode=playlist&lg=' + yv_lg
        + '&bitrate=' + yv_bitrate
        + '&vidH=' + yv_video_height
        + '&vidW=' + yv_video_width
        + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')

    try:
        response = urllib2.urlopen(request)
        try:
            webpage = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Could not fetch playlist file')

    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    if mobj is None:
        raise YahooException('Could not parse playlist file')
    # Undo the percent-encoding, decode the bytes as UTF-8 and finally
    # resolve any HTML entities left in the URL.
    video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

    return video_url

 def get_timestamp(cdn_url):
    """Returns the timestamp of a video file.

    Sends a HEAD request for *cdn_url* and converts the Last-Modified
    response header into an integer number of seconds since the beginning
    of the Unix epoch.

    Raises YahooException if the request fails, if the response carries no
    Last-Modified header, or if that header cannot be parsed as a date.
    """
    request = HeadRequest(cdn_url)
    try:
        result = urllib2.urlopen(request)
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Failed to fetch information from CDN')

    try:
        timestring = result.info().get('last-modified')
    finally:
        # Only the headers are needed; release the connection right away.
        result.close()
    if timestring is None:
        raise YahooException('Yahoo failed to mention when the file was last-modified.')

    timestamp = timeconvert(timestring)
    if timestamp is None:
        raise YahooException('Stupid Yahoo, why you no learn how to use a calendar?')

    return int(timestamp)

 def fetch_timestamp(video_id):
    """Resolve *video_id* to its CDN URL and return that file's timestamp."""
    return get_timestamp(get_cdn_url(video_id))

 # Replace socket.socket with a wrapper that binds every new socket to the
 # address named by the BIND environment variable, or to 0.0.0.0 when BIND
 # is unset or empty.
 true_socket = socket.socket
 def bound_socket(*args, **kwargs):
    """Create a socket with the original constructor and bind it to port 0."""
    sock = true_socket(*args, **kwargs)
    address = os.environ.get('BIND') or "0.0.0.0"
    sock.bind((address, 0))
    return sock
 socket.socket = bound_socket

 # Usage: ./timestamp.py missing-timestamps.txt
 # Reads "<user id> <video id>" lines from the given file and writes
 # "<user id> <video id> <timestamp>" lines to stdout; progress goes to
 # stderr.
 if len(sys.argv) != 2:
    sys.stderr.write("./timestamp.py missing-timestamps.txt\n")
    sys.exit(1)

 # Lines that do not start with two space-separated numbers are skipped.
 ro = re.compile(r"^(\d+) (\d+)")

 fd = open(sys.argv[1], "r")
 try:
    for line in fd:
        # Use wall-clock time: time.clock() counts CPU time on Unix, which
        # does not include the time spent waiting for the network.
        t1 = time.time()
        mo = ro.match(line)
        if not mo: continue

        user_id = mo.group(1)
        video_id = mo.group(2)

        ts = fetch_timestamp(video_id)
        sys.stdout.write("%s %s %d\n" % (user_id, video_id, ts))
        sys.stdout.flush()
        sys.stderr.write("timestamp for video %s was %d. " % (video_id, ts))
        t2 = time.time()

        # Pad each iteration up to TIME seconds; if the fetch alone already
        # took longer than that, still pause briefly before the next one.
        delta = t2 - t1
        if delta < TIME:
            slp = TIME - delta
        else:
            slp = 0.1

        sys.stderr.write("took %0.4f s, sleeping for %0.4f s\n" % (delta, slp))
        # Sleep for the computed pause; TIME - delta would be negative (and
        # make time.sleep() raise) whenever a fetch exceeded TIME.
        time.sleep(slp)
 finally:
    # Close the input file even if a fetch raised.
    fd.close()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# Author: Ricardo Garcia Gonzalez
	# Author: Danny Colligan
	# Author: Benjamin Johnson
	# Author: Vasyl' Vavrychuk
	# Author: Witold Baryluk
	# Author: Paweł Paprota
	# Author: Guan Yang
	# Author: David Triendl
	# License: Public domain code

	# Target duration, in seconds, of one fetch-one-timestamp cycle.
	TIME = 1.1

	import datetime
	import email.utils
	import htmlentitydefs
	import os, sys, time
	import timeit
	import re
	import socket
	import urllib
	import urllib2
	import tempfile
	import httplib

	class YahooException(Exception):
		"""Error raised when Yahoo! Video data cannot be fetched or parsed.

		The value passed to the constructor (normally a message string) is
		kept in ``parameter``; ``str()`` of the exception is the ``repr()``
		of it.
		"""

		def __init__(self, value):
			# Initialise the base class as well so that ``args`` is
			# populated like it is for any other exception.
			super(YahooException, self).__init__(value)
			self.parameter = value

		def __str__(self):
			return repr(self.parameter)

	class HeadRequest(urllib2.Request):
		"""A ``urllib2.Request`` that is always sent with the HEAD method."""

		def get_method(self):
			"""Return the HTTP verb for this request, which is always HEAD."""
			return "HEAD"

	def htmlentity_transform(matchobj):
		"""Transforms an HTML entity to a Unicode character.

		This function receives a match object and is intended to be used with
		the re.sub() function. Group 1 of the match must hold the entity body,
		i.e. the text between '&' and ';'.

		Named entities (amp, eacute, ...) and numeric references, both decimal
		(#233) and hexadecimal (#xe9, #XE9), are decoded. Anything that is not
		a valid entity is returned unchanged in its literal '&...;' form.
		"""
		entity = matchobj.group(1)

		# Known non-numeric HTML entity
		if entity in htmlentitydefs.name2codepoint:
			return unichr(htmlentitydefs.name2codepoint[entity])

		# Numeric character reference. The hexadecimal branch must accept the
		# digits a-f, and the whole entity body has to match so that something
		# like '#12abc' is not mistaken for code point 12.
		mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
		if mobj is not None:
			hexstr, decstr = mobj.groups()
			if hexstr is not None:
				codepoint = long(hexstr, 16)
			else:
				codepoint = long(decstr, 10)
			try:
				return unichr(codepoint)
			except (ValueError, OverflowError):
				# Not a representable code point: fall through and keep the
				# text as it was instead of aborting the whole substitution.
				pass

		# Unknown entity in name, return its literal representation
		return (u'&%s;' % entity)

	def timeconvert(timestr):
		"""Convert RFC 2822 defined time string into system timestamp.

		Returns the number of seconds since the Unix epoch, or None when
		*timestr* cannot be parsed as a date.
		"""
		timetuple = email.utils.parsedate_tz(timestr)
		if timetuple is None:
			return None
		return email.utils.mktime_tz(timetuple)

	def get_cdn_url(video_id):
		"""Returns the CDN URL for a given video ID.

		Requests the playlist of *video_id* (a string) from the Yahoo! Video
		playlist service and builds the video URL from the APP and FULLPATH
		attributes of its <STREAM> element. The approach is taken from
		youtube-dl.

		Raises YahooException if the playlist cannot be downloaded or if no
		stream entry can be found in it.
		"""
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'
		yv_bitrate = '700'
		yv_video_height = '200'
		yv_video_width = '200'
		request = urllib2.Request(
			'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
			+ '&tech=flash&mode=playlist&lg=' + yv_lg
			+ '&bitrate=' + yv_bitrate
			+ '&vidH=' + yv_video_height
			+ '&vidW=' + yv_video_width
			+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')

		try:
			response = urllib2.urlopen(request)
			try:
				webpage = response.read()
			finally:
				# Always release the connection, even if reading fails.
				response.close()
		except (urllib2.URLError, httplib.HTTPException, socket.error):
			raise YahooException('Could not fetch playlist file')

		# Both groups need the '*' quantifier: APP holds a whole base URL and
		# FULLPATH a whole path, not a single character each.
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		if mobj is None:
			raise YahooException('Could not parse playlist file')
		# Undo the percent-encoding, decode the bytes as UTF-8 and finally
		# resolve any HTML entities left in the URL.
		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

		return video_url

	def get_timestamp(cdn_url):
		"""Returns the timestamp of a video file.

		Sends a HEAD request for *cdn_url* and converts the Last-Modified
		response header into an integer number of seconds since the beginning
		of the Unix epoch.

		Raises YahooException if the request fails, if the response carries no
		Last-Modified header, or if that header cannot be parsed as a date.
		"""
		request = HeadRequest(cdn_url)
		try:
			result = urllib2.urlopen(request)
		except (urllib2.URLError, httplib.HTTPException, socket.error):
			raise YahooException('Failed to fetch information from CDN')

		try:
			timestring = result.info().get('last-modified')
		finally:
			# Only the headers are needed; release the connection right away.
			result.close()
		if timestring is None:
			raise YahooException('Yahoo failed to mention when the file was last-modified.')

		timestamp = timeconvert(timestring)
		if timestamp is None:
			raise YahooException('Stupid Yahoo, why you no learn how to use a calendar?')

		return int(timestamp)

	def fetch_timestamp(video_id):
		"""Resolve *video_id* to its CDN URL and return that file's timestamp."""
		cdn_url = get_cdn_url(video_id)
		return get_timestamp(cdn_url)

	# Replace socket.socket with a wrapper that binds every new socket to the
	# address named by the BIND environment variable, or to 0.0.0.0 when BIND
	# is unset or empty.
	true_socket = socket.socket
	def bound_socket(*a, **k):
		"""Create a socket with the original constructor and bind it to port 0.

		All positional and keyword arguments are passed through unchanged, so
		the wrapper can be called exactly like socket.socket itself.
		"""
		sock = true_socket(*a, **k)
		ip = os.environ.get('BIND')
		if not ip:
			ip = "0.0.0.0"
		sock.bind((ip, 0))
		return sock
	socket.socket = bound_socket

	# Usage: ./timestamp.py missing-timestamps.txt
	# Reads "<user id> <video id>" lines from the given file and writes
	# "<user id> <video id> <timestamp>" lines to stdout; progress goes to
	# stderr.
	if len(sys.argv) != 2:
		sys.stderr.write("./timestamp.py missing-timestamps.txt\n")
		sys.exit(1)

	# Lines that do not start with two space-separated numbers are skipped.
	ro = re.compile(r"^(\d+) (\d+)")

	fd = open(sys.argv[1], "r")
	try:
		for line in fd:
			# Use wall-clock time: time.clock() counts CPU time on Unix,
			# which does not include the time spent waiting for the network.
			t1 = time.time()
			mo = ro.match(line)
			if not mo: continue

			user_id = mo.group(1)
			video_id = mo.group(2)

			ts = fetch_timestamp(video_id)
			sys.stdout.write("%s %s %d\n" % (user_id, video_id, ts))
			sys.stdout.flush()
			sys.stderr.write("timestamp for video %s was %d. " % (video_id, ts))
			t2 = time.time()

			# Pad each iteration up to TIME seconds; if the fetch alone
			# already took longer than that, still pause briefly.
			delta = t2 - t1
			if delta < TIME:
				slp = TIME - delta
			else:
				slp = 0.1

			sys.stderr.write("took %0.4f s, sleeping for %0.4f s\n" % (delta, slp))
			# Sleep for the computed pause; TIME - delta would be negative
			# (and make time.sleep() raise) whenever a fetch exceeded TIME.
			time.sleep(slp)
	finally:
		# Close the input file even if a fetch raised.
		fd.close()