Created
March 8, 2011 19:03
-
-
Save guanix/860790 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Guan Yang
# Author: David Triendl
# License: Public domain code
# Target duration, in seconds, of one fetch iteration: when a fetch finishes
# sooner than this, the main loop sleeps for the remainder.
TIME = 1.1
import datetime | |
import email.utils | |
import htmlentitydefs | |
import os, sys, time | |
import timeit | |
import re | |
import socket | |
import urllib | |
import urllib2 | |
import tempfile | |
import httplib | |
class YahooException(Exception):
    """Error raised when talking to Yahoo! Video fails.

    The value passed to the constructor is kept in ``parameter`` and is also
    forwarded to ``Exception.__init__`` so that ``args`` is populated (which
    keeps the exception usable with generic handlers and pickling).
    """

    def __init__(self, value):
        super(YahooException, self).__init__(value)
        self.parameter = value

    def __str__(self):
        # Deliberately the repr() of the stored value, so a string message is
        # rendered with its surrounding quotes.
        return repr(self.parameter)
class HeadRequest(urllib2.Request):
    """A ``urllib2.Request`` that is sent with the HTTP ``HEAD`` method."""

    def get_method(self):
        """Return the HTTP method name for this request, always ``"HEAD"``."""
        return "HEAD"
# Python 2/3 compatibility: the named-entity table and the "code point to
# character" builtin live under different names in the two major versions.
try:
    from htmlentitydefs import name2codepoint as _NAME2CODEPOINT
except ImportError:  # Python 3 moved the table to html.entities
    from html.entities import name2codepoint as _NAME2CODEPOINT

try:
    _unichr = unichr
except NameError:  # Python 3: chr() already returns a Unicode character
    _unichr = chr

# Numeric character reference body: "#65" (decimal) or "#x41"/"#X41" (hex).
# Anchored at the end so that malformed references such as "#12abc" are not
# partially decoded.
_NUMERIC_ENTITY_RE = re.compile(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))\Z')


def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it denotes.

    This function receives a match object and is intended to be used as the
    replacement callback of ``re.sub()``; group 1 of the match must hold the
    entity text between ``&`` and ``;``.

    Named entities are looked up in the standard entity table, numeric
    references are decoded in decimal or hexadecimal.  Anything that cannot be
    decoded (unknown name, malformed number, code point out of range) is
    returned unchanged as ``&entity;``.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in _NAME2CODEPOINT:
        return _unichr(_NAME2CODEPOINT[entity])

    # Numeric character reference
    mobj = _NUMERIC_ENTITY_RE.match(entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        if hex_digits is not None:
            codepoint = int(hex_digits, 16)
        else:
            codepoint = int(dec_digits, 10)
        try:
            return _unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point is not representable; fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return u'&%s;' % entity
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a Unix timestamp.

    Returns the timestamp computed by ``email.utils.mktime_tz``, or ``None``
    when ``email.utils.parsedate_tz`` cannot parse ``timestr``.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
def get_cdn_url(video_id):
    """Return the CDN URL for a given video ID.

    Fetches the playlist document for ``video_id`` from Yahoo! Video, pulls
    the ``APP`` and ``FULLPATH`` attributes of its ``STREAM`` element out with
    a regular expression and joins them into the video URL.  The result is
    URL-unquoted, decoded as UTF-8 and has its HTML entities replaced.  Code
    shamelessly stolen from youtube-dl.

    ``video_id`` must be a string; it is concatenated into the request URL
    as-is.

    Raises YahooException if the playlist cannot be fetched or does not
    contain a recognizable ``STREAM`` element.
    """
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'
    yv_bitrate = '700'
    yv_video_height = '200'
    yv_video_width = '200'
    playlist_url = (
        'http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
        + '&tech=flash&mode=playlist&lg=' + yv_lg
        + '&bitrate=' + yv_bitrate
        + '&vidH=' + yv_video_height
        + '&vidW=' + yv_video_width
        + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797'
    )
    request = urllib2.Request(playlist_url)
    try:
        response = urllib2.urlopen(request)
        try:
            webpage = response.read()
        finally:
            # Release the connection explicitly instead of relying on
            # garbage collection.
            response.close()
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Could not fetch playlist file')

    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    if mobj is None:
        raise YahooException('Could not parse playlist file')

    video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
    return video_url
def get_timestamp(cdn_url):
    """Return the timestamp of a video file.

    Sends a HEAD request for ``cdn_url`` and converts the ``Last-Modified``
    response header into an integer number of seconds since the beginning of
    the Unix epoch.

    Raises YahooException if the request fails, the header is absent, or the
    header value cannot be parsed as a date.
    """
    request = HeadRequest(cdn_url)
    try:
        result = urllib2.urlopen(request)
    except (urllib2.URLError, httplib.HTTPException, socket.error):
        raise YahooException('Failed to fetch information from CDN')

    try:
        timestring = result.info().get('last-modified')
    finally:
        # Only the headers are needed; release the connection right away.
        result.close()

    if timestring is None:
        raise YahooException('Yahoo failed to mention when the file was last-modified.')

    timestamp = timeconvert(timestring)
    if timestamp is None:
        raise YahooException('Stupid Yahoo, why you no learn how to use a calendar?')
    return int(timestamp)
def fetch_timestamp(video_id):
    """Return the timestamp reported by ``get_timestamp`` for a video ID.

    The ID is first resolved to its CDN URL with ``get_cdn_url``; exceptions
    raised by either step propagate unchanged.
    """
    return get_timestamp(get_cdn_url(video_id))
# Route every new socket through bound_socket() so that it is bound to the
# address named by the BIND environment variable ("0.0.0.0" when BIND is
# unset or empty).
true_socket = socket.socket


def bound_socket(*args, **kwargs):
    """Create a socket with the real constructor and bind it before use.

    All arguments are forwarded to the original ``socket.socket``.  The new
    socket is bound to port 0 on the address taken from the ``BIND``
    environment variable, falling back to ``"0.0.0.0"``.
    """
    sock = true_socket(*args, **kwargs)
    local_ip = os.environ.get('BIND') or "0.0.0.0"
    sock.bind((local_ip, 0))
    return sock


socket.socket = bound_socket
# Script entry: read "<user_id> <video_id>" lines from the file named on the
# command line and print "<user_id> <video_id> <timestamp>" for each one,
# pacing the requests so that one iteration takes about TIME seconds.
if len(sys.argv) != 2:
    sys.stderr.write("./timestamp.py missing-timestamps.txt\n")
    sys.exit(1)

ro = re.compile(r"^(\d+) (\d+)")

# The with-block guarantees the input file is closed even if a fetch raises.
with open(sys.argv[1], "r") as fd:
    for line in fd:
        # Wall-clock time is required here: time.clock() reports CPU time on
        # Unix and would not count the time spent waiting on the network.
        t1 = time.time()

        mo = ro.match(line)
        if not mo:
            # Not a "<digits> <digits>" line; skip it.
            continue
        user_id = mo.group(1)
        video_id = mo.group(2)

        ts = fetch_timestamp(video_id)
        sys.stdout.write("%s %s %d\n" % (user_id, video_id, ts))
        sys.stdout.flush()
        sys.stderr.write("timestamp for video %s was %d. " % (video_id, ts))

        t2 = time.time()
        delta = t2 - t1
        if delta < TIME:
            # Sleep away the unused part of the per-request time budget.
            slp = TIME - delta
        else:
            # Budget already exceeded: still pause briefly between requests.
            slp = 0.1
        sys.stderr.write("took %0.4f s, sleeping for %0.4f s\n" % (delta, slp))
        # Sleep for the duration that was just reported; TIME - delta would be
        # negative (and rejected by time.sleep) whenever the fetch was slow.
        time.sleep(slp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment