Skip to content

Instantly share code, notes, and snippets.

@scturtle
Created October 11, 2011 02:59
Show Gist options
  • Save scturtle/1277162 to your computer and use it in GitHub Desktop.
Save scturtle/1277162 to your computer and use it in GitHub Desktop.
Download ml-class subtitles and convert them to SRT.
# coding: utf-8
import os,sys,re,codecs
# NOTE(review): not referenced anywhere in the visible code. The values look
# like the upper bounds of the hh:mm:ss.mmm timestamp fields -- confirm
# before relying on or removing it.
limit=[60,60,60,1000]
def _first_quoted(fragment):
    """Return the text between the first pair of double quotes in *fragment*."""
    start = fragment.index('"')
    stop = fragment.index('"', start + 1)
    return fragment[start + 1:stop]


def _srt_stamp(stamp):
    """Convert an ``hh:mm:ss.mmm`` timestamp to SRT's ``hh:mm:ss,mmm`` form."""
    parts = stamp.split('.')
    fields = [int(piece) for piece in parts[0].split(':')]
    fields.append(int(parts[1]))
    return "%02d:%02d:%02d,%03d" % tuple(fields)


def xml2srt(fi,fo):
    """Convert an XML subtitle file to SRT.

    The first 9 and last 4 lines of the input are discarded as header and
    footer. The remainder is split on ``</p>``; every non-empty piece is
    treated as one cue whose start time is the first double-quoted value in
    the piece and whose text is everything after the first ``>``.

    A cue ends where the next cue begins. The last cue has no successor and
    gets the sentinel end time ``99:59:59.999``.

    Args:
        fi: Readable text file object holding the XML subtitles.
        fo: Writable text file object that receives the SRT output.

    Raises:
        ValueError: If a cue lacks a quoted timestamp or a ``>``, or a
            timestamp field is not an integer.
        IndexError: If a timestamp has no ``.`` fractional part.
    """
    body_lines = fi.read().split('\n')[9:-4]
    # Keep the line breaks while joining so that a caption spanning several
    # source lines becomes space-separated below instead of being glued
    # together ("Hello\nworld" -> "Hello world", not "Helloworld").
    cues = '\n'.join(body_lines).strip().split('</p>')
    # The final piece is whatever follows the last "</p>", never a cue.
    cue_count = len(cues) - 1
    for idx in range(cue_count):
        cue = cues[idx]
        if not cue:
            continue
        begin = _first_quoted(cue)
        if idx + 1 < cue_count:
            end = _first_quoted(cues[idx + 1])
        else:
            end = "99:59:59.999"
        text = cue[cue.index('>') + 1:].replace('\n', ' ')
        fo.write(str(idx + 1) + ' \n')
        fo.write("%s --> %s \n" % (_srt_stamp(begin), _srt_stamp(end)))
        fo.write(text + ' \n\n\n')
if __name__=='__main__':
    # Convert every *.xml file in the current directory to a sibling *.srt,
    # leaving already-existing *.srt files untouched.
    for fn in os.listdir('.'):
        if not fn.endswith('.xml'):
            continue
        stem = fn[:-4]
        print('Converting: %s' % stem)
        srt_name = stem + '.srt'
        # Check before opening anything so a skipped file never leaves an
        # input handle open.
        if os.path.exists(srt_name):
            continue
        # Context managers close both files even if the conversion raises.
        with open(fn, 'r') as fi:
            with open(srt_name, 'w') as fo:
                xml2srt(fi, fo)
        print('Done')
# coding: utf-8
import cookielib,urllib2
from cStringIO import StringIO
from pysqlite2 import dbapi2 as sqlite
import re,os
# a useful function from others
def sqlite2cookie(filename,host):
    """Build a cookie jar from the ``moz_cookies`` table of a SQLite file.

    Rows whose ``host`` column contains *host* are rendered as Netscape
    cookie-file text in memory and then loaded into a ``MozillaCookieJar``.

    Args:
        filename: Path of the SQLite cookie database to read.
        host: Substring to look for in the ``host`` column. It is placed
            inside a SQL ``LIKE`` pattern, so ``%`` and ``_`` in it act as
            wildcards.

    Returns:
        A ``cookielib.MozillaCookieJar`` holding the matching cookies.
    """
    con = sqlite.connect(filename)
    try:
        con.text_factory = str
        cur = con.cursor()
        cur.execute(
            "select host, path, isSecure, expiry, name, value from moz_cookies where host like ?",
            ['%%%s%%' % host])
        rows = cur.fetchall()
    finally:
        # Release the database handle as soon as the rows are in memory.
        con.close()

    # Indexed with a bool / 0-1 integer to obtain the cookie-file flag text.
    ftstr = ["FALSE","TRUE"]
    s = StringIO()
    s.write("""\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.
""")
    for row in rows:
        cookie_host, path, is_secure, expiry, name, value = row
        s.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
            cookie_host, ftstr[cookie_host.startswith('.')], path,
            ftstr[is_secure], expiry, name, value))
    s.seek(0)

    cookie_jar = cookielib.MozillaCookieJar()
    # NOTE(review): _really_load is a private cookielib method, used here to
    # parse from an in-memory buffer instead of a file on disk. The two True
    # arguments are presumably ignore_discard and ignore_expires -- confirm
    # against the cookielib version in use.
    cookie_jar._really_load(s, '', True, True)
    return cookie_jar
# Load the browser cookies for the course site and install an opener so that
# every urllib2 request below sends them.
# NOTE(review): the profile path is specific to one machine; adjust before use.
cookiejar = sqlite2cookie(r'C:\Users\lenovo\AppData\Roaming\Mozilla\Firefox\Profiles\osfuexqh.default\cookies.sqlite','ml-class.org')
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
urllib2.install_opener(opener)

# Fetch the video list page and flatten it to a single line for the regex.
print('Getting page, if it takes too long time, kill me!')
content = urllib2.urlopen('http://www.ml-class.org/course/video/list?mode=view').read()
content = ''.join(content.split('\n'))
print("Page got!")

# Each "file: <value>," entry on the page names one video; the captured
# value still carries its surrounding quote characters, stripped below.
namefinder = re.compile(r"file: ([^,]*),")
found = namefinder.findall(content)
print('Number of files: %d' % len(found))
found = [quoted[1:-1] for quoted in found]

# Download the subtitle XML for every video that is not already on disk.
baseurl = 'http://s3.amazonaws.com/stanford_videos/cs229/subtitles/%s-subtitles.xml'
for i, fn in enumerate(found):
    # Undo the backslash-escaping of apostrophes in the page source.
    fn = fn.replace(r'\'', '\'')
    outfile = fn + '.xml'
    if os.path.exists(outfile):
        continue
    print('Getting: %s' % fn)
    # Download before creating the file: a failed request must not leave an
    # empty .xml behind, because later runs skip any file that exists.
    payload = urllib2.urlopen(baseurl % fn).read()
    fo = open(outfile, 'w')
    try:
        fo.write(payload)
    finally:
        fo.close()
    print('Done: %d %s' % (i, outfile))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment