python 抓取 coursera 字幕
Last active
April 11, 2018 05:43
-
-
Save BadUncleX/f0fb1afdfe3770cd2c74 to your computer and use it in GitHub Desktop.
python 抓取 coursera 字幕 (machine learning)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Converted from Python 2 to Python 3 with 2to3.
# Download all subtitles of the videos on Coursera for the Machine Learning course by Andrew Ng.
# Author: Hebi
# Note: the script writes into a folder named 'subtitle-cn' in the working directory; create it before running.
import base64
import os
import urllib.error
import urllib.parse
import urllib.request
# Base URL of the subtitle endpoint; the lecture number is appended to it.
url = 'https://class.coursera.org/ml-008/lecture/subtitles?q='
# Query suffix appended after the lecture number (presumably selects the
# Chinese plain-text subtitles; swap in the line below for English).
url2 = '_zh&format=txt'
# url2 = '_en&format=txt'
# Directory the subtitle files are written into; created on demand.
OUTPUT_DIR = 'subtitle-cn'


def subtitle_url(index):
    """Return the download URL for the subtitle of lecture number *index*."""
    return url + str(index) + url2


def filename_from_content_disposition(header):
    """Extract a file name from a Content-Disposition header value.

    The name is taken from the second ``;``-separated field (for example
    ``attachment; filename="x.txt"``), stripped of surrounding quotes,
    percent-decoded, and has ``/`` replaced by ``_`` so it cannot point
    into another directory.

    Raises:
        ValueError: if the header is missing or carries no file name.
    """
    if not header:
        raise ValueError('missing Content-Disposition header')
    fields = header.split(';')
    if len(fields) < 2:
        raise ValueError('no filename field in %r' % (header,))
    # partition (not split) so a file name containing '=' is kept whole.
    _, separator, value = fields[1].partition('=')
    if not separator:
        raise ValueError('no filename value in %r' % (header,))
    raw_name = value.strip().strip("\"'")
    name = urllib.parse.unquote(raw_name).replace('/', '_')
    if not name:
        raise ValueError('empty filename in %r' % (header,))
    return name


def download_subtitle(index):
    """Download the subtitle of lecture *index* into OUTPUT_DIR.

    Returns the file name the subtitle was saved under.

    Raises:
        urllib.error.URLError: if the request fails (includes HTTPError).
        ValueError: if the response does not name the file.
    """
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(subtitle_url(index)) as response:
        header = response.info()['Content-Disposition']
        filename = filename_from_content_disposition(header)
        print("filename:", filename)
        page = response.read()
    # read() returns bytes, so the file must be opened in binary mode;
    # writing bytes to a text-mode file raises TypeError on Python 3.
    with open(os.path.join(OUTPUT_DIR, filename), 'wb') as f:
        f.write(page)
    return filename


def main(first=1, last=114):
    """Download the subtitles of lectures *first* through *last* (inclusive).

    A failure on one lecture is reported and the loop moves on to the next.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("Downloaded Subtitle No. ", end=' ')
    for i in range(first, last + 1):
        try:
            download_subtitle(i)
        except (urllib.error.HTTPError, urllib.error.URLError):
            print('\nProblem Downloading file: ', i)
            print('OR Connection Error')
        except ValueError as e:
            # The server answered but did not tell us what to call the file.
            print('\nProblem Downloading file: ', i)
            print('Unusable Content-Disposition header:', e)
        else:
            print(i, end=' ')
    print('\nDownload Finished\nHappy Coding!!!')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment