jikexueyuan video downloader
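A Python 2 script that scrapes jikexueyuan.com course pages, collects each lesson's video URL, and downloads the .mp4 files in parallel. It has three entry points: geturls_course for a single course, geturls_series for a series (xilie) page, and geturls_career_path for a career-path (zhiye) page. It depends on requests and lxml; the hard-coded Cookie header carries the author's own login session, so it presumably needs to be replaced with your own before use.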
# coding: utf-8
import requests
from lxml import etree
import os, time, sys
import cPickle as pk

reload(sys)
sys.setdefaultencoding('utf-8') # to avoid encoding problems

# request headers sent with every page fetch; the Cookie is a logged-in session
# (replace it with your own if requests come back unauthorized)
hea = {
    'Host': 'www.jikexueyuan.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.90 Safari/537.36',
    'Connection': 'keep-alive',
    'Cookie': 'gr_user_id=87ea5cdc-d227-4bed-8e9d-369149cc3fda; stat_uuid=1468588589952565216740; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221560a141fc686-0baf1a4503416-24414032-15f900-1560a141fc88%22%7D; Hm_lvt_f3c68d41bda15331608595c98e9c3915=1469047902; looyu_id=e0e3fc304a4c5523ddf6454e3fbdad6579_20001269%3A5; undefined=; connect.sid=s%3AeXmKsrfkcmNUVGMVdQ04ZaXFXT0znDBF.ChCxa6%2F9sHM4PQPmOy3r0lPFAKKvek6%2Bvy8lPMvK3t8; QINGCLOUDELB=84b10773c6746376c2c7ad1fac354ddfd562b81daa2a899c46d3a1e304c7eb2b|WCroE|WCroE; _ga=GA1.2.1510658653.1468588590; _gat=1; gr_session_id_aacd01fff9535e79=258f42eb-cd9f-4f17-97f0-9bd5bf9cc3b1; uname=jike_jddymx; uid=3514144; code=MPDZNL; authcode=0198Sfj%2FuyDWiApnRaJJxScBWwrupZo9w0oQkE1yBaN9D8tnIT4RmcJf6%2BpbYw4Ba711cMARquMX1reWaTrFa3JLAaqFDZRba3CZszzbnK9P2MgT8ua1Fq4T213pC0k; avatar=http%3A%2F%2Fassets.jikexueyuan.com%2Fuser%2Favtar%2Fdefault.gif; ca_status=0; vip_status=1; level_id=3; is_expire=0; domain=0Jjgkqkqq',
}
# helper: download one (url, fpath) pair, retrying until the server answers OK
def download((url, fpath), headers={}):
    fname = os.path.split(fpath)[-1]
    print 'start downloading %s ...' % fname
    with open(fpath, 'wb') as f:
        while 1:
            resp = requests.get(url, stream=True, headers=headers); time.sleep(1.0)
            if resp.ok: break
            print resp.status_code
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
    print 'download finished: %s' % fpath
def gethtml(url, delay=0.01):
    # fetch a page with the logged-in headers, retrying on network errors
    while 1:
        try:
            html = requests.get(url, headers=hea).content
            time.sleep(delay) # avoid Error 104 (reset by peer)
            break
        except Exception: pass
    return html
def geturls_course(course_url='http://www.jikexueyuan.com/course/1287.html', folder='videos', idx=None):
    # collect a (video_url, fpath) pair for every lesson of one course, then download them
    html = gethtml(course_url)
    sel = etree.HTML(html)
    course_name = sel.xpath('//span[@class="tit"]/text()')[0].strip().encode('utf-8')
    if idx is not None:
        course_name = '%d-%s' % (idx, course_name)
    video_folder = os.path.join(folder, course_name)
    if not os.path.exists(video_folder):
        os.makedirs(video_folder)
    print 'getting course "%s"......' % course_name
    lessons = sel.xpath('//div[@class="lesson-box"]/ul/li/div/h2/a')
    video_urls_list = [] # contains (url, fpath) tuples
    for i, les in enumerate(lessons, 1):
        les_name = les.xpath('string(.)').strip().encode('utf-8')
        sys.stdout.write('\r %d-%s...' % (i, les_name)); sys.stdout.flush()
        les_url = les.xpath('@href')[0]
        les_html = gethtml(les_url)
        video_url = etree.HTML(les_html).xpath('//source/@src')[0]
        fpath = os.path.join(video_folder, '%d.%s.mp4' % (i, les_name))
        video_urls_list.append((str(video_url), str(fpath)))
    print ''
    print 'All video urls in course "%s" got!' % course_name
    # nested helper: get the course's zip file
    def download_course_zip(courseid):
        json_url = 'http://www.jikexueyuan.com/course/downloadRes?course_id=%d' % courseid
        import json
        jsdict = json.loads(gethtml(json_url))
        if 'url' not in jsdict['data']:
            return # some courses don't have a zip file to download
        zip_url = jsdict['data']['url']
        fpath = os.path.join(video_folder, '%s.zip' % course_name)
        download((zip_url, fpath), headers=hea)
    #~ courseid = int(course_url[:-5].split('/')[-1])
    #~ download_course_zip(courseid) # downloading the zip always gets a 403 error!
    download_videos(video_urls_list)
    return video_urls_list
def geturls_series(series_url='http://ke.jikexueyuan.com/xilie/116'):
    # walk every course on a series (xilie) page and download all of their videos
    html = requests.get(series_url).content; time.sleep(0.01)
    sel = etree.HTML(html)
    series_name = sel.xpath('//dd/h2/text()')[0].strip()
    print '===getting series "%s"===' % series_name
    folder = os.path.join('videos', 'series-' + series_name)
    courses = sel.xpath('//div[@class="lesson-item"]/a/@href')
    video_urls_list = [] # contains (url, fpath) tuples
    for i, course_url in enumerate(courses, 1):
        print i, course_url
        video_urls_list += geturls_course(course_url, folder=folder, idx=i)
        time.sleep(0.2)
    print '===all video urls in series "%s" got===' % series_name
    with open('%s-urls.pk' % series_name, 'wb') as f:
        pk.dump(video_urls_list, f) # keep a pickled copy of the url list
    return video_urls_list
def geturls_career_path(career_url='http://ke.jikexueyuan.com/zhiye/android/'):
    '''
    A career-path (zhiye) page is organised as units -> steps -> courses:
    course_url_xpath = '//section[@class="lesson-unit"]/table[@class="table lesson-step"]//a[@class="inner"]/@href'
    step_title_xpath = '//section[@class="lesson-unit"]/table[@class="table lesson-step"]//th/text()'
    unit_title_xpath = '//section[@class="lesson-unit"]//h3/text()'
    '''
    html = requests.get(career_url).content; time.sleep(0.01)
    sel = etree.HTML(html)
    career_path_name = 'career-path-' + sel.xpath('//h1[@class="caption"]/text()')[0].strip()
    units = sel.xpath('//section[@class="lesson-unit"]')
    video_urls_list = []
    for unit in units:
        unit_name = unit.xpath('header/h3/text()')[0].strip()
        print '===unit %s===' % unit_name
        steps = unit.xpath('table[@class="table lesson-step"]')
        for s, step in enumerate(steps, 1):
            step_name = '%d-%s' % (s, step.xpath('thead/tr/th/text()')[0].strip())
            print '===step %s===' % step_name
            for i, course_url in enumerate(step.xpath('.//a[@class="inner"]/@href'), 1):
                folder = os.path.join('videos', career_path_name, unit_name, step_name)
                print i, course_url
                video_urls_list += geturls_course(course_url, folder=folder, idx=i)
                time.sleep(0.2)
            print '===step %s finished===' % step_name
        print '===unit %s finished===' % unit_name
    with open('%s-urls.pk' % career_path_name, 'wb') as f:
        pk.dump(video_urls_list, f)
    return video_urls_list
def download_videos(video_urls_list):
    print 'downloading %d files in parallel...' % len(video_urls_list)
    from multiprocessing import Pool
    pool = Pool(processes=4)
    pool.map(download, video_urls_list)
    pool.close()
    pool.join()
    print 'all downloading finished !'
if __name__ == '__main__':
    #~ geturls_series('http://ke.jikexueyuan.com/xilie/108')
    video_urls_list = geturls_career_path('http://ke.jikexueyuan.com/zhiye/web/')
    download_videos(video_urls_list)
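For reference, a minimal usage sketch of the other two entry points (the URLs are the ones already used above; geturls_course downloads the videos it collects as a side effect, so no extra download_videos call is needed):

# grab a single course
geturls_course('http://www.jikexueyuan.com/course/1287.html')
# grab every course of a series
geturls_series('http://ke.jikexueyuan.com/xilie/108')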