Last active
August 29, 2015 14:19
-
-
Save axiaoxin/178a9d499c39b1c348c9 to your computer and use it in GitHub Desktop.
download_teacher_files.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # !/usr/bin/env python | |
| # -*- coding:utf-8 -*- | |
| import urllib2 | |
| import cookielib | |
| import re | |
| import urllib | |
| import os | |
| import Queue | |
| import threading | |
| import time | |
# Shared crawl results: folder tuples and file tuples collected from the
# pages fetched further down in this script.
dir_urls_set = set()
file_urls_list = []

# One cookie-aware opener for the whole script, also installed as the
# urllib2 default so plain urllib2.urlopen() calls share the same cookie jar.
jar = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
def get_id_and_name_list():
    """Fetch the teacher list page and return an iterator of regex matches.

    For each match, group 1 is the text captured after ``ID=`` and group 2
    is the link text that follows it (used as the teacher name by callers).
    """
    listing_html = urllib2.urlopen('http://wlcc.cuit.edu.cn/ShJsMd.asp').read()
    # Extract every teacher ID together with the text of its link.
    entry_pattern = re.compile(r'ID=(.*?)>(.*?)<')
    return entry_pattern.finditer(listing_html)
def get_id_by_name(name):
    """Return the ID of the teacher whose link text on the list page is *name*.

    Args:
        name: teacher name exactly as it appears on the list page.

    Returns:
        The text following the last ``ID=`` before the matching name.

    Raises:
        ValueError: if no entry for *name* is found on the page.
    """
    mainpage_url = 'http://wlcc.cuit.edu.cn/ShJsMd.asp'
    mainpage_content = urllib2.urlopen(mainpage_url).read()
    # The name comes from user input: escape it so regex metacharacters are
    # matched literally instead of altering the pattern.
    pattern = re.compile(r'ID=(.*?)>(%s)<' % re.escape(name))
    match = pattern.search(mainpage_content)
    if match is None:
        raise ValueError('teacher not found: %s' % name)
    # The lazy group can still begin at an earlier "ID=", so keep only the
    # text after the last one.
    return match.group(1).split("ID=")[-1]
def login(id, pwd):
    """Log in as teacher *id* with password *pwd* and return the response page.

    The request goes through the shared cookie-aware ``opener``. When the
    response contains ``'alert'`` a ``password error`` message is printed;
    the page is returned either way.

    Args:
        id: teacher ID sent as ``txtId``.
        pwd: password sent as ``txtMM``.

    Returns:
        The raw body of the login response.
    """
    # URL-encode both values so characters such as '&', '#', '%' or spaces
    # in the password cannot corrupt the query string.
    query = urllib.urlencode([('txtId', id), ('txtMM', pwd)])
    url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp?%s' % query
    request = urllib2.Request(url)
    page = opener.open(request).read()
    if 'alert' in page:
        print('password error')
    return page
# Collect the folder links found in a page.
def get_dir_urls(page):
    """Return all folder links in *page* as a list of tuples.

    Each tuple holds the pattern's groups in order:
    ``(dir_url, id, dir_path, dir_name)``. The list is empty when the page
    contains no folder link.
    """
    folder_link = re.compile(r'(?P<dir_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&dir=%5C(?P<dir_path>.+))>(?P<dir_name>.+)</a> ?')
    # findall already yields [] when nothing matches.
    return folder_link.findall(page)
# Collect the file download links found in a page.
def add_file_urls(page):
    """Append every file download link in *page* to ``file_urls_list``.

    Each appended tuple holds the pattern's groups in order:
    ``(file_url, id, file_name, file_type)``.
    """
    # The dot in "fileman.asp" is escaped so it matches a literal dot only,
    # consistent with the folder-link pattern in get_dir_urls.
    pattern = re.compile(r"(?P<file_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&a=dl&f=%5C(?P<file_name>.+%2E(?P<file_type>.+?)))>")
    file_urls_list.extend(pattern.findall(page))
def download(file_url, save_path):
    """Download one file into ``<save_path>/<teacher name>/``.

    Args:
        file_url: tuple as produced by add_file_urls; index 0 is the relative
            URL, index 1 the teacher ID, index 2 the URL-quoted file path.
        save_path: base directory; a sub-directory named after the teacher
            (empty if the ID is unknown) is created beneath it.

    Raises:
        OSError: if the target directory cannot be created.
    """
    # Resolve the teacher name from the ID; the first matching pair wins.
    teacher_name = ''
    for name, teacher_id in zip(names, ids):
        if teacher_id == file_url[1]:
            teacher_name = name
            break

    target_dir = os.path.join(save_path, teacher_name)
    try:
        os.makedirs(target_dir)
    except OSError:
        # Several download threads may target the same directory; losing the
        # creation race is fine as long as the directory now exists.
        if not os.path.isdir(target_dir):
            raise

    response = opener.open('http://wlcc.cuit.edu.cn/Share/%s' % (file_url[0]))
    try:
        data = response.read()
    finally:
        response.close()

    # The quoted path uses backslashes; keep only the final component.
    file_name = urllib.unquote(file_url[2]).split('\\')[-1]
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(data)
    print('%s:%s is downloaded!' % (threading.currentThread().getName(), file_name))
def download_files(file_url, file_type='', file_name='', save_path='.'):
    """Download *file_url* if it passes the given filter.

    With only *file_name* set the file's base name must equal it; with only
    *file_type* set the file's extension must equal it; with neither set the
    file is always downloaded. When both are set nothing is downloaded.
    """
    if file_name and file_type:
        # No filter combination is defined for both at once.
        return
    if file_name:
        # Filter by file name.
        wanted = urllib.unquote(file_url[2]).split('\\')[-1] == file_name
    elif file_type:
        # Filter by file extension.
        wanted = file_url[3] == file_type
    else:
        # No filter: download everything.
        wanted = True
    if wanted:
        download(file_url, save_path)
# Teacher names, whitespace separated. Leave empty to test the password
# against every teacher on the list.
names = raw_input('teacher:').split()
pwd = raw_input('password:')
# Teacher IDs and the pages returned by their logins, filled in below.
ids, pages = [], []
# Serialises updates to names/ids/pages so that one teacher's three entries
# always land at the same index, even when several check threads succeed at
# the same time (the lists are zipped together later).
_check_lock = threading.Lock()


# Verify a password.
def do_check(name, id, pwd):
    """Try *pwd* for teacher *id* and record the teacher on success.

    On success *name*, *id* and the returned page are appended to the
    module-level ``names``, ``ids`` and ``pages`` lists.

    Args:
        name: teacher name, stored alongside the ID.
        id: teacher ID posted as ``txtId``.
        pwd: password posted as ``txtMM``.
    """
    check_login_url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp'
    # Form-encode the body so special characters in the values survive.
    post_datas = urllib.urlencode([('txtId', id), ('txtMM', pwd)])
    request = urllib2.Request(url=check_login_url, data=post_datas)
    content = opener.open(request).read()
    # A wrong password produces a page containing an alert; without one the
    # login went through.
    if 'alert' not in content:
        with _check_lock:
            names.append(name)
            ids.append(id)
            pages.append(content)
# Thread pool for the password checks.
class CheckManager(object):
    """Queue one password check per teacher and run them on Check threads."""

    def __init__(self, id_and_name_list, pwd, thread_num = 5):
        self.pwd = pwd
        self.check_queue = Queue.Queue()
        self.threads = []
        # Fill the queue before any worker exists: workers start themselves
        # on construction and stop when they find the queue empty.
        self.__init_check_queue(id_and_name_list)
        self.__init_thread_pool(thread_num)

    def __init_check_queue(self, the_list):
        """Enqueue a (function, argument tuple) job for every regex match."""
        for match in the_list:
            job_args = (match.group(2), match.group(1), self.pwd)
            self.check_queue.put((do_check, job_args))

    def __init_thread_pool(self, thread_num):
        """Create *thread_num* self-starting workers and remember them."""
        for _ in range(thread_num):
            worker = Check(self.check_queue)
            self.threads.append(worker)

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for worker in self.threads:
            # Every worker was started in its constructor, so joining one
            # that has already finished simply returns at once.
            worker.join()
class Check(threading.Thread):
    """Worker thread that runs queued ``(function, args)`` jobs.

    The thread starts itself on construction and exits once the queue is
    empty.
    """

    def __init__(self, check_queue):
        threading.Thread.__init__(self)
        self.check_queue = check_queue
        self.start()

    def run(self):
        while True:
            try:
                # Non-blocking get: an empty queue raises Queue.Empty, which
                # is the signal that no work is left.
                check, args = self.check_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                check(*args)
            except Exception as e:
                # One failing check (e.g. a network error) must not stop
                # this worker from processing the remaining jobs.
                print('check failed: %s' % e)
            finally:
                self.check_queue.task_done()
print('=== login checking...')
if not names:
    # No names given: try the password against every teacher, in parallel.
    id_and_name_list = get_id_and_name_list()
    check_manager = CheckManager(id_and_name_list, pwd)
    check_manager.wait_all_complete()
else:
    for name in names:
        # Look up the ID that belongs to the entered name.
        id = get_id_by_name(name)
        # Log in to the courseware page and keep the returned content.
        page = login(id, pwd)
        ids.append(id)
        pages.append(page)
print('=== check over!')
# Build the folder list.
def init_dirs_list(page):
    """Add every folder reachable from *page* to ``dir_urls_set``.

    Folders are followed to any depth using a work list, so sub-folders
    discovered while crawling are themselves opened. Folders whose URL
    contains the GBK-encoded marker below (the student homework upload
    folder) are recorded but not descended into.

    Args:
        page: content of a teacher's landing page.
    """
    upload_marker = u'学生上传作业'.encode('gbk')
    pending = list(get_dir_urls(page))
    visited = set()
    while pending:
        dir_url = pending.pop()
        if dir_url in visited:
            # Already handled; also guards against link cycles.
            continue
        visited.add(dir_url)
        dir_urls_set.add(dir_url)
        if upload_marker in urllib.unquote(dir_url[0]):
            continue
        request = urllib2.Request('http://wlcc.cuit.edu.cn/share/%s' % dir_url[0])
        sub_page = opener.open(request).read()
        pending.extend(get_dir_urls(sub_page))
# Walk the folder list and collect the file links.
def init_files_list(page):
    """Collect file links from *page* and from every known folder.

    Files linked directly on *page* are added first, then the folder set is
    populated from *page*, and finally each folder in the set is fetched and
    scanned for file links.
    """
    add_file_urls(page)
    init_dirs_list(page)
    # Iterate over a snapshot so concurrent additions to the set cannot
    # disturb the loop.
    for folder in list(dir_urls_set):
        folder_url = 'http://wlcc.cuit.edu.cn/share/%s' % folder[0]
        folder_page = opener.open(urllib2.Request(folder_url)).read()
        add_file_urls(folder_page)
class InitFilesListManager(object):
    """Queue one file-listing job per page and run them on Init threads."""

    def __init__(self, pages, thread_num = 5):
        self.init_queue = Queue.Queue()
        self.threads = []
        # Jobs go in first; the workers start themselves on construction and
        # stop when they find the queue empty.
        self.__init_queue(pages)
        self.__init_thread_pool(thread_num)

    def __init_queue(self, pages):
        """Enqueue an (init_files_list, page) job for every page."""
        for landing_page in pages:
            job = (init_files_list, landing_page)
            self.init_queue.put(job)

    def __init_thread_pool(self, thread_num):
        """Create *thread_num* self-starting workers and remember them."""
        for _ in range(thread_num):
            worker = Init(self.init_queue)
            self.threads.append(worker)

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for worker in self.threads:
            # Every worker was started in its constructor, so joining one
            # that has already finished simply returns at once.
            worker.join()
class Init(threading.Thread):
    """Worker thread that runs queued ``(function, argument)`` jobs.

    The thread starts itself on construction and exits once the queue is
    empty.
    """

    def __init__(self, init_queue):
        threading.Thread.__init__(self)
        self.init_queue = init_queue
        self.start()

    def run(self):
        while True:
            try:
                # Non-blocking get: an empty queue raises Queue.Empty, which
                # is the signal that no work is left.
                do, arg = self.init_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                do(arg)
            except Exception as e:
                # Report the failure and keep going so the remaining pages
                # are still processed by this worker.
                print('init task failed: %s' % e)
            finally:
                self.init_queue.task_done()
def show_files():
    """Print the collected files and save the same listing to a text file.

    ``file_urls_list`` is de-duplicated and sorted in place. One line per
    file (``teacher : password : file name``) is printed and written to a
    report file in the current directory whose name includes the teacher
    names joined by ``-``. A final line prints the number of files.
    """
    global file_urls_list
    file_urls_list = sorted(set(file_urls_list))
    # Map each ID to its teacher name once instead of rescanning the lists
    # for every file. Later pairs overwrite earlier ones, so the last match
    # still wins as in a full scan.
    name_by_id = dict(zip(ids, names))
    # Join the names into a string that becomes part of the report file name.
    save_file_name = '-'.join([str(x) for x in names])
    report_path = u'./教师课件信息[%s].txt' % save_file_name.decode('gbk')
    # The with-block closes the report even if writing fails midway.
    with open(report_path, 'w') as file_infos:
        for entry in file_urls_list:
            # Unknown IDs get an empty name rather than the previous file's.
            teacher_name = name_by_id.get(entry[1], '')
            line = '%s : %s : %s' % (teacher_name, pwd, urllib.unquote(entry[2]).split('\\')[-1])
            print(line)
            file_infos.write(line + '\n')
    print(u'=== 共%d个课件' % len(file_urls_list))
# Download thread pool. Author's note: it felt about as fast as a single
# thread, and the reason is not yet understood.
class DownloadManager(object):
    """Queue download jobs and run them on a pool of Download threads."""

    # Defaults: ten worker threads; files are saved under ./downloads, where
    # download() adds a sub-folder per teacher name.
    def __init__(self, file_urls_list, file_types, file_names, thread_num = 10, save_path='./downloads'):
        self.download_queue = Queue.Queue()
        self.threads = []
        self.file_urls_list = file_urls_list
        self.file_types = file_types
        self.file_names = file_names
        self.save_path = save_path
        # Jobs go in first; the workers start themselves on construction and
        # stop when they find the queue empty.
        self.__init_download_queue(file_urls_list)
        self.__init_thread_pool(thread_num)

    def __init_download_queue(self, the_list):
        """Enqueue one download_files job per file and requested filter."""
        def enqueue(item, file_type, file_name):
            job_args = (item, file_type, file_name, self.save_path)
            self.download_queue.put((download_files, job_args))

        if self.file_types:
            for item in the_list:
                for wanted_type in self.file_types:
                    enqueue(item, wanted_type, '')
        if self.file_names:
            for item in the_list:
                for wanted_name in self.file_names:
                    enqueue(item, '', wanted_name)
        if not (self.file_names or self.file_types):
            # No filter at all: queue every file.
            for item in the_list:
                enqueue(item, '', '')

    def __init_thread_pool(self, thread_num):
        """Create *thread_num* self-starting workers and remember them."""
        for _ in range(thread_num):
            worker = Download(self.download_queue)
            self.threads.append(worker)

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for worker in self.threads:
            # Every worker was started in its constructor, so joining one
            # that has already finished simply returns at once.
            worker.join()
class Download(threading.Thread):
    """Worker thread that runs queued ``(function, args)`` download jobs.

    The thread starts itself on construction and exits once the queue is
    empty.
    """

    def __init__(self, download_queue):
        threading.Thread.__init__(self)
        self.download_queue = download_queue
        self.start()

    def run(self):
        while True:
            try:
                # Non-blocking get: an empty queue raises Queue.Empty, which
                # is the signal that no work is left.
                do, args = self.download_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                do(*args)
            except Exception as e:
                # Report the failed download and carry on with the rest of
                # the queue instead of silently ending this worker.
                print('download failed: %s' % e)
            finally:
                self.download_queue.task_done()
# Collect every courseware file of the selected teachers.
stime = time.time()
init_manager = InitFilesListManager(pages)
print('=== getting files list...')
print('=== need waiting...')
init_manager.wait_all_complete()
# List everything that can be downloaded.
show_files()
print('=== files get over! Use %f seconds' % (time.time() - stime))

# File names to download, whitespace separated.
file_names = raw_input('input download filenames:').split()
# Only when no name was given, ask for file types (whitespace separated).
file_types = '' if file_names else raw_input('input download filetypes:').split()

start = time.time()
print('=== downloading files...')
# With neither names nor types given, every file is downloaded.
download_manager = DownloadManager(file_urls_list, file_types, file_names)
download_manager.wait_all_complete()
print('=== OK! Use %f seconds!' % (time.time() - start))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment