Skip to content

Instantly share code, notes, and snippets.

@axiaoxin
Last active August 29, 2015 14:19
Show Gist options
  • Select an option

  • Save axiaoxin/178a9d499c39b1c348c9 to your computer and use it in GitHub Desktop.

Select an option

Save axiaoxin/178a9d499c39b1c348c9 to your computer and use it in GitHub Desktop.
download_teacher_files.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import cookielib
import re
import urllib
import os
import Queue
import threading
import time
# Build one cookie-aware opener and install it as urllib2's default, so that
# both opener.open() and urllib2.urlopen() share the same cookie jar
# (presumably this is what keeps the login session alive between requests).
jar = cookielib.CookieJar()
handler = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
# Directory-link tuples collected so far (filled from get_dir_urls results).
dir_urls_set = set()
# File-link tuples collected so far (extended by add_file_urls).
file_urls_list = []
def get_id_and_name_list():
    """Fetch the teacher index page and locate every ``ID=...>name<`` entry.

    Returns:
        An iterator of regex match objects; ``group(1)`` is the text captured
        after ``ID=`` (the teacher id) and ``group(2)`` is the text between
        the following ``>`` and ``<`` (the teacher name).
    """
    mainpage_url = 'http://wlcc.cuit.edu.cn/ShJsMd.asp'
    mainpage_content = urllib2.urlopen(mainpage_url).read()
    # Extract all teacher ids together with the link text that follows them.
    return re.finditer(r'ID=(.*?)>(.*?)<', mainpage_content)
def get_id_by_name(name):
    """Look up a teacher's id on the index page by the teacher's name.

    Args:
        name: Teacher name exactly as it appears in the page source.

    Returns:
        The id string found in the ``ID=`` link directly in front of the name.

    Raises:
        ValueError: If the name does not occur on the index page.
    """
    mainpage_url = 'http://wlcc.cuit.edu.cn/ShJsMd.asp'
    mainpage_content = urllib2.urlopen(mainpage_url).read()
    # Escape the name so regex metacharacters in it are matched literally.
    pattern = re.compile(r'ID=(.*?)>(%s)<' % re.escape(name))
    match = pattern.search(mainpage_content)
    if match is None:
        raise ValueError('teacher not found on index page: %r' % (name,))
    # The lazy group starts at the first "ID=" of the line and may therefore
    # span several links; the wanted id is whatever follows the last "ID=".
    return match.group(1).split("ID=")[-1]
def login(id, pwd):
    """Log in as teacher *id* with a GET request and return the response body.

    Args:
        id: Teacher id as taken from the index page (inserted unchanged).
        pwd: Password typed by the user; it is URL-quoted before use.

    Returns:
        The body of the login response. It is returned even when the login
        failed; in that case 'password error' is printed first.
    """
    # Quote the password so characters such as '&', '#' or spaces cannot
    # break the query string.
    url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp?txtId=%s&txtMM=%s' % (
        id, urllib.quote(pwd, safe=''))
    page = opener.open(urllib2.Request(url)).read()
    # A failed login is recognised by an 'alert' in the returned page.
    if 'alert' in page:
        print('password error')
    return page
# Get the addresses of the folders linked from a page
def get_dir_urls(page):
    """Return every folder link found in *page*.

    Args:
        page: HTML source of a file-manager page.

    Returns:
        A list of ``(dir_url, id, dir_path, dir_name)`` tuples, one per
        match, in the order of the regex groups; an empty list when the page
        contains no folder link.
    """
    pattern = re.compile(r'(?P<dir_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&dir=%5C(?P<dir_path>.+))>(?P<dir_name>.+)</a>&nbsp;?')
    # findall already yields [] when nothing matches, so no fallback is needed.
    return pattern.findall(page)
# Get the addresses of the files linked from a page
def add_file_urls(page):
    """Append every file-download link found in *page* to ``file_urls_list``.

    Each appended entry is a ``(file_url, id, file_name, file_type)`` tuple
    in the order of the regex groups. Nothing is returned.

    Args:
        page: HTML source of a file-manager page.
    """
    # The dot in "fileman.asp" is escaped, matching the pattern used for
    # folder links in get_dir_urls.
    pattern = re.compile(r"(?P<file_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&a=dl&f=%5C(?P<file_name>.+%2E(?P<file_type>.+?)))>")
    file_urls_list.extend(pattern.findall(page))
def download(file_url, save_path):
    """Download one file into ``<save_path>/<teacher name>/``.

    Args:
        file_url: A tuple as stored in ``file_urls_list``: index 0 is the
            relative download URL, index 1 the teacher id and index 2 the
            URL-encoded file path.
        save_path: Base directory; a sub-directory named after the teacher
            is created below it when missing.
    """
    # First non-empty teacher name whose id matches the file's id; '' if none.
    teacher_name = next(
        (name for name, teacher_id in zip(names, ids)
         if teacher_id == file_url[1] and name),
        '')
    save_path = '%s/%s' % (save_path, teacher_name)
    try:
        os.makedirs(save_path)
    except OSError:
        # Another download thread may have created the directory in the
        # meantime; only give up when it really does not exist.
        if not os.path.isdir(save_path):
            raise
    data = opener.open('http://wlcc.cuit.edu.cn/Share/%s' % (file_url[0])).read()
    # The encoded path uses backslashes; keep only the last component.
    file_name = urllib.unquote(file_url[2]).split('\\')[-1]
    with open('%s/%s' % (save_path, file_name), 'wb') as f:
        f.write(data)
    print('%s:%s is downloaded!' % (threading.currentThread().getName(), file_name))
def download_files(file_url, file_type='', file_name='', save_path='.'):
    """Download *file_url* if it passes the optional name or type filter.

    Args:
        file_url: A tuple as stored in ``file_urls_list`` (index 2 is the
            URL-encoded file path, index 3 the file type).
        file_type: When given alone, download only files of this type.
        file_name: When given alone, download only the file with this name.
        save_path: Base directory handed on to ``download``.

    With neither filter the file is always downloaded. Passing both filters
    at once downloads nothing.
    """
    if file_name and file_type:
        # No rule exists for combining both filters, so nothing is fetched.
        return
    if file_name:
        # Download by file name: compare against the last path component.
        base_name = urllib.unquote(file_url[2]).split('\\')[-1]
        if base_name != file_name:
            return
    elif file_type and file_url[3] != file_type:
        # Download by file type: skip files of any other type.
        return
    download(file_url, save_path)
# Ask for teacher names (whitespace-separated); when left empty, the password
# is checked against every teacher listed on the index page.
names = raw_input('teacher:').split()
pwd = raw_input('password:')
# Teacher ids and the pages returned after login, filled in further below.
ids = []
pages = []
# Verify the password
# Guards the three result lists so that one teacher's name, id and page are
# appended together even when several Check threads succeed at the same time.
_check_results_lock = threading.Lock()


def do_check(name, id, pwd):
    """Try *pwd* for one teacher and record the teacher if the login works.

    Args:
        name: Teacher name, stored in ``names`` on success.
        id: Teacher id, sent as ``txtId`` and stored in ``ids`` on success.
        pwd: Password to try; it is URL-quoted before being sent.

    On success the response body is appended to ``pages``. Nothing is
    returned.
    """
    check_login_url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp'
    # Quote the password so '&', '=' or '%' in it cannot corrupt the form body.
    post_datas = "txtId=%s&txtMM=%s" % (id, urllib.quote(pwd, safe=''))
    request = urllib2.Request(url=check_login_url, data=post_datas)
    content = opener.open(request).read()
    # A wrong password yields a page containing an alert; without one the
    # login succeeded and the response is the teacher's page.
    if 'alert' not in content:
        with _check_results_lock:
            names.append(name)
            ids.append(id)
            pages.append(content)
# Thread pool
class CheckManager(object):
    """Queue one password check per teacher and run them on worker threads."""

    def __init__(self, id_and_name_list, pwd, thread_num=5):
        """Fill the work queue, then start *thread_num* Check workers.

        Args:
            id_and_name_list: Iterable of regex matches whose ``group(1)`` is
                a teacher id and ``group(2)`` a teacher name.
            pwd: Password to try for every teacher.
            thread_num: Number of worker threads to start.
        """
        self.pwd = pwd
        self.check_queue = Queue.Queue()
        self.threads = []
        # Put all work items on the queue first ...
        self.__init_check_queue(id_and_name_list)
        # ... then start the requested number of workers.
        self.__init_thread_pool(thread_num)

    def __init_check_queue(self, the_list):
        """Enqueue a ``(do_check, (name, id, pwd))`` item for every match."""
        for item in the_list:
            # Store the handler with its argument tuple so that a Check
            # thread can call it later.
            self.check_queue.put((do_check, (item.group(2), item.group(1), self.pwd)))

    def __init_thread_pool(self, thread_num):
        """Create the workers; each one starts itself on construction."""
        for _ in range(thread_num):
            self.threads.append(Check(self.check_queue))

    # Wait until every worker has finished
    def wait_all_complete(self):
        """Block until all worker threads have terminated."""
        for t in self.threads:
            # join() returns immediately for a thread that already finished,
            # so no liveness test is needed beforehand.
            t.join()
class Check(threading.Thread):
    """Worker thread that runs queued ``(function, args)`` items until the
    queue is empty."""

    def __init__(self, check_queue):
        """Remember the queue and start the thread immediately."""
        threading.Thread.__init__(self)
        self.check_queue = check_queue
        self.start()

    def run(self):
        """Process queue items one by one; exit once the queue is empty."""
        while True:
            try:
                # Take the function object and its per-item arguments.
                # block=False makes get() raise Queue.Empty instead of
                # waiting when no item is left.
                check, args = self.check_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                check(*args)
            except Exception as e:
                # One failing item must not stop this worker; report it and
                # carry on with the next item.
                print('%s: check failed: %r' % (self.name, e))
            finally:
                self.check_queue.task_done()
print('=== login checking...')
if names:
    # Names were given: log in as each of them with the entered password.
    for name in names:
        # Look up the id that belongs to the entered name.
        id = get_id_by_name(name)
        # Log in to the courseware page; the page content is returned.
        page = login(id, pwd)
        ids.append(id)
        pages.append(page)
else:
    # No names given: try the password against every teacher on the index
    # page; do_check fills names, ids and pages for each successful login.
    id_and_name_list = get_id_and_name_list()
    check_manager = CheckManager(id_and_name_list, pwd)
    check_manager.wait_all_complete()
print('=== check over!')
# Initialise the set of folder addresses
def init_dirs_list(page):
    """Add the folder links of *page*, and of the folders known so far, to
    ``dir_urls_set``.

    Args:
        page: HTML source of a teacher's file-manager page.

    NOTE(review): folders first discovered while fetching a sub-page are
    added to the set but are not fetched themselves by this call, so deeper
    nesting levels may be missed -- confirm whether that is intended.
    """
    dir_urls_set.update(get_dir_urls(page))
    # Skip folders whose decoded URL contains this GBK-encoded marker.
    skip_marker = u'学生上传作业'.encode('gbk')
    # Iterate over a snapshot because the set grows inside the loop.
    for dir_url in list(dir_urls_set):
        if skip_marker in urllib.unquote(dir_url[0]):
            continue
        request = urllib2.Request('http://wlcc.cuit.edu.cn/share/%s' % dir_url[0])
        sub_page = opener.open(request).read()
        dir_urls_set.update(get_dir_urls(sub_page))
# Walk the folder addresses to fill the list of file addresses
def init_files_list(page):
    """Collect the file links of *page* and of every folder in
    ``dir_urls_set``.

    Args:
        page: HTML source of a teacher's file-manager page.

    The links are appended to ``file_urls_list`` through ``add_file_urls``.
    Nothing is returned.
    """
    add_file_urls(page)
    init_dirs_list(page)
    # Iterate over a snapshot of the folder set and fetch each folder page.
    for dir_url in list(dir_urls_set):
        request = urllib2.Request('http://wlcc.cuit.edu.cn/share/%s' % dir_url[0])
        add_file_urls(opener.open(request).read())
class InitFilesListManager(object):
    """Queue one file-listing job per page and run them on worker threads."""

    def __init__(self, pages, thread_num=5):
        """Fill the work queue, then start *thread_num* Init workers.

        Args:
            pages: Iterable of page sources, one job is queued for each.
            thread_num: Number of worker threads to start.
        """
        self.init_queue = Queue.Queue()
        self.threads = []
        self.__init_queue(pages)
        self.__init_thread_pool(thread_num)

    def __init_queue(self, pages):
        """Enqueue an ``(init_files_list, page)`` item for every page."""
        for page in pages:
            self.init_queue.put((init_files_list, page))

    def __init_thread_pool(self, thread_num):
        """Create the workers; each one starts itself on construction."""
        for _ in range(thread_num):
            self.threads.append(Init(self.init_queue))

    def wait_all_complete(self):
        """Block until all worker threads have terminated."""
        for t in self.threads:
            # join() returns immediately for a thread that already finished.
            t.join()
class Init(threading.Thread):
    """Worker thread that runs queued ``(function, arg)`` items until the
    queue is empty."""

    def __init__(self, init_queue):
        """Remember the queue and start the thread immediately."""
        threading.Thread.__init__(self)
        self.init_queue = init_queue
        self.start()

    def run(self):
        """Process queue items one by one; exit once the queue is empty."""
        while True:
            try:
                # block=False makes get() raise Queue.Empty instead of
                # waiting when no item is left.
                do, arg = self.init_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                do(arg)
            except Exception as e:
                # One failing page must not stop this worker; report it and
                # carry on with the next item.
                print('%s: listing failed: %r' % (self.name, e))
            finally:
                self.init_queue.task_done()
def show_files():
    """Print all collected files and save the same listing to a text file.

    ``file_urls_list`` is replaced by its sorted, de-duplicated version.
    Every line has the form ``<teacher name> : <password> : <file name>``;
    the teacher name is empty when the file's id matches no known teacher.
    """
    global file_urls_list
    file_urls_list = sorted(set(file_urls_list))
    # id -> name lookup; as with a full scan of the pairs, the last name
    # registered for an id wins.
    name_by_id = dict(zip(ids, names))
    # Join the name list into one string used in the report's file name.
    save_file_name = '-'.join([str(x) for x in names])
    # Save the listing to a file.
    info_path = u'./教师课件信息[%s].txt' % save_file_name.decode('gbk')
    with open(info_path, 'w') as file_infos:
        for entry in file_urls_list:
            # Look the name up per file so that an unmatched id does not
            # inherit the name found for the previous file.
            teacher_name = name_by_id.get(entry[1], '')
            line = '%s : %s : %s' % (
                teacher_name, pwd, urllib.unquote(entry[2]).split('\\')[-1])
            print(line)
            file_infos.write(line + '\n')
    print(u'=== 共%d个课件' % len(file_urls_list))
# Download thread pool. It feels about as fast as a single thread; the
# reason is not yet understood.
class DownloadManager(object):
    """Queue download jobs for the collected files and run them on worker
    threads."""

    # Ten threads are started by default; files are saved below save_path in
    # a folder named after the teacher.
    def __init__(self, file_urls_list, file_types, file_names, thread_num=10, save_path='./downloads'):
        """Fill the download queue, then start *thread_num* Download workers.

        Args:
            file_urls_list: File tuples to consider for download.
            file_types: File types to download; may be empty.
            file_names: File names to download; may be empty.
            thread_num: Number of worker threads to start.
            save_path: Base directory handed on to every job.
        """
        self.download_queue = Queue.Queue()
        self.threads = []
        self.file_urls_list = file_urls_list
        self.file_types = file_types
        self.file_names = file_names
        self.save_path = save_path
        self.__init_download_queue(file_urls_list)
        self.__init_thread_pool(thread_num)

    def __init_download_queue(self, the_list):
        """Enqueue ``(download_files, args)`` items according to the filters.

        One item is queued per (file, type) pair and per (file, name) pair;
        with neither filter, one unfiltered item is queued per file.
        """
        if self.file_types:
            for item in the_list:
                for file_type in self.file_types:
                    self.download_queue.put((download_files, (item, file_type, '', self.save_path)))
        if self.file_names:
            for item in the_list:
                for file_name in self.file_names:
                    self.download_queue.put((download_files, (item, '', file_name, self.save_path)))
        if not self.file_names and not self.file_types:
            for item in the_list:
                self.download_queue.put((download_files, (item, '', '', self.save_path)))

    def __init_thread_pool(self, thread_num):
        """Create the workers; each one starts itself on construction."""
        for _ in range(thread_num):
            self.threads.append(Download(self.download_queue))

    def wait_all_complete(self):
        """Block until all worker threads have terminated."""
        for t in self.threads:
            # join() returns immediately for a thread that already finished.
            t.join()
class Download(threading.Thread):
    """Worker thread that runs queued ``(function, args)`` items until the
    queue is empty."""

    def __init__(self, download_queue):
        """Remember the queue and start the thread immediately."""
        threading.Thread.__init__(self)
        self.download_queue = download_queue
        self.start()

    def run(self):
        """Process queue items one by one; exit once the queue is empty."""
        while True:
            try:
                # block=False makes get() raise Queue.Empty instead of
                # waiting when no item is left.
                do, args = self.download_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                do(*args)
            except Exception as e:
                # One failing download must not stop this worker; report it
                # and carry on with the next item.
                print('%s: download failed: %r' % (self.name, e))
            finally:
                self.download_queue.task_done()
# Collect all courseware files of the selected teachers
stime = time.time()
init_manager = InitFilesListManager(pages)
print('=== getting files list...')
print('=== need waiting...')
init_manager.wait_all_complete()
# Show every downloadable courseware file
show_files()
print('=== files get over! Use %f seconds' % (time.time() - stime))
# Ask for the file names to download; separate several names with spaces
file_names = raw_input('input download filenames:').split()
file_types = ''
if not file_names:
    # Ask for the file types to download; separate several types with spaces
    file_types = raw_input('input download filetypes:').split()
start = time.time()
print('=== downloading files...')
# When both names and types are empty, every courseware file is downloaded
download_manager = DownloadManager(file_urls_list, file_types, file_names)
download_manager.wait_all_complete()
print('=== OK! Use %f seconds!' % (time.time() - start))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment