axiaoxin · August 29, 2015 14:19
diff --git a/download_teacher_files.py b/download_teacher_files.py
 # !/usr/bin/env python  
 # -*- coding:utf-8 -*-

 import urllib2
 import cookielib
 import re
 import urllib
 import os
 import Queue
 import threading
 import time


 # Route every request through one opener with a cookie jar, so cookies
 # received in a response are sent back on later requests.
 jar = cookielib.CookieJar()
 handler = urllib2.HTTPCookieProcessor(jar)
 opener = urllib2.build_opener(handler)
 urllib2.install_opener(opener)

 # Module-wide accumulators: folder link tuples and file link tuples found
 # while crawling.
 dir_urls_set = set()
 file_urls_list = []

 def get_id_and_name_list():
    """Download the teacher index page and scan it for ``ID=...>name<`` links.

    Returns:
        An iterator of regex match objects: ``group(1)`` is the text after
        ``ID=`` and ``group(2)`` the text between ``>`` and the next ``<``.
    """
    index_html = urllib2.urlopen('http://wlcc.cuit.edu.cn/ShJsMd.asp').read()
    # Extract every teacher ID together with the link text that follows it.
    link_re = re.compile(r'ID=(.*?)>(.*?)<')
    return link_re.finditer(index_html)

 def get_id_by_name(name):
    """Look up the ID of the teacher called *name* on the index page.

    Args:
        name: Teacher name exactly as it appears as link text on the page.

    Returns:
        The ID string found in front of the matching name.

    Raises:
        ValueError: If the page contains no link with that name.
    """
    mainpage_url = 'http://wlcc.cuit.edu.cn/ShJsMd.asp'
    mainpage_content = urllib2.urlopen(mainpage_url).read()
    # Escape the name so regex metacharacters typed by the user match literally.
    pattern = re.compile(r'ID=(.*?)>(%s)<' % re.escape(name))
    match = pattern.search(mainpage_content)
    if match is None:
        raise ValueError('teacher not found on index page: %r' % (name,))
    # The lazy group starts at the first "ID=" on the line and may span
    # earlier links, so keep only what follows the last "ID=".
    return match.group(1).split("ID=")[-1]

 def login(id, pwd):
    """Request the login-check page for one teacher and return its body.

    Args:
        id: Teacher ID sent as the ``txtId`` query parameter.
        pwd: Password sent as the ``txtMM`` query parameter.

    Returns:
        The response body. It is returned even when the login failed; in that
        case 'password error' is printed first.
    """
    # URL-encode the values so characters such as '&', '#' or spaces in the
    # password cannot corrupt the query string.
    query = urllib.urlencode([('txtId', id), ('txtMM', pwd)])
    url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp?' + query
    page = opener.open(urllib2.Request(url)).read()
    # The script treats a response containing "alert" as a rejected login.
    if 'alert' in page:
        print('password error')
    return page

 # Collect the folder links found in a page.
 def get_dir_urls(page):
    """Return every folder link in *page*.

    Args:
        page: HTML text of a file-manager page.

    Returns:
        A list of ``(dir_url, id, dir_path, dir_name)`` tuples, one per
        match; an empty list when the page has no folder links.
    """
    folder_link = re.compile(r'(?P<dir_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&dir=%5C(?P<dir_path>.+))>(?P<dir_name>.+)</a>&nbsp;?')
    # findall already yields [] when nothing matches.
    return folder_link.findall(page)
    
 # Collect the file download links found in a page.
 def add_file_urls(page):
    """Append every file download link in *page* to ``file_urls_list``.

    Each appended entry is a ``(file_url, id, file_name, file_type)`` tuple;
    ``file_name`` is still URL-encoded.

    Args:
        page: HTML text of a file-manager page.
    """
    # The dot in "fileman.asp" is escaped so it only matches a literal dot,
    # in line with the folder-link pattern used by get_dir_urls.
    pattern = re.compile(r"(?P<file_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&a=dl&f=%5C(?P<file_name>.+%2E(?P<file_type>.+?)))>")
    file_urls_list.extend(pattern.findall(page))

 def download(file_url, save_path):
    """Download one file into ``<save_path>/<teacher name>/``.

    Args:
        file_url: A ``(file_url, id, file_name, file_type)`` tuple as stored
            in ``file_urls_list``.
        save_path: Base directory; a sub-directory named after the teacher
            whose ID matches ``file_url[1]`` is created below it.
    """
    # First teacher whose ID matches owns the file; '' when none matches.
    teacher_name = ''
    for name, teacher_id in zip(names, ids):
        if teacher_id == file_url[1]:
            teacher_name = name
            break
    save_path = '%s/%s' % (save_path, teacher_name)
    # Several download threads may create the same directory at the same
    # time, so tolerate "already exists" instead of checking beforehand.
    try:
        os.makedirs(save_path)
    except OSError:
        if not os.path.isdir(save_path):
            raise
    response = opener.open('http://wlcc.cuit.edu.cn/Share/%s' % (file_url[0]))
    try:
        data = response.read()
    finally:
        response.close()
    # The encoded name is a backslash-separated path; keep the last part.
    file_name = urllib.unquote(file_url[2]).split('\\')[-1]
    with open('%s/%s' % (save_path, file_name), 'wb') as f:
        f.write(data)
    print('%s:%s is downloaded!' % (threading.currentThread().getName(), file_name))

 def download_files(file_url, file_type='', file_name='', save_path='.'):
    """Download *file_url* when it passes the requested filter.

    Args:
        file_url: A ``(file_url, id, file_name, file_type)`` tuple.
        file_type: When given alone, download only if ``file_url[3]`` equals it.
        file_name: When given alone, download only if the decoded base name
            of ``file_url[2]`` equals it.
        save_path: Base directory handed on to ``download``.

    With neither filter the file is always downloaded; with both filters
    nothing is downloaded.
    """
    if file_name and file_type:
        return
    if file_name:
        # Filter by file name.
        wanted = urllib.unquote(file_url[2]).split('\\')[-1] == file_name
    elif file_type:
        # Filter by file type.
        wanted = file_url[3] == file_type
    else:
        # No filter: download everything.
        wanted = True
    if wanted:
        download(file_url, save_path)

 # Ask for teacher names (whitespace separated); when none are entered, every
 # teacher on the index page is tried against the given password.
 names = raw_input('teacher:').split()
 pwd = raw_input('password:')
 # Filled alongside names: the teacher IDs and the pages returned by login.
 ids = []
 pages = []


 # Serialises updates to names/ids/pages so entries added by concurrent
 # do_check calls stay at matching indexes in the three lists.
 _check_results_lock = threading.Lock()


 # Verify a password.
 def do_check(name, id, pwd):
    """POST one login attempt and record the teacher when it succeeds.

    Args:
        name: Teacher name, appended to ``names`` on success.
        id: Teacher ID sent as ``txtId``, appended to ``ids`` on success.
        pwd: Password sent as ``txtMM``.
    """
    check_login_url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp'
    # URL-encode the form values so special characters survive the POST.
    post_datas = urllib.urlencode([('txtId', id), ('txtMM', pwd)])
    request = urllib2.Request(url=check_login_url, data=post_datas)
    content = opener.open(request).read()
    # A wrong password produces a page with an alert; no alert means the
    # login went through and the page was entered.
    if 'alert' not in content:
        with _check_results_lock:
            names.append(name)
            ids.append(id)
            pages.append(content)
        
 # Thread pool for the password checks.
 class CheckManager(object):
    """Queue one ``do_check`` task per teacher and run them on worker threads."""

    def __init__(self, id_and_name_list, pwd, thread_num=5):
        """Fill the task queue, then start *thread_num* Check workers.

        Args:
            id_and_name_list: Iterable of match objects whose ``group(1)``
                is a teacher ID and ``group(2)`` the teacher name.
            pwd: Password tried for every teacher.
            thread_num: Number of worker threads to start.
        """
        self.pwd = pwd
        self.check_queue = Queue.Queue()
        self.threads = []
        # The queue must be populated before the workers start draining it.
        self.__init_check_queue(id_and_name_list)
        self.__init_thread_pool(thread_num)

    def __init_check_queue(self, the_list):
        """Put a ``(do_check, (name, id, pwd))`` task on the queue per match."""
        for match in the_list:
            task = (do_check, (match.group(2), match.group(1), self.pwd))
            self.check_queue.put(task)

    def __init_thread_pool(self, thread_num):
        """Create the workers; each Check thread starts itself."""
        for _ in range(thread_num):
            worker = Check(self.check_queue)
            self.threads.append(worker)

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for worker in self.threads:
            if not worker.isAlive():
                continue
            worker.join()

 class Check(threading.Thread):
    """Worker thread that runs queued ``(function, args)`` tasks.

    The thread starts itself on construction and exits once the queue is
    empty.
    """

    def __init__(self, check_queue):
        threading.Thread.__init__(self)
        self.check_queue = check_queue
        self.start()

    def run(self):
        while True:
            try:
                # Non-blocking get: raises Queue.Empty once every task has
                # been taken, which is the signal to stop.
                check, args = self.check_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                check(*args)
            except Exception as e:
                # One failing task must not end the worker and strand the
                # tasks still waiting in the queue.
                print('%s: task failed: %r' % (self.name, e))
            finally:
                self.check_queue.task_done()


 # Log in for each named teacher, or (when no name was entered) try the
 # password against every teacher on the index page using worker threads.
 print '=== login checking...'
 if names:
    for name in names:
        # Look up the ID that belongs to the entered name.
        id = get_id_by_name(name)
        # Log in to the courseware page; login() returns the page content.
        # NOTE(review): the page is recorded even when login() printed
        # 'password error' - confirm that is intended.
        page = login(id, pwd)
        ids.append(id)
        pages.append(page)
 else:
    id_and_name_list = get_id_and_name_list()
    check_manager = CheckManager(id_and_name_list, pwd)
    check_manager.wait_all_complete()
 print '=== check over!'

 # Build the list of folder URLs.
 def init_dirs_list(page):
    """Add every folder reachable from *page* to ``dir_urls_set``.

    Sub-folders are followed until no new folder URL turns up, so the result
    no longer depends on what other calls had already put into the set.
    Folders whose decoded URL contains the GBK-encoded student-upload marker
    are recorded but never opened.

    Args:
        page: HTML text of a teacher's top-level file-manager page.
    """
    skipped_marker = u'学生上传作业'.encode('gbk')
    found = get_dir_urls(page)
    dir_urls_set.update(found)
    pending = list(found)
    # Keyed on the URL alone so a folder is fetched only once even when it
    # is linked under different captions.
    opened = set()
    while pending:
        dir_url = pending.pop()
        if dir_url[0] in opened:
            continue
        opened.add(dir_url[0])
        if skipped_marker in urllib.unquote(dir_url[0]):
            continue
        request = urllib2.Request('http://wlcc.cuit.edu.cn/share/%s' % dir_url[0])
        found = get_dir_urls(opener.open(request).read())
        dir_urls_set.update(found)
        pending.extend(found)

 # Walk the folder URL list and build the list of file URLs.
 def init_files_list(page):
    """Record the files on *page* and in every known folder.

    Files linked on *page* are added first, then the folder set is filled
    from *page*, and finally each folder in ``dir_urls_set`` is fetched and
    its file links are added as well.

    Args:
        page: HTML text of a teacher's top-level file-manager page.
    """
    add_file_urls(page)
    init_dirs_list(page)

    # Iterate over a copy of the set rather than the set itself.
    for folder in list(dir_urls_set):
        folder_page_url = 'http://wlcc.cuit.edu.cn/share/%s' % folder[0]
        listing = opener.open(urllib2.Request(folder_page_url)).read()
        add_file_urls(listing)

 class InitFilesListManager(object):
    """Queue one ``init_files_list`` task per page and run them on threads."""

    def __init__(self, pages, thread_num=5):
        """Fill the task queue, then start *thread_num* Init workers.

        Args:
            pages: Iterable of page contents, one per logged-in teacher.
            thread_num: Number of worker threads to start.
        """
        self.init_queue = Queue.Queue()
        self.threads = []
        # The queue must be populated before the workers start draining it.
        self.__init_queue(pages)
        self.__init_thread_pool(thread_num)

    def __init_queue(self, pages):
        """Put an ``(init_files_list, page)`` task on the queue per page."""
        for content in pages:
            task = (init_files_list, content)
            self.init_queue.put(task)

    def __init_thread_pool(self, thread_num):
        """Create the workers; each Init thread starts itself."""
        for _ in range(thread_num):
            worker = Init(self.init_queue)
            self.threads.append(worker)

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for worker in self.threads:
            if not worker.isAlive():
                continue
            worker.join()


 class Init(threading.Thread):
    """Worker thread that runs queued ``(function, argument)`` tasks.

    The thread starts itself on construction and exits once the queue is
    empty.
    """

    def __init__(self, init_queue):
        threading.Thread.__init__(self)
        self.init_queue = init_queue
        self.start()

    def run(self):
        while True:
            try:
                # Non-blocking get: raises Queue.Empty once every task has
                # been taken, which is the signal to stop.
                do, arg = self.init_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                do(arg)
            except Exception as e:
                # One failing task must not end the worker and strand the
                # tasks still waiting in the queue.
                print('%s: task failed: %r' % (self.name, e))
            finally:
                self.init_queue.task_done()



 def show_files():
    """Print the discovered files and save the same listing to a text file.

    ``file_urls_list`` is replaced by its sorted, de-duplicated form. Each
    output line is ``<teacher> : <password> : <file name>``. The report file
    is named after the teacher names joined with '-'.
    """
    import sys

    global file_urls_list
    file_urls_list = sorted(set(file_urls_list))
    # Later pairs win, the same choice a full scan over zip(names, ids) makes.
    name_by_id = dict(zip(ids, names))
    # Join the names into one string used in the report file name.
    save_file_name = '-'.join([str(x) for x in names])
    # NOTE(review): decoding as GBK assumes the names are GBK bytes - confirm.
    report_path = u'./教师课件信息[%s].txt' % save_file_name.decode('gbk')
    # "with" closes the report even if writing a line fails.
    with open(report_path, 'w') as file_infos:
        for entry in file_urls_list:
            # Unknown IDs get an empty name instead of inheriting the
            # teacher of the previous line.
            teacher_name = name_by_id.get(entry[1], '')
            file_info = '%s : %s : %s\n' % (teacher_name, pwd, urllib.unquote(entry[2]).split('\\')[-1])
            # file_info already ends with a newline, so write it as is.
            sys.stdout.write(file_info)
            file_infos.write(file_info)

    print(u'=== 共%d个课件' % len(file_urls_list))


 # Download thread pool. It feels about as fast as a single thread; the
 # reason is not clear yet.
 class DownloadManager(object):
    """Queue ``download_files`` tasks and run them on worker threads."""

    # Ten threads by default; files are saved below ./downloads in a folder
    # named after the teacher.
    def __init__(self, file_urls_list, file_types, file_names, thread_num = 10, save_path='./downloads'):
        """Fill the task queue, then start *thread_num* Download workers.

        Args:
            file_urls_list: File tuples to consider for download.
            file_types: File types to download; may be empty.
            file_names: File names to download; may be empty.
            thread_num: Number of worker threads to start.
            save_path: Base directory for the downloaded files.
        """
        self.download_queue = Queue.Queue()
        self.threads = []
        self.file_urls_list = file_urls_list
        self.file_types = file_types
        self.file_names = file_names
        self.save_path = save_path
        # The queue must be populated before the workers start draining it.
        self.__init_download_queue(file_urls_list)
        self.__init_thread_pool(thread_num)

    def __init_download_queue(self, the_list):
        """Queue one task per file and filter value, or per file if unfiltered."""
        enqueue = self.download_queue.put
        if self.file_types:
            for entry in the_list:
                for wanted_type in self.file_types:
                    enqueue((download_files, (entry, wanted_type, '', self.save_path)))
        if self.file_names:
            for entry in the_list:
                for wanted_name in self.file_names:
                    enqueue((download_files, (entry, '', wanted_name, self.save_path)))
        if not (self.file_names or self.file_types):
            for entry in the_list:
                enqueue((download_files, (entry, '', '', self.save_path)))

    def __init_thread_pool(self, thread_num):
        """Create the workers; each Download thread starts itself."""
        for _ in range(thread_num):
            worker = Download(self.download_queue)
            self.threads.append(worker)

    def wait_all_complete(self):
        """Block until every worker thread has finished."""
        for worker in self.threads:
            if not worker.isAlive():
                continue
            worker.join()


 class Download(threading.Thread):
    """Worker thread that runs queued ``(function, args)`` tasks.

    The thread starts itself on construction and exits once the queue is
    empty.
    """

    def __init__(self, download_queue):
        threading.Thread.__init__(self)
        self.download_queue = download_queue
        self.start()

    def run(self):
        while True:
            try:
                # Non-blocking get: raises Queue.Empty once every task has
                # been taken, which is the signal to stop.
                do, args = self.download_queue.get(block=False)
            except Queue.Empty:
                break
            try:
                do(*args)
            except Exception as e:
                # One failing download must not end the worker and strand
                # the tasks still waiting in the queue.
                print('%s: task failed: %r' % (self.name, e))
            finally:
                self.download_queue.task_done()


 # Collect all courseware files of the selected teachers.
 stime = time.time()
 init_manager = InitFilesListManager(pages)
 print '=== getting files list...'
 print '=== need waiting...'
 init_manager.wait_all_complete()
 # Show every downloadable file.
 show_files()
 print '=== files get over! Use %f seconds'%(time.time() - stime)
 # Enter the file names to download, separated by spaces.
 file_names = raw_input('input download filenames:').split()
 file_types = ''
 if not file_names:
    # Enter the file types to download, separated by spaces.
    file_types = raw_input('input download filetypes:').split()

 start = time.time()

 print '=== downloading files...'
 # When both the names and the types are empty, every file is downloaded.
 download_manager = DownloadManager(file_urls_list, file_types, file_names)
 download_manager.wait_all_complete()
 print '=== OK! Use %f seconds!'%(time.time() - start)
	# !/usr/bin/env python
	# -*- coding:utf-8 -*-

	import urllib2
	import cookielib
	import re
	import urllib
	import os
	import Queue
	import threading
	import time


	# Route every request through one opener with a cookie jar, so cookies
	# received in a response are sent back on later requests.
	jar = cookielib.CookieJar()
	handler = urllib2.HTTPCookieProcessor(jar)
	opener = urllib2.build_opener(handler)
	urllib2.install_opener(opener)

	# Module-wide accumulators: folder link tuples and file link tuples found
	# while crawling.
	dir_urls_set = set()
	file_urls_list = []

	def get_id_and_name_list():
	    """Download the teacher index page and scan it for ``ID=...>name<`` links.

	    Returns:
	        An iterator of regex match objects: ``group(1)`` is the text after
	        ``ID=`` and ``group(2)`` the text between ``>`` and the next ``<``.
	    """
	    mainpage_url = 'http://wlcc.cuit.edu.cn/ShJsMd.asp'
	    mainpage_content = urllib2.urlopen(mainpage_url).read()
	    # Lazy ".*?" groups: with a bare ".?" only IDs and names of at most
	    # one character could ever match.
	    pattern = re.compile(r'ID=(.*?)>(.*?)<')
	    # Extract every teacher ID together with the link text after it.
	    return pattern.finditer(mainpage_content)

	def get_id_by_name(name):
	    """Look up the ID of the teacher called *name* on the index page.

	    Args:
	        name: Teacher name exactly as it appears as link text on the page.

	    Returns:
	        The ID string found in front of the matching name.

	    Raises:
	        ValueError: If the page contains no link with that name.
	    """
	    mainpage_url = 'http://wlcc.cuit.edu.cn/ShJsMd.asp'
	    mainpage_content = urllib2.urlopen(mainpage_url).read()
	    # Escape the name so regex metacharacters typed by the user match literally.
	    pattern = re.compile(r'ID=(.*?)>(%s)<' % re.escape(name))
	    match = pattern.search(mainpage_content)
	    if match is None:
	        raise ValueError('teacher not found on index page: %r' % (name,))
	    # The lazy group starts at the first "ID=" on the line and may span
	    # earlier links, so keep only what follows the last "ID=".
	    return match.group(1).split("ID=")[-1]

	def login(id, pwd):
	    """Request the login-check page for one teacher and return its body.

	    Args:
	        id: Teacher ID sent as the ``txtId`` query parameter.
	        pwd: Password sent as the ``txtMM`` query parameter.

	    Returns:
	        The response body. It is returned even when the login failed; in
	        that case 'password error' is printed first.
	    """
	    # URL-encode the values so characters such as '&', '#' or spaces in the
	    # password cannot corrupt the query string.
	    query = urllib.urlencode([('txtId', id), ('txtMM', pwd)])
	    url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp?' + query
	    page = opener.open(urllib2.Request(url)).read()
	    # The script treats a response containing "alert" as a rejected login.
	    if 'alert' in page:
	        print('password error')
	    return page

	# Collect the folder links found in a page.
	def get_dir_urls(page):
	    """Return every folder link in *page*.

	    Args:
	        page: HTML text of a file-manager page.

	    Returns:
	        A list of ``(dir_url, id, dir_path, dir_name)`` tuples, one per
	        match; an empty list when the page has no folder links.
	    """
	    # The pattern ends with the literal "&nbsp" entity (optional ";") that
	    # follows a folder link's closing tag.
	    pattern = re.compile(r'(?P<dir_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&dir=%5C(?P<dir_path>.+))>(?P<dir_name>.+)</a>&nbsp;?')
	    # findall already yields [] when nothing matches.
	    return pattern.findall(page)

	# Collect the file download links found in a page.
	def add_file_urls(page):
	    """Append every file download link in *page* to ``file_urls_list``.

	    Each appended entry is a ``(file_url, id, file_name, file_type)`` tuple;
	    ``file_name`` is still URL-encoded.

	    Args:
	        page: HTML text of a file-manager page.
	    """
	    # The dot in "fileman.asp" is escaped so it only matches a literal dot,
	    # in line with the folder-link pattern used by get_dir_urls.
	    pattern = re.compile(r"(?P<file_url>fileman\.asp\?Type=Js&ID=(?P<id>.+)&a=dl&f=%5C(?P<file_name>.+%2E(?P<file_type>.+?)))>")
	    file_urls_list.extend(pattern.findall(page))

	def download(file_url, save_path):
	    """Download one file into ``<save_path>/<teacher name>/``.

	    Args:
	        file_url: A ``(file_url, id, file_name, file_type)`` tuple as stored
	            in ``file_urls_list``.
	        save_path: Base directory; a sub-directory named after the teacher
	            whose ID matches ``file_url[1]`` is created below it.
	    """
	    # First teacher whose ID matches owns the file; '' when none matches.
	    teacher_name = ''
	    for name, teacher_id in zip(names, ids):
	        if teacher_id == file_url[1]:
	            teacher_name = name
	            break
	    save_path = '%s/%s' % (save_path, teacher_name)
	    # Several download threads may create the same directory at the same
	    # time, so tolerate "already exists" instead of checking beforehand.
	    try:
	        os.makedirs(save_path)
	    except OSError:
	        if not os.path.isdir(save_path):
	            raise
	    response = opener.open('http://wlcc.cuit.edu.cn/Share/%s' % (file_url[0]))
	    try:
	        data = response.read()
	    finally:
	        response.close()
	    # The encoded name is a backslash-separated path; keep the last part.
	    file_name = urllib.unquote(file_url[2]).split('\\')[-1]
	    with open('%s/%s' % (save_path, file_name), 'wb') as f:
	        f.write(data)
	    print('%s:%s is downloaded!' % (threading.currentThread().getName(), file_name))

	def download_files(file_url, file_type='', file_name='', save_path='.'):
	    """Download *file_url* when it passes the requested filter.

	    Args:
	        file_url: A ``(file_url, id, file_name, file_type)`` tuple.
	        file_type: When given alone, download only if ``file_url[3]`` equals it.
	        file_name: When given alone, download only if the decoded base name
	            of ``file_url[2]`` equals it.
	        save_path: Base directory handed on to ``download``.

	    With neither filter the file is always downloaded; with both filters
	    nothing is downloaded.
	    """
	    if file_name and file_type:
	        return
	    if file_name:
	        # Filter by file name.
	        wanted = urllib.unquote(file_url[2]).split('\\')[-1] == file_name
	    elif file_type:
	        # Filter by file type.
	        wanted = file_url[3] == file_type
	    else:
	        # No filter: download everything.
	        wanted = True
	    if wanted:
	        download(file_url, save_path)

	# Ask for teacher names (whitespace separated); when none are entered, every
	# teacher on the index page is tried against the given password.
	names = raw_input('teacher:').split()
	pwd = raw_input('password:')
	# Filled alongside names: the teacher IDs and the pages returned by login.
	ids = []
	pages = []


	# Serialises updates to names/ids/pages so entries added by concurrent
	# do_check calls stay at matching indexes in the three lists.
	_check_results_lock = threading.Lock()


	# Verify a password.
	def do_check(name, id, pwd):
	    """POST one login attempt and record the teacher when it succeeds.

	    Args:
	        name: Teacher name, appended to ``names`` on success.
	        id: Teacher ID sent as ``txtId``, appended to ``ids`` on success.
	        pwd: Password sent as ``txtMM``.
	    """
	    check_login_url = 'http://wlcc.cuit.edu.cn/Share/ChkLgn.asp'
	    # URL-encode the form values so special characters survive the POST.
	    post_datas = urllib.urlencode([('txtId', id), ('txtMM', pwd)])
	    request = urllib2.Request(url=check_login_url, data=post_datas)
	    content = opener.open(request).read()
	    # A wrong password produces a page with an alert; no alert means the
	    # login went through and the page was entered.
	    if 'alert' not in content:
	        with _check_results_lock:
	            names.append(name)
	            ids.append(id)
	            pages.append(content)

	# Thread pool for the password checks.
	class CheckManager(object):
	    """Queue one ``do_check`` task per teacher and run them on worker threads."""

	    def __init__(self, id_and_name_list, pwd, thread_num=5):
	        """Fill the task queue, then start *thread_num* Check workers.

	        Args:
	            id_and_name_list: Iterable of match objects whose ``group(1)``
	                is a teacher ID and ``group(2)`` the teacher name.
	            pwd: Password tried for every teacher.
	            thread_num: Number of worker threads to start.
	        """
	        self.pwd = pwd
	        self.check_queue = Queue.Queue()
	        self.threads = []
	        # The queue must be populated before the workers start draining it.
	        self.__init_check_queue(id_and_name_list)
	        self.__init_thread_pool(thread_num)

	    def __init_check_queue(self, the_list):
	        """Put a ``(do_check, (name, id, pwd))`` task on the queue per match."""
	        for item in the_list:
	            self.check_queue.put((do_check, (item.group(2), item.group(1), self.pwd)))

	    def __init_thread_pool(self, thread_num):
	        """Create the workers; each Check thread starts itself."""
	        for _ in range(thread_num):
	            self.threads.append(Check(self.check_queue))

	    def wait_all_complete(self):
	        """Block until every worker thread has finished."""
	        for t in self.threads:
	            if t.isAlive():
	                t.join()

	class Check(threading.Thread):
	    """Worker thread that runs queued ``(function, args)`` tasks.

	    The thread starts itself on construction and exits once the queue is
	    empty.
	    """

	    def __init__(self, check_queue):
	        threading.Thread.__init__(self)
	        self.check_queue = check_queue
	        self.start()

	    def run(self):
	        while True:
	            try:
	                # Non-blocking get: raises Queue.Empty once every task has
	                # been taken, which is the signal to stop.
	                check, args = self.check_queue.get(block=False)
	            except Queue.Empty:
	                break
	            try:
	                check(*args)
	            except Exception as e:
	                # One failing task must not end the worker and strand the
	                # tasks still waiting in the queue.
	                print('%s: task failed: %r' % (self.name, e))
	            finally:
	                self.check_queue.task_done()


	# Log in for each named teacher, or (when no name was entered) try the
	# password against every teacher on the index page using worker threads.
	print('=== login checking...')
	if names:
	    for name in names:
	        # Look up the ID that belongs to the entered name.
	        id = get_id_by_name(name)
	        # Log in to the courseware page; login() returns the page content.
	        page = login(id, pwd)
	        ids.append(id)
	        pages.append(page)
	else:
	    id_and_name_list = get_id_and_name_list()
	    check_manager = CheckManager(id_and_name_list, pwd)
	    check_manager.wait_all_complete()
	print('=== check over!')

	# Build the list of folder URLs.
	def init_dirs_list(page):
	    """Add every folder reachable from *page* to ``dir_urls_set``.

	    Sub-folders are followed until no new folder URL turns up, so the result
	    no longer depends on what other calls had already put into the set.
	    Folders whose decoded URL contains the GBK-encoded student-upload marker
	    are recorded but never opened.

	    Args:
	        page: HTML text of a teacher's top-level file-manager page.
	    """
	    skipped_marker = u'学生上传作业'.encode('gbk')
	    found = get_dir_urls(page)
	    dir_urls_set.update(found)
	    pending = list(found)
	    # Keyed on the URL alone so a folder is fetched only once even when it
	    # is linked under different captions.
	    opened = set()
	    while pending:
	        dir_url = pending.pop()
	        if dir_url[0] in opened:
	            continue
	        opened.add(dir_url[0])
	        if skipped_marker in urllib.unquote(dir_url[0]):
	            continue
	        request = urllib2.Request('http://wlcc.cuit.edu.cn/share/%s' % dir_url[0])
	        found = get_dir_urls(opener.open(request).read())
	        dir_urls_set.update(found)
	        pending.extend(found)

	# Walk the folder URL list and build the list of file URLs.
	def init_files_list(page):
	    """Record the files on *page* and in every known folder.

	    Files linked on *page* are added first, then the folder set is filled
	    from *page*, and finally each folder in ``dir_urls_set`` is fetched and
	    its file links are added as well.

	    Args:
	        page: HTML text of a teacher's top-level file-manager page.
	    """
	    add_file_urls(page)
	    init_dirs_list(page)

	    # Iterate over a copy of the set rather than the set itself.
	    for dir_url in list(dir_urls_set):
	        request = urllib2.Request('http://wlcc.cuit.edu.cn/share/%s' % dir_url[0])
	        add_file_urls(opener.open(request).read())

	class InitFilesListManager(object):
	    """Queue one ``init_files_list`` task per page and run them on threads."""

	    def __init__(self, pages, thread_num=5):
	        """Fill the task queue, then start *thread_num* Init workers.

	        Args:
	            pages: Iterable of page contents, one per logged-in teacher.
	            thread_num: Number of worker threads to start.
	        """
	        self.init_queue = Queue.Queue()
	        self.threads = []
	        # The queue must be populated before the workers start draining it.
	        self.__init_queue(pages)
	        self.__init_thread_pool(thread_num)

	    def __init_queue(self, pages):
	        """Put an ``(init_files_list, page)`` task on the queue per page."""
	        for page in pages:
	            self.init_queue.put((init_files_list, page))

	    def __init_thread_pool(self, thread_num):
	        """Create the workers; each Init thread starts itself."""
	        for _ in range(thread_num):
	            self.threads.append(Init(self.init_queue))

	    def wait_all_complete(self):
	        """Block until every worker thread has finished."""
	        for t in self.threads:
	            if t.isAlive():
	                t.join()


	class Init(threading.Thread):
	    """Worker thread that runs queued ``(function, argument)`` tasks.

	    The thread starts itself on construction and exits once the queue is
	    empty.
	    """

	    def __init__(self, init_queue):
	        threading.Thread.__init__(self)
	        self.init_queue = init_queue
	        self.start()

	    def run(self):
	        while True:
	            try:
	                # Non-blocking get: raises Queue.Empty once every task has
	                # been taken, which is the signal to stop.
	                do, arg = self.init_queue.get(block=False)
	            except Queue.Empty:
	                break
	            try:
	                do(arg)
	            except Exception as e:
	                # One failing task must not end the worker and strand the
	                # tasks still waiting in the queue.
	                print('%s: task failed: %r' % (self.name, e))
	            finally:
	                self.init_queue.task_done()



	def show_files():
	    """Print the discovered files and save the same listing to a text file.

	    ``file_urls_list`` is replaced by its sorted, de-duplicated form. Each
	    output line is ``<teacher> : <password> : <file name>``. The report file
	    is named after the teacher names joined with '-'.
	    """
	    import sys

	    global file_urls_list
	    file_urls_list = sorted(set(file_urls_list))
	    # Later pairs win, the same choice a full scan over zip(names, ids) makes.
	    name_by_id = dict(zip(ids, names))
	    # Join the names into one string used in the report file name.
	    save_file_name = '-'.join([str(x) for x in names])
	    # NOTE(review): decoding as GBK assumes the names are GBK bytes - confirm.
	    report_path = u'./教师课件信息[%s].txt' % save_file_name.decode('gbk')
	    # "with" closes the report even if writing a line fails.
	    with open(report_path, 'w') as file_infos:
	        for entry in file_urls_list:
	            # Unknown IDs get an empty name instead of inheriting the
	            # teacher of the previous line.
	            teacher_name = name_by_id.get(entry[1], '')
	            file_info = '%s : %s : %s\n' % (teacher_name, pwd, urllib.unquote(entry[2]).split('\\')[-1])
	            # file_info already ends with a newline, so write it as is.
	            sys.stdout.write(file_info)
	            file_infos.write(file_info)

	    print(u'=== 共%d个课件' % len(file_urls_list))


	# Download thread pool. It feels about as fast as a single thread; the
	# reason is not clear yet.
	class DownloadManager(object):
	    """Queue ``download_files`` tasks and run them on worker threads."""

	    # Ten threads by default; files are saved below ./downloads in a folder
	    # named after the teacher.
	    def __init__(self, file_urls_list, file_types, file_names, thread_num = 10, save_path='./downloads'):
	        """Fill the task queue, then start *thread_num* Download workers.

	        Args:
	            file_urls_list: File tuples to consider for download.
	            file_types: File types to download; may be empty.
	            file_names: File names to download; may be empty.
	            thread_num: Number of worker threads to start.
	            save_path: Base directory for the downloaded files.
	        """
	        self.download_queue = Queue.Queue()
	        self.threads = []
	        self.file_urls_list = file_urls_list
	        self.file_types = file_types
	        self.file_names = file_names
	        self.save_path = save_path
	        # The queue must be populated before the workers start draining it.
	        self.__init_download_queue(file_urls_list)
	        self.__init_thread_pool(thread_num)

	    def __init_download_queue(self, the_list):
	        """Queue one task per file and filter value, or per file if unfiltered."""
	        if self.file_types:
	            for item in the_list:
	                for file_type in self.file_types:
	                    self.download_queue.put((download_files, (item, file_type, '', self.save_path)))
	        if self.file_names:
	            for item in the_list:
	                for file_name in self.file_names:
	                    self.download_queue.put((download_files, (item, '', file_name, self.save_path)))
	        if not self.file_names and not self.file_types:
	            for item in the_list:
	                self.download_queue.put((download_files, (item, '', '', self.save_path)))

	    def __init_thread_pool(self, thread_num):
	        """Create the workers; each Download thread starts itself."""
	        for _ in range(thread_num):
	            self.threads.append(Download(self.download_queue))

	    def wait_all_complete(self):
	        """Block until every worker thread has finished."""
	        for t in self.threads:
	            if t.isAlive():
	                t.join()


	class Download(threading.Thread):
	    """Worker thread that runs queued ``(function, args)`` tasks.

	    The thread starts itself on construction and exits once the queue is
	    empty.
	    """

	    def __init__(self, download_queue):
	        threading.Thread.__init__(self)
	        self.download_queue = download_queue
	        self.start()

	    def run(self):
	        while True:
	            try:
	                # Non-blocking get: raises Queue.Empty once every task has
	                # been taken, which is the signal to stop.
	                do, args = self.download_queue.get(block=False)
	            except Queue.Empty:
	                break
	            try:
	                do(*args)
	            except Exception as e:
	                # One failing download must not end the worker and strand
	                # the tasks still waiting in the queue.
	                print('%s: task failed: %r' % (self.name, e))
	            finally:
	                self.download_queue.task_done()


	# Collect all courseware files of the selected teachers.
	stime = time.time()
	init_manager = InitFilesListManager(pages)
	print('=== getting files list...')
	print('=== need waiting...')
	init_manager.wait_all_complete()
	# Show every downloadable file.
	show_files()
	print('=== files get over! Use %f seconds' % (time.time() - stime))
	# Enter the file names to download, separated by spaces.
	file_names = raw_input('input download filenames:').split()
	file_types = ''
	if not file_names:
	    # Enter the file types to download, separated by spaces.
	    file_types = raw_input('input download filetypes:').split()

	start = time.time()

	print('=== downloading files...')
	# When both the names and the types are empty, every file is downloaded.
	download_manager = DownloadManager(file_urls_list, file_types, file_names)
	download_manager.wait_all_complete()
	print('=== OK! Use %f seconds!' % (time.time() - start))
No results found