Created
November 9, 2019 12:28
-
-
Save Sciroccogti/8bbbcdeccc818db868abfadfa7f0cb48 to your computer and use it in GitHub Desktop.
爬取东南大学旧课程表的小脚本,作为自己第一个投入实战的小爬虫,留个纪念罢了
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding:utf-8 | |
import requests # 导入网页请求库 | |
import re # 导入正则表达式库 | |
import openpyxl | |
from bs4 import BeautifulSoup # 导入网页解析库 | |
def _course_marker(title):
    """Return the HTML snippet that marks course *title* in a timetable cell."""
    return '<font class="style8">' + title + '</font>'


# Regex capturing the student's name from a header cell of the page.
name_pattern = r'姓名:(.*?)</td>'

# Result rows: [name, tongdian, tongyuan, dsp, weibo, coa].
name_list = []

# Literal markers searched for in each timetable cell, one per tracked course.
tongdian_pattern = _course_marker('通信电子线路')
tongyuan_pattern = _course_marker('通信原理(双语)')
dsp_pattern = _course_marker('数字信号处理')
weibo_pattern = _course_marker('微波工程基础')
coa_pattern = _course_marker('计算机组织与结构(双语)I')

# Student IDs to query: two contiguous ranges (upper bounds are exclusive).
IDlist = [
    *range(4217701, 4217753),
    *range(4218801, 4218834),
]
# Course markers in the same order as the result columns:
# tongdian, tongyuan, dsp, weibo, coa.
course_patterns = (
    tongdian_pattern,
    tongyuan_pattern,
    dsp_pattern,
    weibo_pattern,
    coa_pattern,
)

# For every student ID, fetch the 19-20-2 timetable page, extract the
# student's name, and record which of the tracked courses appear on it.
for studentID in IDlist:
    # The service expects the ID prefixed with a literal '0'.
    url = (
        'http://xk.urp.seu.edu.cn/jw_service/service/stuCurriculum.action?queryStudentId=0'
        + str(studentID)
        + '&queryAcademicYear=19-20-2'
    )
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Skip this student but keep the rows collected so far.
        print('request failed for', studentID, exc)
        continue

    soup = BeautifulSoup(r.text, 'html.parser')

    # The name lives in one of the left-aligned 20%-wide header cells.
    student_name = None
    for content in soup.find_all('td', attrs={'width': '20%', 'align': 'left'}):
        name = re.search(name_pattern, str(content))
        if name:
            student_name = name.group(1)
            print(student_name, studentID)
            break
    if student_name is None:
        # No name on the page: nothing to attribute the courses to. Skipping
        # also avoids reusing a match left over from a previous student.
        continue

    # Serialize each timetable cell once, then test every course marker on it.
    lesson_cells = [
        str(lesson)
        for lesson in soup.find_all(
            'td',
            attrs={'height': '34', 'class': 'line_topleft', 'width': '35%', 'align': 'center'},
        )
    ]
    marks = [
        '■' if any(pattern in cell for cell in lesson_cells) else ''
        for pattern in course_patterns
    ]

    # Only students taking at least one tracked course get a row.
    if any(marks):
        row = [student_name, *marks]
        print(row)
        name_list.append(row)
# Export the collected rows to list.xlsx in the current working directory.
print('opening xlsx...')
# Create a new workbook.
wb = openpyxl.Workbook()
# Insert a sheet named 'list' at index 0 so it is the first tab; the
# workbook's initial default sheet is left in place after it.
ws = wb.create_sheet('list', 0)
# Header row. The fourth column is the course matched by dsp_pattern
# (数字信号处理); the previous header misspelled it as 数组信号处理.
ws.append(['姓名', '通信电子线路', '通信原理', '数字信号处理', '微波工程', '计算机组织与结构'])
# One row per student: name followed by a '■' mark or '' for each course.
for row in name_list:
    ws.append(row)
# Save the xlsx file.
wb.save('list.xlsx')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment