Last active
June 7, 2019 05:49
-
-
Save papadave66/e1368bc3940b6239b01a8c8258923fef to your computer and use it in GitHub Desktop.
An answer spider for www.neumooc.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import getpass
import json
import re
import sys
import time
import urllib
import urllib.request
from urllib import request, parse

import requests
# Identifier of the course whose test answers are scraped.
courseId = "3B0BF5AFB336476F815E3934181F1DA2"


def _abort(message):
    """Print *message*, pause, then terminate the script with exit status 1."""
    print(message)
    # The original script paused 20 s before quitting; presumably so that a
    # console window opened by double-click stays readable for a moment.
    time.sleep(20)
    sys.exit(1)


print("正在初始化...")

# Visit the login page first so the server hands out a JSESSIONID cookie.
session = requests.session()
res = session.get("http://www.neumooc.com/login/login")
# Read from the session jar, not from the last response: the jar also keeps
# cookies that were set on intermediate redirect responses.
cookies = requests.utils.dict_from_cookiejar(session.cookies)
if "JSESSIONID" not in cookies:
    _abort("无法初始化,请检查网络设置")

referer = "http://www.neumooc.com/course/play/init?courseId=" + courseId
headers = {
    "Host": "www.neumooc.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "DNT": "1",
    "Referer": referer,
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": "JSESSIONID=" + cookies["JSESSIONID"],
}
session.headers = headers

username = input("请输入你的学号:")
psword = getpass.getpass("请输入你的密码(不会显示,正常输入后回车即可):")
# Fixed captcha value submitted with the login form.
# NOTE: the separate pre-check request to /login/captchaCodeCheck was already
# disabled in the original script and is still not sent.
yz = "22"

# urlencode() percent-escapes the credentials, so characters such as '&', '=',
# '+' or '%' in a password no longer corrupt the form body.
body = parse.urlencode({
    "userRequestUrl": "",
    "userName": username,
    "password": psword,
    "captchaCode": yz,
}).encode("utf-8")
url = "http://www.neumooc.com/login/checkLogin"
res = session.post(url, body)
cookies = requests.utils.dict_from_cookiejar(session.cookies)
# Both cookies are needed to build the headers of every later request.
# A missing one most likely means the login was rejected.
if "JSESSIONID" not in cookies or "uid" not in cookies:
    _abort("登录失败,请检查学号和密码")

# Login finished: from here on the plain urllib requests carry the cookies.
headers["Cookie"] = "JSESSIONID=" + cookies["JSESSIONID"] + ";uid=" + cookies["uid"] + ";"
url = "http://www.neumooc.com/course/play/init?courseId=" + courseId
req = request.Request(url=url, headers=headers, method="GET")
try:
    # The context manager closes the response on success and on failure.
    with request.urlopen(req) as response:
        res = response.read().decode('utf-8')
except (OSError, ValueError):
    # OSError covers urllib's URLError/HTTPError and socket failures;
    # ValueError covers a body that is not valid UTF-8.
    _abort("无法初始化,请检查网络设置")
# Patterns used to scrape the course pages. Raw strings keep the regex escapes
# (\( \) \[) from being read as invalid string escapes; the compiled patterns
# are character-for-character the same as before.
course = re.compile(r'class="childLi outl_(.*?)"')
test = re.compile(r"showTest\(this, ., '(.*?)'\)")
title = re.compile(r'none;">(.*?)</span>')
testqueslistfind = re.compile(r'{"resInfo":{"testQuesList":\[]')

tittle = title.findall(res)  # section titles found on the course page
discovered_outline_ids = course.findall(res)  # every outline id on the page

# Upper bound on fetch attempts per outline, so that a permanently failing
# page can no longer trap the script in an endless retry loop.
MAX_FETCH_ATTEMPTS = 5

# resId_grup[i] is a test resource id and outlineId_grup[i] the outline that
# owns it. The two lists are built in lockstep because the request loop
# further down indexes both with the same position.
resId_grup = []
aligned_outline_ids = []
for outlineId in discovered_outline_ids:
    url = "http://www.neumooc.com/course/play/init?courseId=" + courseId + "&outlineId=" + str(outlineId)
    req = request.Request(url=url, headers=headers, method="GET")

    page = None
    for _ in range(MAX_FETCH_ATTEMPTS):
        try:
            # The context manager closes the response on every path.
            with request.urlopen(req) as response:
                page = response.read().decode('utf-8')
        except (OSError, ValueError):
            # Network failure or undecodable body: try the same outline again.
            print("初始化异常,即将重试该题")
            continue
        break

    if page is None:
        # Diagnostics go to stderr so they do not end up in piped output.
        print("多次重试失败,已跳过: " + str(outlineId), file=sys.stderr)
        found_res_ids = []
    else:
        # resId arguments of the "showTest" buttons on this outline's page.
        found_res_ids = test.findall(page)

    if not found_res_ids:
        # Outline without a test: keep the ' ' placeholder entry.
        found_res_ids = [' ']

    # One outline entry per resource id keeps both lists the same length even
    # when a single outline offers several tests.
    for found_id in found_res_ids:
        aligned_outline_ids.append(outlineId)
        resId_grup.append(found_id)

outlineId_grup = aligned_outline_ids
# resId / courseId / outlineId collection is complete at this point.
# Endpoint that returns a test's questions together with their answers.
req_answer_url = 'http://www.neumooc.com/course/play/getOutlineResInfo'

# Headers shared by every answer request; only "Referer" varies per outline.
# NOTE(review): urllib does not transparently decompress responses. If the
# server ever honours "Accept-Encoding", the UTF-8 decode below will fail.
# Confirm, or drop that header.
answer_base_headers = {
    "Host": "www.neumooc.com",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Accept": "*/*",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "X-Requested-With": "XMLHttpRequest",
    "Cookie": "JSESSIONID=" + cookies["JSESSIONID"] + ";uid=" + cookies["uid"] + ";",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}

# zip() pairs entries by position and stops at the shorter list, so a surplus
# of resource ids can no longer raise IndexError on outlineId_grup.
for answer_outline_id, answer_res_id in zip(outlineId_grup, resId_grup):
    if not answer_res_id.strip():
        # ' ' marks an outline without a test; there is nothing to request.
        continue

    referer = 'http://www.neumooc.com/course/play/init?courseId=' + courseId + '&outlineId=' + answer_outline_id
    req_answer_headers = dict(answer_base_headers, Referer=referer)
    data = parse.urlencode({
        "resId": answer_res_id,
        "resType": "2",
        "outlineId": answer_outline_id,
        "courseId": courseId,
    }).encode("utf-8")
    req = request.Request(url=req_answer_url, data=data, headers=req_answer_headers, method="POST")

    # Fetch the questions and answers; the context manager closes the response.
    with request.urlopen(req) as response:
        web = response.read().decode("utf-8")

    try:
        loaded_json = json.loads(web)
        for question in loaded_json.get('resInfo').get('testQuesList'):
            ques = question['quesContent']
            keys = question['quesAnswer']
            # Strip the markup tags that wrap the answer text.
            for markup_tag in ('<as>', '</as>', '</a>', '<a>'):
                keys = keys.replace(markup_tag, '')
            print(ques + '\t' + keys + '\n')
    except (ValueError, AttributeError, KeyError, TypeError):
        # Not JSON, or JSON without the expected question structure: skip this
        # resource. Interrupts and unrelated errors are no longer swallowed.
        continue
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Q: How do I use this?
A:
python3 neumooc-spider.py | tee -a ANSWERS