Skip to content

Instantly share code, notes, and snippets.

@papadave66
Last active June 7, 2019 05:49
Show Gist options
  • Save papadave66/e1368bc3940b6239b01a8c8258923fef to your computer and use it in GitHub Desktop.
Save papadave66/e1368bc3940b6239b01a8c8258923fef to your computer and use it in GitHub Desktop.
An answer spider for www.neumooc.com
import getpass
import json
import re
import sys
import time
import urllib
import urllib.request
from urllib import request,parse

import requests
# Identifier of the course whose pages are requested by the rest of the script.
courseId = "3B0BF5AFB336476F815E3934181F1DA2"
print("正在初始化...")

# Open the login page once; the JSESSIONID cookie set by this response is
# read below and copied into the explicit Cookie header.
session = requests.session()
res = session.get("http://www.neumooc.com/login/login")
cookies = requests.utils.dict_from_cookiejar(res.cookies)

referer = "http://www.neumooc.com/course/play/init?courseId={}".format(courseId)

# Browser-like request headers.  The very same dict object is installed on
# the session, so later changes to `headers` are also seen by `session`.
headers = {
    "Host": "www.neumooc.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "DNT": "1",
    "Referer": referer,
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": "JSESSIONID={}".format(cookies["JSESSIONID"]),
}
session.headers = headers
# --- Login -----------------------------------------------------------------
# Ask for the credentials; getpass does not echo the password.
username = input("请输入你的学号:")
psword = getpass.getpass("请输入你的密码(不会显示,正常输入后回车即可):")

# Fixed captcha value submitted with the login form.  The separate captcha
# pre-check request (POST "captchaCode=<yz>" to /login/captchaCodeCheck) was
# commented out in the original script and stays disabled.
yz = "22"

# URL-encode the form fields so that characters such as "&", "=", "+", "%"
# or non-ASCII text in the user name / password cannot corrupt the form.
body = parse.urlencode([
    ("userRequestUrl", ""),
    ("userName", username),
    ("password", psword),
    ("captchaCode", yz),
]).encode("utf-8")

# NOTE: the endpoint is plain http://, so the credentials travel unencrypted.
url = "http://www.neumooc.com/login/checkLogin"
res = session.post(url, body)

# Start from every cookie the session has collected so far and let the
# cookies of the login response take precedence.  This way JSESSIONID is
# still available when the login response only sets "uid".
cookies = requests.utils.dict_from_cookiejar(session.cookies)
cookies.update(requests.utils.dict_from_cookiejar(res.cookies))

# Without both cookies the following requests cannot be authenticated;
# stop with a readable message instead of an unexplained KeyError.
missing = [name for name in ("JSESSIONID", "uid") if name not in cookies]
if missing:
    sys.exit("登录失败,请检查学号和密码(缺少 cookie: " + ", ".join(missing) + ")")

# Login finished: publish the authenticated cookie pair in the shared header
# dict (the same object that was assigned to session.headers earlier).
headers["Cookie"] = "JSESSIONID=" + cookies["JSESSIONID"] + ";uid=" + cookies["uid"] + ";"
# --- Course landing page -----------------------------------------------------
# Download the course page with the authenticated headers.  The context
# manager closes the HTTP response on success and on failure alike.
url = "http://www.neumooc.com/course/play/init?courseId=" + courseId
req = request.Request(url=url, headers=headers, method="GET")
try:
    with request.urlopen(req) as response:
        res = response.read().decode('utf-8')
except Exception:
    # Script-level boundary: report the problem, pause (presumably so the
    # message stays readable in a console window that closes on exit) and
    # terminate with a non-zero status.
    print("无法初始化,请检查网络设置")
    time.sleep(20)
    sys.exit(1)

# Patterns applied to the downloaded HTML.  Raw strings keep the regex
# escapes (\( and \[) from being treated as invalid string escapes.
course = re.compile(r'class="childLi outl_(.*?)"')
test = re.compile(r"showTest\(this, ., '(.*?)'\)")
title = re.compile(r'none;">(.*?)</span>')

tittle = title.findall(res)            # captured title texts, kept as an index
outlineId_grup = course.findall(res)   # outline ids found in the page
testqueslistfind = re.compile(r'{"resInfo":{"testQuesList":\[]')
resId_grup = []                        # filled by the per-outline loop below
# --- Collect the quiz resId of every outline ---------------------------------
# Each outline page is downloaded and searched for the showTest(...) button.
# Every match is appended to resId_grup; an outline without a match (or one
# that could not be downloaded) contributes a single ' ' placeholder so the
# list keeps one slot per outline in that case.
#
# The loop starts at the FIRST outline id.  The previous version incremented
# its index before using it, which skipped outlineId_grup[0] and shifted
# resId_grup by one position relative to outlineId_grup.
test = re.compile(r"showTest\(this, ., '(.*?)'\)")  # resId of the quiz button
_MAX_FETCH_ATTEMPTS = 3  # bounded retry instead of retrying forever

for outlineId in outlineId_grup:
    url = ("http://www.neumooc.com/course/play/init?courseId=" + courseId
           + "&outlineId=" + str(outlineId))
    req = request.Request(url=url, headers=headers, method="GET")

    res = None
    for attempt in range(1, _MAX_FETCH_ATTEMPTS + 1):
        try:
            # The context manager closes the response even when read() or
            # decode() fails.
            with request.urlopen(req) as response:
                res = response.read().decode('utf-8')
            break
        except Exception:
            if attempt < _MAX_FETCH_ATTEMPTS:
                print("初始化异常,即将重试该题")
            else:
                print("多次重试失败,跳过该题")

    resid = test.findall(res) if res is not None else []
    # No quiz on this outline (or download failed): keep a placeholder entry.
    resId_grup += resid if resid else [' ']

# Debug aids (disabled): print outlineId_grup / resId_grup and their lengths
# here to inspect what was collected.
# --- Download and print the questions with their answers ---------------------
req_answer_url = 'http://www.neumooc.com/course/play/getOutlineResInfo'

# Markup removed from the answer text, in this order.
_ANSWER_TAGS = ('<as>', '</as>', '</a>', '<a>')

for i, res_id in enumerate(resId_grup):
    # ' ' entries are placeholders for outlines without a quiz; there is
    # nothing to request for them.
    if not res_id.strip():
        continue

    # resId_grup may hold more entries than outlineId_grup (an outline page
    # can contribute several resIds), so clamp the index instead of raising
    # IndexError.
    outline_id = outlineId_grup[min(i, len(outlineId_grup) - 1)]

    referer = ('http://www.neumooc.com/course/play/init?courseId=' + courseId
               + '&outlineId=' + outline_id)
    # "Accept-Encoding: gzip, deflate" is deliberately not sent: urllib does
    # not decompress response bodies, so a compressed reply could not be
    # decoded as UTF-8 below.
    req_answer_headers = {
        "Host": "www.neumooc.com",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Accept": "*/*",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Referer": referer,
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "X-Requested-With": "XMLHttpRequest",
        "Cookie": "JSESSIONID=" + cookies["JSESSIONID"] + ";uid=" + cookies["uid"] + ";",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }

    # URL-encode the form so unusual characters in an id cannot break it.
    data = parse.urlencode([
        ("resId", res_id),
        ("resType", "2"),
        ("outlineId", outline_id),
        ("courseId", courseId),
    ]).encode("utf-8")
    req = request.Request(url=req_answer_url, data=data,
                          headers=req_answer_headers, method="POST")

    try:
        # Fetch the question list; the response is closed by the context
        # manager whether or not reading succeeds.
        with request.urlopen(req) as response:
            web = response.read().decode("utf-8")

        loaded_json = json.loads(web)
        for question in loaded_json.get('resInfo').get('testQuesList'):
            ques = question['quesContent']
            keys = question['quesAnswer']
            for tag in _ANSWER_TAGS:
                keys = keys.replace(tag, '')
            print(ques + '\t' + keys + '\n')
    except Exception as exc:
        # Per-item boundary: one failed request or malformed reply must not
        # abort the remaining items.  Report on stderr so redirected stdout
        # (the collected answers) stays clean.
        print("获取答案失败,已跳过: resId=" + res_id + " (" + repr(exc) + ")",
              file=sys.stderr)
        continue
@papadave66
Copy link
Author

Q: How do I use this?
A: Run: python3 neumooc-spider.py | tee -a ANSWERS

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment