Skip to content

Instantly share code, notes, and snippets.

@WuXianglong
Created January 22, 2017 07:14
Show Gist options
  • Save WuXianglong/a4f976e67c62ceea4f0efa61f38702f4 to your computer and use it in GitHub Desktop.
Save WuXianglong/a4f976e67c62ceea4f0efa61f38702f4 to your computer and use it in GitHub Desktop.
爬虫
# -*- coding: utf-8 -*-
import bs4
import requests
# Not referenced by any code visible in this file; presumably the
# human-facing listing page the other two endpoints belong to -- confirm.
BASE_URL = 'http://bjcme.haoyisheng.com/beijing_project/listProjectGongbu.jsp'
# Listing endpoint fetched by get_content(). The whole query string is
# hard-coded (pageSize=10, year=2017, gongbu=3, orderBy=subject, ...), so only
# this single result page is ever requested.
# NOTE(review): "d-1342871-p=1" looks like the page-number parameter -- confirm.
CONTENT_URL = 'http://bjcme.haoyisheng.com/beijing_project/projectGongbuList.do?d-1342871-p=1&beian=&orderBy=subject&pici=&pageSize=10&parentSubjectId=&sdanwei=&type=&xmanager=&gongbu=3&gongbuCode=&subjectId=&name=&scode=&year=2017'
# Teacher-list endpoint fetched by get_teachers(); the trailing %s is filled
# with the "projectid" attribute read from a listing row's button.
TEACHER_URL = 'http://bjcme.haoyisheng.com/beijing_project/projectTeacherList.do?requestType=PRINT&id=%s'
def _flatten(text):
    """Return *text* with every carriage return, newline and tab removed."""
    return text.replace('\r', '').replace('\n', '').replace('\t', '')


def _split_subject_cell(raw_text):
    """Split the first listing cell into (visible text, subject name).

    The cell text contains the visible value followed by an inline script of
    the form ``var subject ... name:"<subject>"};``. The subject name is an
    empty string when that script (or its ``name:"`` field) is absent.
    """
    pieces = _flatten(raw_text).split('var subject')
    subject = u''
    if len(pieces) > 1:
        after_name = pieces[1].split('name:"')
        if len(after_name) > 1:
            subject = after_name[1].split('"};')[0]
    return pieces[0], subject


def get_content():
    """Fetch the project listing, print it tab-separated and return the rows.

    Downloads CONTENT_URL, prints one header line followed by one line per
    table row, and collects the same row data in a list.

    Per-cell handling inside a row:
      * first cell: visible text plus the subject name embedded in its inline
        script; the subject name is emitted as an extra leading column;
      * last cell: replaced by the teacher summary from get_teachers(), looked
        up through the ``projectid`` attribute of the cell's button;
      * cell at index 5: its first three text fragments are merged as
        ``<0><1> <2>``;
      * any other cell: the text before the first ``|`` with CR/LF/TAB removed.

    Returns:
        A list of rows; each row is a list of strings in printed column order.

    Raises:
        requests.RequestException: on timeout, connection or HTTP status error.
        ValueError: if the response contains no ``<thead>`` or ``<tbody>``.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    resp = requests.get(CONTENT_URL, timeout=30)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')

    thead = soup.select_one('thead')
    tbody = soup.select_one('tbody')
    if thead is None or tbody is None:
        raise ValueError('listing table not found in response from %s' % CONTENT_URL)

    headers = [cell.text for cell in thead.find_all('th')]
    # Single-argument print(...) is valid on Python 2 and 3. The header uses
    # the same plain-tab delimiter as the data rows so the columns line up.
    print(u'\t'.join([u'学科名称'] + headers))

    contents = []
    for row in tbody.find_all('tr'):
        tds = row.find_all('td')
        last_index = len(tds) - 1
        data = []
        for i, cell in enumerate(tds):
            if i == 0:
                text, subject = _split_subject_cell(cell.text)
                data.append(subject)
            elif i == last_index:
                button = cell.select_one('button')
                project_id = button.get('projectid') if button is not None else None
                # A row without a project button has no teacher list to fetch.
                text = get_teachers(project_id) if project_id else u''
            elif i == 5:
                parts = cell.get_text('|', strip=True).split('|')
                # Pad so cells with fewer than three fragments do not raise.
                parts += [u''] * (3 - len(parts))
                text = parts[0] + parts[1] + u' ' + parts[2]
            else:
                text = _flatten(cell.text.split('|')[0])
            data.append(text)
        print(u'\t'.join(data))
        contents.append(data)
    return contents
def get_teachers(projectid):
    """Return the teacher table of one project as a single string.

    Downloads TEACHER_URL for *projectid* and reads every ``<tr>`` of the
    first ``<tbody>`` in the response. The cell texts of a row are joined
    with ``_`` and the rows are joined with ``,``.

    Args:
        projectid: value substituted into the ``id`` query parameter.

    Returns:
        The joined teacher string, or an empty string when the response
        contains no ``<tbody>``.

    Raises:
        requests.RequestException: on timeout, connection or HTTP status error.
    """
    # Wrap the id in a tuple so %-formatting stays correct whatever its type.
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    resp = requests.get(TEACHER_URL % (projectid,), timeout=30)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')

    tbody = soup.select_one('tbody')
    if tbody is None:
        # No table body means no teachers listed for this project.
        return ''

    return ','.join(
        '_'.join(cell.text for cell in row.find_all('td'))
        for row in tbody.find_all('tr')
    )
# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if '__main__' == __name__:
    get_content()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment