Last active
December 18, 2017 07:16
-
-
Save foriequal0/ae597fa1d8c262072c7174489e63e2c4 to your computer and use it in GitHub Desktop.
졸업프로젝트 크롤러
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import urllib3 | |
urllib3.disable_warnings() | |
# Base URL prepended to every request path; blank in this file
# (presumably to be filled in before running -- confirm).
HOST = ""

# Form fields POSTed to the login endpoint. The userid/passwd values are
# blank in this file.
LOGIN_PAYLOAD = dict(mode="login", userid="", passwd="")
def login(s):
    """POST the login form through session *s*.

    Args:
        s: Object exposing ``post(url, data=...)``; the script passes a
            ``requests.Session``.

    Returns:
        Whatever ``s.post`` returns (the login response), so the caller can
        inspect the outcome. Existing callers that ignore the return value
        are unaffected.
    """
    # NOTE(review): presumably the session keeps the auth cookie set by this
    # response for the later GET requests -- confirm against the site.
    return s.post(HOST + "/member/login", data=LOGIN_PAYLOAD)
def get_html(s, url):
    """Fetch *url* through session *s* and return the response body as text.

    Args:
        s: Object exposing ``get(url)`` that returns a response with
            ``status_code``, ``encoding`` and ``text`` attributes.
        url: Address to request.

    Returns:
        The response text when the status code is exactly 200, otherwise
        ``None``.
    """
    response = s.get(url)
    if response.status_code != 200:
        return None
    # Set the encoding before reading ``.text`` so the body is decoded as UTF-8.
    response.encoding = 'utf8'
    return response.text
def get_list(s, year, term):
    """Yield one dict per data row of the project listing for *year* / *term*.

    This is a generator: the page is fetched and parsed only once iteration
    starts.

    Args:
        s: Session object handed to ``get_html``.
        year: Value substituted into the ``year`` query parameter.
        term: Value substituted into the ``term`` query parameter.

    Yields:
        Dicts with keys ``year``, ``half``, ``id``, ``prof``, ``max`` and
        ``title``, read from table cells 1 through 6 of each row; ``year``,
        ``half`` and ``max`` are converted with ``int``.
    """
    listing_url = (HOST + "/intranet/lecture/project?year={year}&term={term}").format(
        year=year, term=term)
    page = BeautifulSoup(get_html(s, listing_url), 'html.parser')
    listing = page.select("table.tbs-01.acenter")[0]
    for tr in listing.find_all("tr"):
        cells = tr.find_all("td")
        if not cells:
            # Rows without any <td> (presumably header rows) carry no data.
            continue
        yield {
            "year": int(cells[1].text),
            "half": int(cells[2].text),
            "id": cells[3].text,
            "prof": cells[4].text,
            "max": int(cells[5].text),
            "title": cells[6].text,
        }
# Text of the single placeholder row the site renders for an empty table
# (Korean for "There is no registered data.").
_NO_ENTRIES_TEXT = "등록된 자료가 없습니다."


def _count_entries(rows):
    """Return the number of rows, counting a lone placeholder row as zero."""
    if len(rows) == 1 and _NO_ENTRIES_TEXT in rows[0].text:
        return 0
    return len(rows)


def get_detail(s, row):
    """Fetch the detail page of one project and merge its fields into *row*.

    Args:
        s: Session object handed to ``get_html``.
        row: Mapping with at least an ``"id"`` key (as yielded by
            ``get_list``); its entries are copied into the result.

    Returns:
        A new dict containing every entry of *row* plus ``scope``,
        ``require`` and ``body`` (cell texts from the page) and ``applied``
        and ``done`` (row counts of the first and second ``table.acenter``
        tables, with the placeholder row counted as zero).

    Raises:
        IndexError: If an expected table or row is missing from the page.
    """
    detail_url = (HOST + "/intranet/lecture/project?mode=view&pjtid={id}").format(
        id=row["id"])
    soup = BeautifulSoup(get_html(s, detail_url), 'html.parser')

    # Run each selector once and reuse the result.
    centered_tables = soup.select("table.acenter")
    applied_rows = centered_tables[0].find("tbody").find_all("tr")
    done_rows = centered_tables[1].find("tbody").find_all("tr")

    # Rows of the first <tbody> that is a direct child of a <table>; scope
    # and requirement are read from its rows at index 2 and 4.
    info_rows = soup.select("table > tbody")[0].find_all("tr")

    return {
        **row,
        "scope": info_rows[2].find("td").text,
        "require": info_rows[4].find("td").text,
        "body": soup.select("table.form_table3 > tbody > tr > td")[0].text,
        "applied": _count_entries(applied_rows),
        "done": _count_entries(done_rows),
    }
# (year, half) pairs to crawl, in the order they are visited.
terms = [
    (2018, 1),
    (2017, 2), (2017, 1),
    (2016, 2), (2016, 1),
    (2015, 2), (2015, 1),
    (2014, 2),
]

# Detail dicts of every crawled project, accumulated across all terms.
projects = []
with requests.Session() as session:
    # Certificate verification is switched off for all requests on this session.
    session.verify = False
    login(session)
    for year, half in terms:
        print(year, half)
        # One detail-page request per row of the term's listing.
        projects.extend(
            get_detail(session, row) for row in get_list(session, year, half))
print(len(projects))
# export as sqlite
import sqlite3
from contextlib import closing

# closing() guarantees the connection is closed even if a statement raises;
# previously the connection was never closed.
with closing(sqlite3.connect('grad_projects.sqlite')) as conn:
    c = conn.cursor()
    # Rebuild the table from scratch on every run.
    c.execute('''DROP TABLE IF EXISTS projects''')
    c.execute('''
CREATE TABLE projects
(id TEXT,
year INTEGER, half INTEGER,
prof TEXT, scope TEXT, title TEXT, required TEXT,
max INTEGER, applied INTEGER, done INTEGER,
body TEXT)
''')
    # Tuple order must match the column order of CREATE TABLE above; the
    # scraped "require" key is stored in the "required" column.
    c.executemany(
        'INSERT INTO projects VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
        [(p['id'], p['year'], p['half'],
          p['prof'], p['scope'], p['title'], p['require'],
          p['max'], p['applied'], p['done'],
          p['body']) for p in projects])
    conn.commit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment