Last active
September 15, 2016 02:34
-
-
Save rschiang/02d620b063b5309db4bfcf799b2717c5 to your computer and use it in GitHub Desktop.
交換資料整理
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests | |
import re | |
def fetch(url, timeout=30):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: An absolute URL, or a site-relative path beginning with '/',
            which is prefixed with 'http://www.oia.ntu.edu.tw'.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block indefinitely on a stalled
            connection and hang the whole scrape.

    Returns:
        The response body as a string.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout expires.
    """
    if url.startswith('/'):
        url = 'http://www.oia.ntu.edu.tw' + url
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding rather than relying on the charset that
    # requests infers from the response headers.
    response.encoding = 'utf8'
    return response.text
# Download the school index and collect the target of every link whose
# text is "申請資料" (application information).
list_text = fetch('http://www.oia.ntu.edu.tw/ch/outgoing/school.list')
schools = re.findall(r'href="([^"]+)" target="_blank"\>申請資料\</a\>', list_text)

# (pattern, replacement) pairs applied in order to the raw eligibility HTML.
filters = [
    (r'\<[^>]+\>', ''),     # strip HTML tags
    # The whitespace character in the original literal is spelled out here
    # as both a plain space and a non-breaking space, so no invisible
    # character is left in the source.
    (r'( |\xa0|\n)', ' '),
    (r'\s+', ' '),          # collapse runs of whitespace
    # Optional extra filters, disabled by default. They abbreviate common
    # phrases (graduate year markers, "College of Law not accepted",
    # "no failed courses", language group vs. general group).
    #(r'[碩博][一二三四五六]/?', '+'), # Additional filters
    #(r'\++', '+'),
    #(r'.*?不接受.*法律學院.*', 'X'),
    #(r'.*?歷年不得有不及格科目.*', 'S'),
    #(r'(.)語組(.*)一般組', r'\1語組⋯⋯一般組'),
]

# Print one tab-separated line per school: title, then cleaned eligibility.
for school_url in schools:
    school_text = fetch(school_url)

    title_match = re.search(r"\<h1 class='forms_title'>([^\<]+)", school_text)
    # Fall back to the URL so a page without the expected heading stays
    # identifiable in the output instead of aborting the run.
    title = title_match.group(1).strip() if title_match else school_url

    # Everything between the "申請資格" (eligibility) header cell and the
    # "名額" (quota) header cell; DOTALL lets '.' span multiple lines.
    cond = re.search(r"申請資格\</th\>(.+)\<th style='width:90px;'\>名額", school_text, re.DOTALL)
    if cond:
        cond_text = cond.group(1).strip()
        for pattern, repl in filters:
            cond_text = re.sub(pattern, repl, cond_text)
        # Optional: collapse entries that only mention a language group.
        #if '語組' in cond_text and not '一般組' in cond_text and not '中語組' in cond_text:
        #    cond_text = 'L'
    else:
        # Assign the variable that is actually printed. The original set
        # `cond` here, leaving `cond_text` undefined or stale.
        cond_text = '(N/A)'
    print(title, '\t', cond_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment