Skip to content

Instantly share code, notes, and snippets.

@rschiang
Last active September 15, 2016 02:34
Show Gist options
  • Save rschiang/02d620b063b5309db4bfcf799b2717c5 to your computer and use it in GitHub Desktop.
Save rschiang/02d620b063b5309db4bfcf799b2717c5 to your computer and use it in GitHub Desktop.
交換資料整理
#!/usr/bin/env python3
import requests
import re
def fetch(url, timeout=30):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Absolute URL, or a site-relative path starting with '/',
            which is resolved against http://www.oia.ntu.edu.tw.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the server does not answer within ``timeout`` seconds.
    """
    if url.startswith('/'):
        url = 'http://www.oia.ntu.edu.tw' + url
    # Without an explicit timeout, requests waits forever on a stalled
    # connection, which would hang the whole scrape on one bad page.
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding rather than trusting the encoding requests
    # derives from the response headers.
    response.encoding = 'utf8'
    return response.text
# Download the school index page and collect the href of every link whose
# text is "申請資料" (application information); each one is a school page.
list_text = fetch('http://www.oia.ntu.edu.tw/ch/outgoing/school.list')
schools = re.findall(r'href="([^"]+)" target="_blank"\>申請資料\</a\>', list_text)

# (pattern, replacement) pairs applied in order to the raw HTML of the
# eligibility section to reduce it to a single line of plain text.
filters = [
    (r'\<[^>]+\>', ''),  # drop HTML tags
    (r'(&nbsp;|\n)', ' '),  # &nbsp; entities and newlines become spaces
    (r'\s+', ' '),  # collapse runs of whitespace
    # Optional filters, disabled by default; uncomment to condense the
    # output further.
    #(r'[碩博][一二三四五六]/?', '+'), # Additional filters
    #(r'\++', '+'),
    #(r'.*?不接受.*法律學院.*', 'X'),
    #(r'.*?歷年不得有不及格科目.*', 'S'),
    #(r'(.)語組(.*)一般組', r'\1語組⋯⋯一般組'),
]

# Print one "title <TAB> eligibility" line per school.
for school_url in schools:
    school_text = fetch(school_url)

    # A page without the expected heading gets a placeholder title
    # instead of crashing on ``None.group``.
    title_match = re.search(r"\<h1 class='forms_title'>([^\<]+)", school_text)
    title = title_match.group(1).strip() if title_match else '(N/A)'

    # Capture everything between the "申請資格" (eligibility) header cell
    # and the following "名額" (quota) header cell.
    cond = re.search(r"申請資格\</th\>(.+)\<th style='width:90px;'\>名額", school_text, re.DOTALL)
    if cond:
        cond_text = cond.group(1).strip()
        for pattern, repl in filters:
            cond_text = re.sub(pattern, repl, cond_text)
        #if '語組' in cond_text and not '一般組' in cond_text and not '中語組' in cond_text:
        #    cond_text = 'L'
    else:
        # Assign the variable that is actually printed below; assigning
        # ``cond`` here left ``cond_text`` undefined or stale from the
        # previous school.
        cond_text = '(N/A)'
    print(title, '\t', cond_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment