rschiang · September 15, 2016 02:34
diff --git a/get_schools.py b/get_schools.py
 #!/usr/bin/env python3
 import requests
 import re

 def fetch(url):
    """Download *url* and return the response body as text.

    A url that starts with '/' is treated as site-relative and gets the
    'http://www.oia.ntu.edu.tw' host prepended before the request is made.
    """
    target = 'http://www.oia.ntu.edu.tw' + url if url.startswith('/') else url
    response = requests.get(target)
    # Set the encoding explicitly so .text is decoded as UTF-8.
    response.encoding = 'utf8'
    return response.text

 # Fetch the school list, visit every linked school page, and print one line
 # per school: its title and its flattened "申請資格" text.
 list_text = fetch('http://www.oia.ntu.edu.tw/ch/outgoing/school.list')
 # Collect the href of every "申請資料" link found in the list page.
 schools = re.findall(r'href="([^"]+)" target="_blank"\>申請資料\</a\>', list_text)
 # (pattern, replacement) pairs applied in order to the extracted HTML fragment.
 filters = [
    (r'\<[^>]+\>', ''),       # drop HTML tags
    (r'(&nbsp;|\n)', ' '),    # &nbsp; entities and newlines become spaces
    (r'\s+', ' '),            # collapse runs of whitespace
    # Optional filters, disabled by default; uncomment to abbreviate the text.
    #(r'[碩博][一二三四五六]/?', '+'),      # Additional filters
    #(r'\++', '+'),
    #(r'.*?不接受.*法律學院.*', 'X'),
    #(r'.*?歷年不得有不及格科目.*', 'S'),
    #(r'(.)語組(.*)一般組', r'\1語組⋯⋯一般組'),
 ]

 for school_url in schools:
    school_text = fetch(school_url)
    title_match = re.search(r"\<h1 class='forms_title'>([^\<]+)", school_text)
    # Fall back to the URL so a page without the title heading does not
    # abort the whole run with an AttributeError.
    title = title_match.group(1).strip() if title_match else school_url
    # Capture everything between the "申請資格" header cell and the "名額" header cell.
    cond = re.search(r"申請資格\</th\>(.+)\<th style='width:90px;'\>名額", school_text, re.DOTALL)
    if cond:
        cond_text = cond.group(1).strip()
        for pattern, repl in filters:
            cond_text = re.sub(pattern, repl, cond_text)

        #if '語組' in cond_text and not '一般組' in cond_text and not '中語組' in cond_text:
        #    cond_text = 'L'
    else:
        # Assign the variable that is actually printed; binding `cond` here
        # left cond_text undefined or stale from the previous school.
        cond_text = '(N/A)'
    print(title, '\t', cond_text)
	#!/usr/bin/env python3
	import requests
	import re

	def fetch(url):
	    """Download *url* and return the response body as text.

	    A url that starts with '/' is treated as site-relative and gets the
	    'http://www.oia.ntu.edu.tw' host prepended before the request is made.
	    """
	    if url.startswith('/'):
	        url = 'http://www.oia.ntu.edu.tw' + url
	    response = requests.get(url)
	    # Set the encoding explicitly so .text is decoded as UTF-8.
	    response.encoding = 'utf8'
	    return response.text

	# Fetch the school list, visit every linked school page, and print one line
	# per school: its title and its flattened "申請資格" text.
	list_text = fetch('http://www.oia.ntu.edu.tw/ch/outgoing/school.list')
	# Collect the href of every "申請資料" link found in the list page.
	schools = re.findall(r'href="([^"]+)" target="_blank"\>申請資料\</a\>', list_text)
	# (pattern, replacement) pairs applied in order to the extracted HTML fragment.
	filters = [
	    (r'\<[^>]+\>', ''),       # drop HTML tags
	    # The entity must be spelled out; the rendered form of this pattern
	    # only matched the literal sequence space, pipe, newline.
	    (r'(&nbsp;|\n)', ' '),
	    (r'\s+', ' '),            # collapse runs of whitespace
	    # Optional filters, disabled by default; uncomment to abbreviate the text.
	    #(r'[碩博][一二三四五六]/?', '+'), # Additional filters
	    #(r'\++', '+'),
	    #(r'.*?不接受.*法律學院.*', 'X'),
	    #(r'.*?歷年不得有不及格科目.*', 'S'),
	    #(r'(.)語組(.*)一般組', r'\1語組⋯⋯一般組'),
	]

	for school_url in schools:
	    school_text = fetch(school_url)
	    title_match = re.search(r"\<h1 class='forms_title'>([^\<]+)", school_text)
	    # Fall back to the URL so a page without the title heading does not
	    # abort the whole run with an AttributeError.
	    title = title_match.group(1).strip() if title_match else school_url
	    # Capture everything between the "申請資格" header cell and the "名額" header cell.
	    cond = re.search(r"申請資格\</th\>(.+)\<th style='width:90px;'\>名額", school_text, re.DOTALL)
	    if cond:
	        cond_text = cond.group(1).strip()
	        for pattern, repl in filters:
	            cond_text = re.sub(pattern, repl, cond_text)

	        #if '語組' in cond_text and not '一般組' in cond_text and not '中語組' in cond_text:
	        #    cond_text = 'L'
	    else:
	        # Assign the variable that is actually printed; binding `cond` here
	        # left cond_text undefined or stale from the previous school.
	        cond_text = '(N/A)'
	    print(title, '\t', cond_text)