Skip to content

Instantly share code, notes, and snippets.

@hulucc
Created March 27, 2020 06:17
Show Gist options
  • Save hulucc/7fbd63b69644fd4757e845a2a92f8da0 to your computer and use it in GitHub Desktop.
Save hulucc/7fbd63b69644fd4757e845a2a92f8da0 to your computer and use it in GitHub Desktop.
import urllib.request
import pprint
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect table rows from ``<tr class="alt">`` elements.

    Each matching row becomes a dict appended to ``self.table``. The
    attribute-less ``<td>`` cells of a row are mapped, in order, to the keys
    ``order``, ``name``, ``province`` and ``score``; ``<td>`` tags that carry
    any attribute are ignored.
    """

    # Keys assigned, in order, to the attribute-less <td> cells of a row.
    _COLUMNS = ("order", "name", "province", "score")

    def __init__(self):
        self.column = 0    # index into _COLUMNS of the next plain cell
        self.value = None  # unused by the parser; kept for compatibility
        self.key = None    # key awaiting text, or None when no cell is open
        self.data = None   # dict of the row being filled, None before any row
        self.table = []    # all rows collected so far
        super().__init__()

    def handle_starttag(self, tag, attrs):
        """Open a new row on ``<tr class="alt">`` or a new cell on a bare ``<td>``."""
        if tag == "tr" and dict(attrs).get('class') == 'alt':
            self.data = {}
            self.table.append(self.data)
            # Restart the column count so a short previous row cannot shift
            # the keys of this one.
            self.column = 0
            self.key = None
            return
        # Cells seen before the first matching row have no dict to fill.
        if tag == "td" and not attrs and self.data is not None:
            self.key = self._COLUMNS[self.column]
            self.column = (self.column + 1) % len(self._COLUMNS)

    def handle_endtag(self, tag):
        """Forget the pending key once its cell closes."""
        # An empty cell must not capture text that appears after it.
        if tag == "td":
            self.key = None

    def handle_data(self, data):
        """Store the first non-blank text of the open cell under its key."""
        if self.key is None or self.data is None:
            return
        text = data.strip()
        if not text:
            # Whitespace between a <td> and a nested tag is not the value.
            return
        self.data[self.key] = text
        self.key = None
def main(province='广东'):
    """Download the ranking page and pretty-print the rows of one province.

    Args:
        province: Rows whose ``province`` value equals this string are
            printed. Defaults to '广东'.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    # The context manager closes the connection even if reading fails, and
    # the timeout keeps an unresponsive server from hanging the script.
    with urllib.request.urlopen(url, timeout=30) as response:
        html = response.read().decode('utf8')
    parser = MyHTMLParser()
    parser.feed(html)
    # Flush any text the parser is still buffering.
    parser.close()
    result = [row for row in parser.table if row.get('province') == province]
    pprint.pprint(result)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment