Skip to content

Instantly share code, notes, and snippets.

@snakeneedy
Last active October 27, 2016 01:50
Show Gist options
  • Save snakeneedy/1527818d7a4f81a3e8341f6265a90b04 to your computer and use it in GitHub Desktop.
Save snakeneedy/1527818d7a4f81a3e8341f6265a90b04 to your computer and use it in GitHub Desktop.
Parse data from 'http://www.cwb.gov.tw/V7/observe/real/windAll.htm' into a list of objects named 'observeList', without printing anything.
#!/usr/bin/python3
# target: "County", "Station", "Wind direction", "Wind speed", "Gust" and "Time"
"""
1. load html into string
2. parse string into class array
3. convert class into json
"""
import re
import json
import urllib.request
class Wind:
    """One wind reading: direction, speed, gust and the time it belongs to.

    Values are stored exactly as passed in; the class does no conversion or
    validation of its own.
    """

    def __init__(self, Direction=None, Speed=None, Gust=None, Time=None):
        """Store the four fields; each defaults to None so it can be set later."""
        self.Direction = Direction
        self.Speed = Speed
        self.Gust = Gust
        self.Time = Time

    def __str__(self):
        """Return the reading as a single-line JSON object string.

        Every value is converted with str() and emitted as a JSON string, so
        an unset field is rendered as "None". Values are escaped by
        json.dumps, which keeps the result valid JSON even when a value
        contains a double quote or a backslash. Non-ASCII text is emitted
        as-is rather than as \\uXXXX escapes.
        """
        fields = (
            ("Direction", self.Direction),
            ("Speed", self.Speed),
            ("Gust", self.Gust),
            ("Time", self.Time),
        )
        body = ", ".join(
            '"{0}": {1}'.format(key, json.dumps(str(value), ensure_ascii=False))
            for key, value in fields
        )
        return "{" + body + "}"
class Observe:
    """Wind readings collected for one station.

    Holds the county name, the station name and a list of readings
    (WindList). Items of WindList only need to be convertible with str().
    """

    def __init__(self, County=None, Station=None, WindList=None):
        """Store the fields.

        WindList defaults to a new empty list for every instance. A literal
        ``[]`` default would be created once and shared by all instances, so
        appending to one station's list would change every other station's.
        A list passed in by the caller is stored by reference, not copied.
        """
        self.County = County
        self.Station = Station
        self.WindList = [] if WindList is None else WindList

    def __str__(self):
        """Return the station as a multi-line JSON object string.

        County and Station are converted with str() and JSON-escaped.
        WindList is rendered as "[item, item, ...]" using str() of each item,
        and as "[]" when the list is empty.
        """
        wind_list_str = "[" + ", ".join(str(wind) for wind in self.WindList) + "]"
        return '{{\n"County": {0},\n"Station": {1},\n"WindList": {2}\n}}'.format(
            json.dumps(str(self.County), ensure_ascii=False),
            json.dumps(str(self.Station), ensure_ascii=False),
            wind_list_str,
        )
### main
# Download the page, cut out the table, then turn every data row into an
# Observe object holding one Wind object per time column.
link = 'http://www.cwb.gov.tw/V7/observe/real/windAll.htm'
# Offline alternative for debugging: read a saved copy of the page instead.
# with open('source.html', 'r') as f:
#     content_html = f.read()
with urllib.request.urlopen(link) as f:
    content_html = f.read().decode('UTF-8')

# Greedy match: spans from the first "<table " to the last "</table>".
table_match = re.search(r"<table (.|\n|\r)*<\/table>", content_html)
if table_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError("no <table> element found in {0}".format(link))
content_table = table_match.group(0)

# findall returns one tuple per row; element [0] is the full "<tr...</tr>" text.
content_tr_ary = re.findall(r"(<tr((((?!tr).)|\n|\r)*)<\/tr>)", content_table)
if not content_tr_ary:
    raise ValueError("no <tr> rows found in the table from {0}".format(link))

# get time_ary from content_tr_ary[0][0]
# The first two and the last three <th> cells of the first row are skipped;
# the remaining ones are used as the time labels.
# NOTE(review): presumably the skipped cells are the county/station headers
# and trailing summary columns -- confirm against the live page.
time_ary = []
content_tr_time = content_tr_ary[0][0]
content_time_ary = re.findall(r"(<th((((?!th).)|\n|\r)*)<\/th>)", content_tr_time)
for content_time_one in content_time_ary[2:-3]:
    time_ary.append(
        re.search(r"<th[^>]*>((((?!th).)|\n|\r)*)<\/th>", content_time_one[0]).group(1)
    )

# get observeList from content_tr_ary[2+][0]
# The first two rows are skipped; every later row becomes one Observe.
observeList = []
for content_tr_one in content_tr_ary[2:]:
    content_data_ary = re.findall(r"(<td((((?!td).)|\n|\r)*)<\/td>)", content_tr_one[0])
    len_content_data_ary = len(content_data_ary)
    if len_content_data_ary < 2:
        # No county/station cells in this row: nothing to record.
        continue

    # Pass a fresh list explicitly so stations never share one WindList,
    # whatever default the Observe constructor uses.
    observe = Observe(WindList=[])
    observe.County = re.search(
        r"<td[^>]*>((((?!td).)|\n|\r)*)<\/td>", content_data_ary[0][0]
    ).group(1)
    observe.Station = re.search(
        r"<td[^>]*>((((?!td).)|\n|\r)*)<\/td>", content_data_ary[1][0]
    ).group(1)

    wind = Wind()
    ## Direction, Speed, Gust
    # Cells from index 2 on come in groups of three (direction, speed, gust),
    # one group per entry of time_ary. The last 3*3 cells are not read.
    time_ary_index = 0
    for index in range(2, len_content_data_ary - 3*3):
        cell_html = content_data_ary[index][0]
        if index % 3 == 2:
            # Prefer the title='...' attribute; fall back to the plain cell text.
            result = re.search(r"title='([^']*)'", cell_html)
            if result is None:
                result = re.search(r"<td[^>]*>([^<]*)<\/td>", cell_html)
            if result is None:
                # Same handling as the speed/gust branches: stop reading this row.
                break
            wind.Direction = result.group(1)
        elif index % 3 == 0:
            result = re.search(r"<td[^>]*>(((?!td).)*)<\/td>", cell_html)
            if result is None:
                break
            wind.Speed = result.group(1)
        elif index % 3 == 1:
            result = re.search(r"<td[^>]*>(((?!td).)*)<\/td>", cell_html)
            if result is None:
                break
            wind.Gust = result.group(1)
            # The gust cell closes the group: stamp the time and store the reading.
            wind.Time = time_ary[time_ary_index]
            observe.WindList.append(wind)
            wind = Wind()  # start a new Wind object for the next group
            time_ary_index = time_ary_index + 1
    observeList.append(observe)
### print all data
# print('{')
# print('"result": [')
# for index in range(len(observeList)):
# if index + 1 < len(observeList):
# print('{0}'.format(observeList[index]), end=',\n')
# else:
# print('{0}'.format(observeList[index]))
# print(']}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment