Created
October 5, 2016 17:47
-
-
Save retrage/e2c503bd107ca73577b2d6cdf4f8d8bb to your computer and use it in GitHub Desktop.
Generate JSON from the Kokyo Higashi-Gyoen Hanadayori page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # -*- coding: utf-8 -*- | |
| from urllib import request | |
| from urllib import error | |
| from bs4 import BeautifulSoup | |
| import json | |
| from datetime import datetime | |
def fetch_data(url):
    """Download *url* and return its body decoded as UTF-8 text.

    The request is sent with a desktop Safari ``User-agent`` header.

    Args:
        url: Address of the resource to fetch.

    Returns:
        The decoded response body as ``str`` on success.  If opening the
        URL fails with ``urllib.error.URLError`` (which includes
        ``HTTPError``), the exception instance itself is returned rather
        than raised, so callers must check the type of the result.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50"
    encode = "utf-8"
    header = {"User-agent": user_agent}
    req = request.Request(url, None, header)
    try:
        res = request.urlopen(req)
    except error.URLError as e:
        # Callers expect the error object as the return value.
        return e
    # Read inside a context manager so the response is always closed,
    # even if reading or decoding raises.
    with res:
        return res.read().decode(encode)
def jsonize_data(html_data):
    """Parse the Hanadayori HTML page into a JSON-serialisable dict.

    Args:
        html_data: HTML markup of the page as a string.

    Returns:
        A dict with four keys:

        * ``"date"`` - ``.string`` of the first ``<p class="mb0">``.
        * ``"table"`` - list of rows; the first row holds the ``.string``
          of every ``<th>`` in the document, followed by one row per
          ``<tr>`` from the third ``<tr>`` onwards.
        * ``"description"`` - ``.string`` of the third ``<p>``.
        * ``"waka"`` - ``.string`` of the first ``<span>`` inside the
          first ``<div class="md-waka">``.

    Note:
        The lookups assume every element is present; no fallback is
        provided, so a page with a different layout will raise
        (presumably ``AttributeError`` / ``IndexError`` - confirm against
        the Beautiful Soup behaviour for missing matches).
    """
    document = BeautifulSoup(html_data, "html.parser")

    date_text = document.find("p", class_="mb0").string

    header_row = [th.string for th in document.find_all("th")]
    rows = [header_row]
    # The first two <tr> elements are skipped - presumably header rows
    # already covered by the <th> scan above; verify against the page.
    for tr in document.find_all("tr")[2:]:
        # Drop the bare newline text nodes that sit between the cells.
        rows.append([cell.string for cell in tr.children if cell != "\n"])

    paragraphs = document.find_all("p")
    description_text = paragraphs[2].string

    waka_block = document.find("div", class_="md-waka")
    waka_text = waka_block.find("span").string

    return {
        "date": date_text,
        "table": rows,
        "description": description_text,
        "waka": waka_text,
    }
def main():
    """Fetch the Hanadayori page and dump its parsed content to a JSON file.

    The output file is written to the current working directory and is
    named after the current local date (``YYYYMMDD.json``).

    Returns:
        ``0`` when the page was fetched and the file written; ``-1`` when
        ``fetch_data`` did not return a string (i.e. the download failed),
        in which case ``"Error"`` is printed.
    """
    source_url = "http://www.kunaicho.go.jp/event/hanadayori/hanadayori.html"
    page = fetch_data(source_url)

    if isinstance(page, str):
        payload = jsonize_data(page)
        out_name = "{0:%Y%m%d}.json".format(datetime.now())
        with open(out_name, "w") as out_file:
            json.dump(payload, out_file, sort_keys=False, indent=4)
        return 0

    # fetch_data hands back the exception object instead of text on failure.
    print("Error")
    return -1
if __name__ == "__main__":
    # Propagate main()'s status code (0 on success, -1 on failure) as the
    # process exit status instead of discarding it.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment