Skip to content

Instantly share code, notes, and snippets.

@tomowarkar
Last active May 21, 2021 03:49
Show Gist options
  • Save tomowarkar/b74baedecf53712c2c4f34f252930664 to your computer and use it in GitHub Desktop.
Save tomowarkar/b74baedecf53712c2c4f34f252930664 to your computer and use it in GitHub Desktop.
プロ野球試合結果
from .extractor import CacheExtractor, Extractor
from .npb_games import BaseballExtractor
from .npb_games_detail import BaseballDetailsExtractor

Usage

git clone https://gist.github.com/b74baedecf53712c2c4f34f252930664.git extractor
from pprint import pprint
from extractor import BaseballExtractor, BaseballDetailsExtractor


pprint(BaseballExtractor("https://npb.jp/bis/2021/games/").extract())
pprint(BaseballDetailsExtractor("https://npb.jp/bis/2021/games/s2021041000947.html").extract())
import hashlib
import os
import re
import requests
from bs4 import BeautifulSoup
class Extractor:
    """Base class for page scrapers: download a URL, parse it, run ``_extract``.

    Subclasses set ``VALID_URL`` to a regular expression describing the URLs
    they handle and implement ``_extract``, which reads the parsed document
    from ``self._soup``.
    """

    # Placeholder pattern; subclasses override it with their own URL regex.
    VALID_URL = r"~"

    def __init__(self, url):
        """Remember the URL that ``extract`` will download."""
        self.url = url

    @classmethod
    def match(cls, url):
        """Return the ``re.match`` result of ``VALID_URL`` against *url*.

        ``re.match`` anchors at the start of the string only, so the result
        is a match object when *url* begins with the pattern, else ``None``.
        """
        return re.match(cls.VALID_URL, url)

    @staticmethod
    def get_webpage(url, timeout=30):
        """Download *url* with an HTTP GET and return the body as text.

        Args:
            url: Address to fetch.
            timeout: Seconds handed to ``requests.get``. ``requests`` applies
                no timeout unless one is given, so without it a stalled
                server would block the caller indefinitely.

        Returns:
            The response body decoded with ``bytes.decode()``'s default
            codec (UTF-8).
        """
        return requests.get(url, timeout=timeout).content.decode()

    def _extract(self):
        """Build the result from ``self._soup``; subclasses must override."""
        raise NotImplementedError

    def extract(self):
        """Download ``self.url``, parse it and return ``self._extract()``.

        The parsed document is stored on ``self._soup`` before ``_extract``
        is called.
        """
        content = self.get_webpage(self.url)
        self._soup = BeautifulSoup(content, "html.parser")
        return self._extract()
class CacheExtractor(Extractor):
    """Extractor that keeps each downloaded page in a local cache file.

    The cache file is named ``<md5 hex digest of the URL>.cache`` and is
    looked up by relative path, i.e. in the current working directory.
    """

    def extract(self):
        """Parse the page for ``self.url``, downloading it only on a cache miss.

        Returns:
            Whatever ``self._extract()`` returns for the parsed page.
        """
        # MD5 is used only to derive a filesystem-safe name from the URL.
        digest = hashlib.md5(self.url.encode()).hexdigest()
        cache = f"{digest}.cache"
        if os.path.exists(cache):
            # Pin the encoding: the platform default is not UTF-8 everywhere,
            # which would garble or reject the Japanese page text.
            with open(cache, "r", encoding="utf-8") as f:
                content = f.read()
        else:
            content = self.get_webpage(self.url)
            with open(cache, "w", encoding="utf-8") as f:
                f.write(content)
        self._soup = BeautifulSoup(content, "html.parser")
        return self._extract()
import os
import re
from .extractor import Extractor
class BaseballExtractor(Extractor):
    """Scrape an NPB game-list page into its date, prev/next links and results."""

    # Dots are escaped so that only the literal host "npb.jp" and the literal
    # ".html" suffix match. The trailing page name is optional, so the bare
    # ".../games/" index URL matches too.
    VALID_URL = r"https?://npb\.jp/bis/(?P<year>\d{4})/games/(gm\d{8}\.html)?"

    # One game per match inside the space-joined cell text of a game block:
    # home team, optional home score, "-", optional away score, away team,
    # then either "中止" (cancelled) or "<n>回戦" (n-th meeting), then the
    # venue text. Compiled once here instead of once per game block.
    _GAME_PATTERN = re.compile(
        r"(?P<home>[^\d\ ]+)\ (?P<home_score>\d{1,2})?\ -\ "
        r"(?P<away_score>\d{1,2})?\ (?P<away>[^\d\ ]+)\ "
        r"(?P<game>中止|\d{1,2}回戦)\ (?P<std>[^\d\ ]+)"
    )

    @staticmethod
    def default_response():
        """Return a fresh, empty result dictionary."""
        return {"date": "", "prev": "", "next": "", "result": []}

    def _extract(self):
        """Build the result dictionary from ``self._soup``.

        Returns:
            ``None`` when the page lacks ``div#gmdivmain`` or ``div#gmpvnext``.
            Otherwise a dict with:

            * ``date``: stripped text of ``div#gmdivtitle`` (``""`` if absent),
            * ``prev`` / ``next``: URLs built from the two navigation cells,
              or ``None`` for a cell that carries no link,
            * ``result``: one ``groupdict()`` per game matched by the game
              pattern; the score entries are ``None`` when the page shows
              no score.

        Raises:
            ValueError: if the navigation block does not contain exactly two
                ``td.gmpvnxlink`` cells.
        """
        response = self.default_response()

        container = self._soup.select_one("div#gmdivmain")
        if container is None:
            return None

        title = container.select_one("div#gmdivtitle")
        if title is not None:
            response["date"] = title.text.strip()

        navi = container.select_one("div#gmpvnext")
        if navi is None:
            return None

        # Navigation hrefs are joined onto the directory part of the URL.
        base = os.path.dirname(self.url)
        pr, nx = [
            f'{base}/{td.a.get("href")}' if td.a else None
            for td in navi.select("td.gmpvnxlink")
        ]
        response.update({"prev": pr, "next": nx})

        gmdivlist = container.select_one("div#gmdivlist")
        games = gmdivlist.select("div.contentsgame") if gmdivlist is not None else []
        for game in games:
            text = " ".join(td.text for td in game.select("td"))
            # finditer yields nothing when no game matches, so no None check
            # is needed.
            response["result"].extend(
                m.groupdict() for m in self._GAME_PATTERN.finditer(text)
            )
        return response
if __name__ == "__main__":
    from pprint import pprint

    # Manual smoke test. Other URLs worth trying by hand:
    #   https://npb.jp/bis/2021/games/                  latest page of the season
    #   https://npb.jp/bis/2021/games/gm20210412.html   a day with no games
    #   https://npb.jp/bis/2021/stats/                  invalid URL
    #   https://npb.jp/bis/2021/games/gm20211021.html   games not yet played
    #   https://npb.jp/bis/2019/games/gm20190604.html   interleague games
    #   https://npb.jp/bis/2020/games/gm20200630.html   cancelled games
    url = "https://npb.jp/bis/2021/games/gm20210408.html"

    # Only fetch when the URL has the shape this extractor understands.
    if BaseballExtractor.match(url) is not None:
        pprint(BaseballExtractor(url).extract())
# > output
#
# {'date': ' 2021年4月8日 (木)',
# 'next': 'https://npb.jp/bis/2021/games/gm20210409.html',
# 'prev': 'https://npb.jp/bis/2021/games/gm20210407.html',
# 'result': [{'away': '広島東洋',
# 'away_score': '7',
# 'game': '3回戦',
# 'home': '東京ヤクルト',
# 'home_score': '11',
# 'std': '神\u3000宮'},
# {'away': '横浜DeNA',
# 'away_score': '5',
# 'game': '3回戦',
# 'home': ' 中\u3000日',
# 'home_score': '2',
# 'std': 'バンテリンドーム'},
# {'away': '読\u3000売',
# 'away_score': '3',
# 'game': '3回戦',
# 'home': ' 阪\u3000神',
# 'home_score': '0',
# 'std': '甲子園'},
# {'away': '福岡ソフトバンク',
# 'away_score': '4',
# 'game': '3回戦',
# 'home': '北海道日本ハム',
# 'home_score': '2',
# 'std': '札幌ドーム'},
# {'away': '東北楽天',
# 'away_score': '4',
# 'game': '3回戦',
# 'home': ' 埼玉西武',
# 'home_score': '0',
# 'std': 'メットライフ'},
# {'away': 'オリックス',
# 'away_score': '5',
# 'game': '3回戦',
# 'home': ' 千葉ロッテ',
# 'home_score': '1',
# 'std': 'ZOZOマリン'}]}
from .extractor import Extractor
class BaseballDetailsExtractor(Extractor):
    """Scrape one NPB game page into per-team batter and pitcher tables."""

    # The dot in the host name is escaped so only the literal "npb.jp" matches.
    VALID_URL = r"https?://npb\.jp/bis/(?P<year>\d{4})/games/(s\d{8}\d+?\.html)"

    # Header rows that replace the first scraped row of each table.
    # Batting columns: at-bats, hits, RBI, walks, hit-by-pitch, strikeouts.
    _BATTER_HEADER = ("position", "name", "打数", "安打", "打点", "四球", "死球", "三振")
    # Pitching columns: innings pitched, a second innings column ("投回sub"),
    # batters faced, hits, walks, hit-by-pitch, strikeouts, earned runs.
    _PITCHER_HEADER = (
        "status",
        "name",
        "投回",
        "投回sub",
        "打者",
        "安打",
        "四球",
        "死球",
        "三振",
        "自責",
    )

    @staticmethod
    def default_response():
        """Return a fresh, empty result dictionary."""
        return {
            "date": "",
            "home": {"team": "", "batter": [], "pitcher": []},
            "away": {"team": "", "batter": [], "pitcher": []},
        }

    @staticmethod
    def _team_name(cell):
        """Return the stripped text of the ``td.gmtblteam`` element in *cell*."""
        return cell.select_one("td.gmtblteam").text.strip()

    @staticmethod
    def _stat_rows(cell, header):
        """Return the ``tr.gmstats`` rows of *cell* as lists of cell texts.

        The first scraped row is replaced by *header*. When the table has no
        rows at all, the result is the header row alone (the original
        ``rows[0] = ...`` assignment raised ``IndexError`` in that case).
        """
        rows = [
            [td.text for td in tr.select("td")]
            for tr in cell.select("tr.gmstats")
        ]
        return [list(header)] + rows[1:]

    def _extract(self):
        """Build the result dictionary from ``self._soup``.

        Returns:
            A dict with ``date`` (stripped text of ``div#gmdivtitle``) and,
            under ``away`` and ``home``, the team name plus ``batter`` and
            ``pitcher`` tables as lists of rows (header row first).

        Raises:
            AssertionError: if ``div#gmdivtbl`` does not contain exactly six
                ``td.gmcolorsub`` cells.
        """
        response = self.default_response()
        container = self._soup.select_one("div#gmdivtbl")
        title = self._soup.select_one("div#gmdivtitle")
        response.update({"date": title.text.strip()})

        table = container.select("td.gmcolorsub")
        # Raised explicitly rather than with ``assert`` so the check is not
        # stripped under ``python -O``; the exception type stays the same.
        if len(table) != 6:
            raise AssertionError(
                f"expected 6 td.gmcolorsub cells, found {len(table)}"
            )

        # Cells alternate away/home: even indices belong to the away team,
        # odd indices to the home team. Within each triple the order is
        # team name, batter table, pitcher table.
        for side, cells in zip(["away", "home"], [table[::2], table[1::2]]):
            response[side].update(
                {
                    "team": self._team_name(cells[0]),
                    "batter": self._stat_rows(cells[1], self._BATTER_HEADER),
                    "pitcher": self._stat_rows(cells[2], self._PITCHER_HEADER),
                }
            )
        return response
if __name__ == "__main__":
    from pprint import pprint

    # Manual smoke test: fetch one game page and pretty-print the parsed result.
    sample_url = "https://npb.jp/bis/2021/games/s2021041000947.html"
    details = BaseballDetailsExtractor(sample_url).extract()
    pprint(details)
# > output
#
# {'away': {'batter': [['position', 'name', '打数', '安打', '打点', '四球', '死球', '三振'],
# ['(中)', '近本', '5', '0', '0', '0', '0', '0'],
# ['(二)', '糸原', '4', '2', '0', '0', '0', '1'],
# ['遊三', '山本', '0', '0', '0', '0', '0', '0'],
# ['(一)', 'マルテ', '3', '0', '0', '1', '0', '1'],
# ['走右', '熊谷', '0', '0', '0', '0', '0', '0'],
# ['(三)一', '大山', '4', '1', '1', '0', '0', '1'],
# ['(左)', 'サンズ', '4', '1', '2', '0', '0', '1'],
# ['左', '板山', '0', '0', '0', '0', '0', '0'],
# ['(右)', '佐藤輝', '4', '1', '0', '0', '0', '1'],
# ['遊', '木浪', '0', '0', '0', '0', '0', '0'],
# ['(捕)', '梅野', '4', '2', '0', '0', '0', '1'],
# ['(遊)二', '中野', '4', '1', '1', '0', '0', '1'],
# ['(投)', '青柳', '1', '0', '0', '0', '0', '0'],
# ['打', '糸井', '1', '0', '0', '0', '0', '1'],
# ['投', '岩崎', '0', '0', '0', '0', '0', '0'],
# ['打', '原口', '1', '0', '0', '0', '0', '1'],
# ['投', 'スアレス', '0', '0', '0', '0', '0', '0']],
# 'pitcher': [['status',
# 'name',
# '投回',
# '投回sub',
# '打者',
# '安打',
# '四球',
# '死球',
# '三振',
# '自責'],
# ['○', '青柳', '7', '', '26', '3', '2', '0', '5', '0'],
# ['H', '岩崎', '1', '', '3', '0', '0', '0', '2', '0'],
# ['', 'スアレス', '1', '', '3', '0', '0', '0', '1', '0']],
# 'team': '阪\u3000神'},
# 'date': '2021年4月10日 (土)',
# 'home': {'batter': [['position', 'name', '打数', '安打', '打点', '四球', '死球', '三振'],
# ['(右)', '関根', '4', '1', '0', '0', '0', '2'],
# ['(遊)', '柴田', '3', '0', '0', '0', '0', '0'],
# ['打遊', '大和', '1', '0', '0', '0', '0', '0'],
# ['(一)', '牧', '4', '0', '0', '0', '0', '1'],
# ['(左)', '佐野', '3', '1', '0', '1', '0', '1'],
# ['(三)', '宮﨑', '4', '1', '0', '0', '0', '0'],
# ['(中)', '神里', '3', '0', '0', '0', '0', '1'],
# ['(二)', '田中俊', '2', '0', '0', '1', '0', '0'],
# ['(捕)', '戸柱', '3', '0', '0', '0', '0', '0'],
# ['(投)', '上茶谷', '2', '0', '0', '0', '0', '2'],
# ['投', '砂田', '0', '0', '0', '0', '0', '0'],
# ['打', '桑原', '1', '0', '0', '0', '0', '1'],
# ['投', '平田', '0', '0', '0', '0', '0', '0'],
# ['投', '池谷', '0', '0', '0', '0', '0', '0']],
# 'pitcher': [['status',
# 'name',
# '投回',
# '投回sub',
# '打者',
# '安打',
# '四球',
# '死球',
# '三振',
# '自責'],
# ['●', '上茶谷', '7', '', '26', '5', '0', '0', '5', '1'],
# ['', '砂田', '1', '', '3', '0', '0', '0', '2', '0'],
# ['', '平田', '0', '.1', '4', '2', '1', '0', '1', '3'],
# ['', '池谷', '0', '.2', '4', '1', '0', '0', '1', '0']],
# 'team': '横浜DeNA'}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment