Created
August 16, 2017 01:21
-
-
Save hapo31/f9ba7035b1bf70a52cc49a86c3bdc615 to your computer and use it in GitHub Desktop.
Wikipediaの最近更新された項目から記事タイトルと本文を抽出し、行区切りで出力する
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import requests | |
| import lxml.html | |
| import re | |
| import codecs | |
| # required libraries: | |
| # requests, lxml, cssselect | |
def main():
    """Dump titles and body text of recently changed Japanese Wikipedia pages.

    Fetches the recent-changes special page of ja.wikipedia.org (the query
    asks for up to 100 entries from the last 7 days), then:

    * writes one sanitized link title per line to ``titles.csv``;
    * downloads every linked page and writes the concatenated text of its
      ``.mw-parser-output > p`` paragraphs, followed by a newline, to
      ``bodies.csv``.

    Pages that yield no paragraph text are skipped, so the two files are not
    guaranteed to have the same number of records. Both files are written in
    the current working directory and are overwritten on every run.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            the timeout.
        KeyError: if a matched anchor has no ``href`` attribute.
    """
    base = "https://ja.wikipedia.org"
    # The ``title`` parameter is the URL-encoded name of the Japanese
    # "recent changes" special page.
    url = base + "/w/index.php?title=%E7%89%B9%E5%88%A5:%E6%9C%80%E8%BF%91%E3%81%AE%E6%9B%B4%E6%96%B0&limit=100&days=7"
    # Without a timeout a stalled connection would block the script forever.
    timeout = 30  # seconds
    # Raw string: the pattern contains ``\,`` which is an invalid escape in a
    # normal string literal. Compiled once because it is reused per title.
    unsafe_chars = re.compile(r"""[:;()~'&%$#"!/\,.]""")

    html = requests.get(url, timeout=timeout).text
    root = lxml.html.fromstring(html)
    links = root.cssselect(".mw-title > a")

    urls = []
    # Explicit UTF-8: the titles are Japanese text and the platform default
    # encoding may be unable to represent them.
    with open("titles.csv", "w", encoding="utf-8") as f:
        for link in links:
            urls.append(base + link.attrib["href"])
            title = unsafe_chars.sub("-", link.text_content())
            f.write(title + "\n")

    with open("bodies.csv", "w", encoding="utf-8") as f:
        for wikipage in urls:
            page_html = requests.get(wikipage, timeout=timeout).text
            content = lxml.html.fromstring(page_html)
            paragraphs = content.cssselect(".mw-parser-output > p")
            body = "".join(p.text_content() for p in paragraphs)
            # Skip pages without any paragraph text instead of writing a
            # blank record.
            if body:
                f.write(body + "\n")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment