Skip to content

Instantly share code, notes, and snippets.

@hapo31
Created August 16, 2017 01:21
Show Gist options
  • Save hapo31/f9ba7035b1bf70a52cc49a86c3bdc615 to your computer and use it in GitHub Desktop.
Save hapo31/f9ba7035b1bf70a52cc49a86c3bdc615 to your computer and use it in GitHub Desktop.
Wikipediaの最近更新された項目から記事タイトルと本文を抽出し、行区切りで出力する
# -*- coding: utf-8 -*-
import requests
import lxml.html
import re
import codecs
# Required third-party libraries:
# requests, lxml, cssselect
def main():
    """Dump titles and body text of recently changed ja.wikipedia pages.

    Fetches the Japanese Wikipedia "recent changes" special page (the URL
    asks for up to 100 entries from the last 7 days) and then:

    * writes the text of every ``.mw-title > a`` link to ``titles.csv``,
      one per line, with each of the characters ``: ; ( ) ~ ' & % $ # " !
      / , .`` replaced by ``-``;
    * downloads every linked page and writes the concatenated text of its
      ``.mw-parser-output > p`` paragraphs to ``bodies.csv``, followed by
      a newline.

    Pages that yield no paragraph text are skipped, so the two files are
    not guaranteed to stay line-aligned. Paragraph text is written as
    returned by lxml, including any newlines it contains.

    Both files are written as UTF-8 regardless of the platform's default
    encoding, and are created in the current working directory.

    Raises:
        requests.RequestException: if the recent-changes page cannot be
            fetched or returns an HTTP error status, or if any request
            fails at the network level or times out.
    """
    base = "https://ja.wikipedia.org"
    url = base + "/w/index.php?title=%E7%89%B9%E5%88%A5:%E6%9C%80%E8%BF%91%E3%81%AE%E6%9B%B4%E6%96%B0&limit=100&days=7"
    # requests never times out by default; bound every call so one stalled
    # connection cannot hang the whole run.
    timeout = 30  # seconds

    # Raw string: "\," is an invalid escape in a normal str literal. The
    # pattern itself is unchanged (inside the class, "\," is a literal comma).
    # Compiled once instead of being looked up for every title.
    punctuation = re.compile(r"""[:;()~'&%$#"!/\,.]""")

    response = requests.get(url, timeout=timeout)
    # Without the index page there is nothing to scrape; fail loudly rather
    # than parsing an error page and silently producing empty files.
    response.raise_for_status()
    root = lxml.html.fromstring(response.text)
    links = root.cssselect(".mw-title > a")

    urls = []
    # Explicit UTF-8: titles are Japanese text, and the locale's default
    # encoding may be unable to represent them.
    with open("titles.csv", "w", encoding="utf-8") as f:
        for link in links:
            # hrefs are used as site-relative paths, so prefix the origin.
            urls.append(base + link.attrib['href'])
            f.write(punctuation.sub("-", link.text_content()) + '\n')

    with open("bodies.csv", "w", encoding="utf-8") as f:
        for wikipage in urls:
            # Best effort, as before: the status code of individual pages
            # is not checked; whatever HTML comes back is parsed.
            page = lxml.html.fromstring(
                requests.get(wikipage, timeout=timeout).text
            )
            paragraphs = page.cssselect(".mw-parser-output > p")
            body = "".join(p.text_content() for p in paragraphs)
            if body:
                f.write(body + '\n')
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment