hapo31 · August 16, 2017 01:21
diff --git a/wikipedia.py b/wikipedia.py
 # -*- coding: utf-8 -*-
 import requests
 import lxml.html
 import re
 import codecs

 # Required third-party libraries:
 # requests, lxml, cssselect

 def main():
     """Scrape recently changed pages from Japanese Wikipedia.

     Fetches the Special:RecentChanges listing of ja.wikipedia.org (up to 100
     entries from the last 7 days) and writes two UTF-8 files in the current
     working directory:

     * ``titles.csv`` -- one page title per line, with a fixed set of
       punctuation characters replaced by ``-``.
     * ``bodies.csv`` -- for each linked page, the concatenated text of the
       ``<p>`` elements directly under ``.mw-parser-output``.

     Pages that yield no paragraph text are skipped, so ``bodies.csv`` may
     contain fewer entries than ``titles.csv``.
     """
     base = "https://ja.wikipedia.org"
     url = base + "/w/index.php?title=%E7%89%B9%E5%88%A5:%E6%9C%80%E8%BF%91%E3%81%AE%E6%9B%B4%E6%96%B0&limit=100&days=7"

     # Seconds to wait per request; without a timeout a stalled connection
     # would block the script forever.
     timeout = 30

     # Characters replaced with "-" in titles. Written as a raw string so the
     # backslash reaches the regex engine instead of being an invalid string
     # escape; compiled once because it is applied to every title.
     unsafe_chars = re.compile(r"""[:;()~'&%$#"!/\,.]""")

     # A single session reuses the HTTP connection across all page fetches.
     with requests.Session() as session:
         html = session.get(url, timeout=timeout).text
         root = lxml.html.fromstring(html)

         links = root.cssselect(".mw-title > a")
         urls = [base + link.attrib['href'] for link in links]

         # Explicit UTF-8: the titles and bodies are Japanese text, which the
         # platform default encoding may be unable to represent.
         with open("titles.csv", "w", encoding="utf-8") as f:
             for link in links:
                 f.write(unsafe_chars.sub("-", link.text_content()) + '\n')

         with open("bodies.csv", "w", encoding="utf-8") as f:
             for wikipage in urls:
                 page_html = session.get(wikipage, timeout=timeout).text
                 content = lxml.html.fromstring(page_html)
                 paragraphs = content.cssselect(".mw-parser-output > p")
                 body = "".join(p.text_content() for p in paragraphs)
                 if body:
                     f.write(body + '\n')

 # Entry point: run the scraper only when this file is executed directly,
 # not when it is imported as a module.
 if __name__ == "__main__":
     main()
	# -*- coding: utf-8 -*-
	import requests
	import lxml.html
	import re
	import codecs

	# Required third-party libraries:
	# requests, lxml, cssselect

	def main():
	    """Scrape recently changed pages from Japanese Wikipedia.

	    Fetches the Special:RecentChanges listing of ja.wikipedia.org (up to
	    100 entries from the last 7 days) and writes two UTF-8 files in the
	    current working directory:

	    * ``titles.csv`` -- one page title per line, with a fixed set of
	      punctuation characters replaced by ``-``.
	    * ``bodies.csv`` -- for each linked page, the concatenated text of the
	      ``<p>`` elements directly under ``.mw-parser-output``.

	    Pages that yield no paragraph text are skipped, so ``bodies.csv`` may
	    contain fewer entries than ``titles.csv``.
	    """
	    base = "https://ja.wikipedia.org"
	    url = base + "/w/index.php?title=%E7%89%B9%E5%88%A5:%E6%9C%80%E8%BF%91%E3%81%AE%E6%9B%B4%E6%96%B0&limit=100&days=7"

	    # Seconds to wait per request; without a timeout a stalled connection
	    # would block the script forever.
	    timeout = 30

	    # Characters replaced with "-" in titles. Written as a raw string so
	    # the backslash reaches the regex engine instead of being an invalid
	    # string escape; compiled once because it is applied to every title.
	    unsafe_chars = re.compile(r"""[:;()~'&%$#"!/\,.]""")

	    # A single session reuses the HTTP connection across all page fetches.
	    with requests.Session() as session:
	        html = session.get(url, timeout=timeout).text
	        root = lxml.html.fromstring(html)

	        links = root.cssselect(".mw-title > a")
	        urls = [base + link.attrib['href'] for link in links]

	        # Explicit UTF-8: the titles and bodies are Japanese text, which
	        # the platform default encoding may be unable to represent.
	        with open("titles.csv", "w", encoding="utf-8") as f:
	            for link in links:
	                f.write(unsafe_chars.sub("-", link.text_content()) + '\n')

	        with open("bodies.csv", "w", encoding="utf-8") as f:
	            for wikipage in urls:
	                page_html = session.get(wikipage, timeout=timeout).text
	                content = lxml.html.fromstring(page_html)
	                paragraphs = content.cssselect(".mw-parser-output > p")
	                body = "".join(p.text_content() for p in paragraphs)
	                if body:
	                    f.write(body + '\n')

	# Entry point: run the scraper only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == '__main__':
	    main()