Skip to content

Instantly share code, notes, and snippets.

@YoshihitoAso
Created February 27, 2014 11:31
Show Gist options
  • Save YoshihitoAso/9248476 to your computer and use it in GitHub Desktop.
Save YoshihitoAso/9248476 to your computer and use it in GitHub Desktop.
[Python][PyQuery]pyqueryでスクレイピングを行うサンプル(毎日新聞のアーカイブ)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyquery import PyQuery
import json
# Scrape the Mainichi business archive page: collect every link that points
# at a news article, fetch each article, and append its body text to
# mainichi.txt as one JSON object per line.
q = PyQuery(url='http://mainichi.jp/select/biz/archive/')
url = 'http://mainichi.jp/select/news/'

# Collect the href of every anchor whose target contains the article prefix.
links = []
for elem in q.find('a'):
    href = PyQuery(elem).attr('href')
    # Anchors without an href yield None. Test the attribute directly rather
    # than through str(): that turned None into the string 'None' and, under
    # Python 2, can raise UnicodeEncodeError for a non-ASCII href.
    if href and url in href:
        links.append(href)

for link in links:
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(link)
    q3 = PyQuery(url=link)

    # The article text is the concatenation (no separator) of the text of
    # every <p> found inside each div.NewsBody element of the page.
    paragraphs = []
    for news_body in q3.find("div.NewsBody"):
        for pelem in PyQuery(news_body).find('p'):
            paragraphs.append(PyQuery(pelem).text())
    text = "".join(paragraphs)

    article = {"url": link, "text": text}
    print(article)

    # The file is reopened in append mode for each article, so it is closed
    # (and therefore flushed) before the next page is fetched.
    with open('mainichi.txt', 'a') as outfile:
        json.dump(article, outfile)
        # Terminate each record with a newline so the file can be read back
        # one JSON object per line instead of as one undelimited run.
        outfile.write("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment