Created
October 1, 2012 07:05
-
-
Save ikegami-yukino/3810017 to your computer and use it in GitHub Desktop.
複数サイトから注目キーワードを取得する
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import io
import os
import re
import urllib

from BeautifulSoup import BeautifulSoup
# Feed URLs polled for trending keywords; each one is passed to
# get_rss_keywords() by main().
urls = (
    'http://search.biglobe.ne.jp/rss/ranking.xml',
    'http://trackword.rssfeed.cc/index.xml',
    'http://www.jtb.co.jp/ranking/keyword/rss.aspx',
    'http://www.nilab.info/buzztube/buzztube.xml',
    'http://ranking.goo.ne.jp/rss/keyword/keyrank_all1/index.rdf',
    'http://searchranking.yahoo.co.jp/rss/burst_ranking-rss.xml',
    'http://d.hatena.ne.jp/hotkeyword?mode=rss',
    'http://www.google.co.jp/trends/hottrends/atom/feed?pn=p4',
)

# NAVER topic-word ranking page; main() fetches it as raw text and scans it
# with the `naver_topics` pattern instead of parsing it as a feed.
naver_url = 'http://topicwords.naver.jp/ranking'

# Directory containing this script, with a trailing slash so file names can
# be appended directly.
wdir = os.path.abspath(os.path.dirname(__file__)) + '/'

# Output file that main() appends the collected keywords to.
filename = wdir + 'trend.txt'

# Entity table handed to BeautifulSoup's `convertEntities` option in openURL().
html_entities = BeautifulSoup.HTML_ENTITIES

# Text enclosed in corner brackets; the captured group is kept as the keyword.
# NOTE(review): `.*` is greedy, so a title with several bracket pairs yields
# one capture spanning from the first opening to the last closing bracket.
kagikakko = re.compile(u'「(.*)」')

# Text enclosed in lenticular brackets; removed from titles before extraction.
sumikakko = re.compile(u'【.*】')

# Captures the value of the `q` parameter in `topics?q=...` links.
# Raw string so that `\?` reaches the regex engine without relying on
# Python passing an unrecognised escape sequence through unchanged.
naver_topics = re.compile(r'topics\?q=([^&]+)')

# Module-level accumulator shared by get_rss_keywords() and main().
got_keywords = []
# Open a URL (pass bs=False to skip Beautiful Soup and get the raw body).
def openURL(url, bs=True):
    """Fetch *url* and return its content.

    Args:
        url: Address to fetch with ``urllib.urlopen``.
        bs: When ``True`` (the default) the body is parsed with
            BeautifulSoup, converting HTML entities; any other value
            returns the raw body exactly as read.

    Returns:
        A ``BeautifulSoup`` object, or the raw response body.

    Raises:
        IOError: Propagated from ``urllib.urlopen`` when the fetch fails.
    """
    response = urllib.urlopen(url)
    try:
        data = response.read()
    finally:
        # Always release the connection, even when read() fails; the
        # original left the response object open.
        response.close()

    if bs is True:
        return BeautifulSoup(data, convertEntities=html_entities)
    return data
# Extract keywords from an RSS feed.
def get_rss_keywords(url):
    """Append the keywords found in the feed at *url* to ``got_keywords``.

    For every ``<title>`` inside every ``<item>``:

    * text matched by ``sumikakko`` is removed first;
    * if ``kagikakko`` matches, its captured text is the keyword;
    * otherwise the title is split on single spaces and each non-empty
      piece is a keyword.

    Args:
        url: Feed address, fetched and parsed through ``openURL``.

    Returns:
        None. Results are appended to the module-level ``got_keywords``.
    """
    # NOTE(review): only <item> elements are scanned; a feed that uses a
    # different entry element would contribute nothing - confirm whether
    # every URL in `urls` is expected to yield keywords.
    feed = openURL(url)
    for item in feed.findAll('item'):
        for title in item.findAll('title'):
            text = title.string
            if text is None:
                # BeautifulSoup reports no single string for this tag (for
                # example an empty title); passing None on would raise a
                # TypeError in re.sub.
                continue

            text = sumikakko.sub('', text)
            if kagikakko.search(text):
                quoted = ''.join(kagikakko.findall(text))
                if quoted:
                    got_keywords.append(quoted)
            else:
                # Dropping empty pieces keeps leading, trailing or doubled
                # spaces (and a title that is empty once the bracketed part
                # is removed) from producing '' as a keyword.
                got_keywords.extend(
                    word for word in text.split(' ') if word
                )
def _to_unicode(value):
    """Return *value* as text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def main():
    """Collect keywords from every source and append them to ``filename``.

    Keywords come from each feed in ``urls`` and from the ``topics?q=``
    links on the NAVER ranking page. They are de-duplicated and written
    one per line, UTF-8 encoded.
    """
    global got_keywords

    # Extract keywords from the RSS feeds.
    for url in urls:
        get_rss_keywords(url)

    # Extract keywords from the NAVER topic-word ranking.
    data = openURL(naver_url, bs=False)
    for query in naver_topics.findall(data):
        # Split on the '+' separators before unquoting, so that an escaped
        # literal plus sign (%2B) inside a word is not treated as one.
        for part in query.split('+'):
            word = urllib.unquote(part)
            if word:
                got_keywords.append(word)

    # The feeds yield unicode strings while the NAVER branch yields byte
    # strings; normalise to one text type before de-duplicating so that
    # joining and writing do not depend on the default ASCII codec.
    # NOTE(review): assumes the NAVER query strings are UTF-8 - confirm.
    got_keywords = list(set(_to_unicode(word) for word in got_keywords if word))
    if not got_keywords:
        return

    # Write to the file. It is opened in append mode, so end with a newline:
    # otherwise the next run's first keyword is glued onto this run's last.
    with io.open(filename, 'a', encoding='utf-8') as outfile:
        outfile.write(u'\n'.join(got_keywords) + u'\n')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment