Created
August 26, 2013 11:56
-
-
Save yat1ma30/6340722 to your computer and use it in GitHub Desktop.
与えられたURLからRSSフィードのURLをリトリーブ。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
from BeautifulSoup import BeautifulSoup | |
import urlparse | |
def get(url, timeout=10):
    """Retrieve the feed URL advertised by the page at *url*.

    The page is downloaded and its ``<link>`` elements are searched for an
    Atom feed (``type="application/atom+xml"``) first and an RSS feed
    (``type="application/rss+xml"``) second.

    Args:
        url: Address of the page to inspect. ``http://`` is prepended when
            the value does not already start with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The absolute feed URL, or None when the page could not be fetched
        or no feed ``<link>`` carrying an ``href`` was found.
    """
    # Test the prefix, not a substring: "example.com/http-guide" contains
    # "http" but still lacks a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://{0}".format(url)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Unreachable host, invalid URL, timeout, ...: report "no feed"
        # without swallowing KeyboardInterrupt/SystemExit.
        return None

    soup = BeautifulSoup(response.text)
    # Atom is preferred over RSS when a page advertises both.
    for mime_type in ("application/atom+xml", "application/rss+xml"):
        link = soup.find('link', attrs={'type': mime_type})
        if link is None:
            continue
        href = link.get('href')
        if not href:
            # A <link> without an href cannot point at a feed.
            continue
        # urljoin returns absolute hrefs unchanged and resolves relative
        # ones against the page actually served (after redirects), which
        # keeps the port and the directory part of the path.
        return urlparse.urljoin(response.url, href)
    return None
if __name__ == "__main__":
    # Manual smoke test: print the feed URL found for a sample blog host
    # (prints None when nothing is found).
    sample_host = "ottati.hatenablog.com"
    print(get(sample_host))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment