-
-
Save jeremyfelt/4584337 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
import urllib
import urlparse
from contextlib import closing
from xml.sax.saxutils import escape

import tweepy
from BeautifulSoup import BeautifulSoup as parser
# Twitter OAuth credentials.
# Each value is read from the environment so that secrets do not have to be
# committed to the source file; when a variable is unset the value falls back
# to the empty string, exactly as before.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')
# MIME types that identify a syndication feed in a ``<link rel="alternate">``.
# Other alternates (oEmbed endpoints, hreflang translations, ...) are ignored.
_FEED_MIME_TYPES = frozenset([
    "application/rss+xml",
    "application/atom+xml",
    "application/rdf+xml",
    "application/feed+json",
    "application/xml",
    "text/xml",
])


def detect_feeds_in_HTML(input_stream):
    """ examines an open text stream with HTML for referenced feeds.

    This is achieved by detecting all ``link`` tags with ``rel="alternate"``
    whose ``type`` attribute is a known feed MIME type.

    :param input_stream: an arbitrary opened input stream that has a :func:`read` method.
    :type input_stream: an input stream (e.g. open file or URL)
    :return: the ``href`` values of the detected feed links, in document
        order, exactly as written in the HTML (they may be relative URLs)
    :rtype: ``list(str)``
    :raises TypeError: if ``input_stream`` has no ``read`` attribute
    """
    # check if really an input stream
    if not hasattr(input_stream, "read"):
        raise TypeError("An opened input *stream* should be given, was %s instead!" % type(input_stream))

    # get the textual data (the HTML) from the input stream
    html = parser(input_stream.read())

    result = []
    # find all links that are declared as an "alternate" representation
    for feed_link in html.findAll("link", rel="alternate"):
        # drop any parameters such as "; charset=utf-8" before comparing
        mime_type = (feed_link.get("type") or "").split(";")[0].strip().lower()
        if mime_type not in _FEED_MIME_TYPES:
            continue
        url = feed_link.get("href", None)
        # only keep links that actually carry a URL
        if url:
            result.append(url)
    return result
def chunks(l, n):
    """ Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    for start in xrange(0, len(l), n):
        stop = start + n
        yield l[start:stop]
# Build an authenticated tweepy client from the OAuth credentials.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# IDs of the accounts the authenticated user follows.
# (The former ``me = api.me()`` call was dropped: its result was never read
# and it only cost an additional API request.)
# NOTE(review): friends_ids() is called without a cursor, so presumably only
# the first page of IDs is returned -- confirm against the tweepy docs.
friends = api.friends_ids()
# Opening part of the OPML document: XML declaration, head, and the opening
# tag of the single top-level outline that groups every feed entry.
opml_start = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.1">
<head>
<title>People I follow</title>
</head>
<body>
<outline text="People I follow" title="People I follow">"""

# Closing part of the OPML document; closes everything opml_start opened.
opml_end = """</outline>
</body>
</opml>"""

# Template for one feed entry. Expects a mapping with the keys ``title``,
# ``html_url`` and ``xml_url``; the values are inserted verbatim, so they
# must already be safe for use inside a double-quoted XML attribute.
opml_outline_feed = '<outline text="%(title)s" title="%(title)s" type="rss" version="RSS" htmlUrl="%(html_url)s" xmlUrl="%(xml_url)s" />'
def _decode_entities(text):
    """Return ``text`` with HTML entities (e.g. ``&amp;``) decoded."""
    return parser(text, convertEntities=parser.HTML_ENTITIES).contents[0]


def _xml_attr(value):
    """Escape ``value`` for use inside a double-quoted XML attribute."""
    return escape(value, {'"': '&quot;'})


# Emit the OPML document on stdout: one <outline> per feed discovered on the
# website of every followed account. Errors go to stderr so that the
# document on stdout stays clean.
print(opml_start)
# users are looked up in batches of 100 IDs per lookup_users call
for c in chunks(friends, 100):
    users = api.lookup_users(c)
    for u in users:
        # accounts without a website in their profile have nothing to scan
        if not u.url:
            continue
        print("<!-- %s -->" % u.screen_name)
        try:
            # closing() releases the connection even when parsing fails
            with closing(urllib.urlopen(u.url)) as site:
                feed_links = detect_feeds_in_HTML(site)
                # the URL actually served (after redirects, e.g. from a
                # link shortener) is the correct base for relative links
                base_url = site.geturl()
            # the page URL is the same for every feed, so decode it once
            html = _decode_entities(u.url)
            for link in feed_links:
                # urljoin leaves absolute links untouched and resolves
                # relative ones against the page they were found on
                xml = urlparse.urljoin(base_url, _decode_entities(link))
                line = opml_outline_feed % {
                    'title': _xml_attr(u.name),
                    'html_url': _xml_attr(html),
                    'xml_url': _xml_attr(xml),
                }
                # the XML declaration announces UTF-8, so write UTF-8 bytes
                print(line.encode('utf-8'))
        except Exception as err:
            # best effort: report the failure and carry on with the next user
            sys.stderr.write('ERROR: %s\n' % str(err))
print(opml_end)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment