Last active
September 27, 2015 15:08
-
-
Save sillygwailo/1289179 to your computer and use it in GitHub Desktop.
Top Domains in Readability
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import re, operator, argparse, oauth | |
from readability import ReaderClient | |
from beaker.cache import cache_regions, cache_region | |
# Command-line interface for the script.
parser = argparse.ArgumentParser(description="Look at Readability.com and determines the top domains you've read articles with")
parser.add_argument('file', nargs=1, help='Filename to export to.')
# The entry point reads ``arguments.number``; declare it here so that attribute
# always exists. -1 is the "no limit" sentinel understood by sorted_domains().
parser.add_argument('-n', '--number', type=int, default=-1,
                    help='Maximum number of domains to list (default: -1, list all).')
# Settings for the 'short_term' beaker cache region used by sorted_domains():
# entries expire after 3600 seconds and are stored as dbm files under /tmp.
_short_term_region = dict(
    expire=3600,
    type='dbm',
    data_dir='/tmp',
)
cache_regions.update({'short_term': _short_term_region})
@cache_region('short_term', 'sorted_domains')
def sorted_domains(number=-1):
    """Count bookmarked articles per domain and return them most-frequent first.

    Bookmarks are fetched from Readability with ``favorite=True``. A leading
    ``www.`` is removed from each article's domain before counting, so
    ``www.example.com`` and ``example.com`` are tallied together. The result
    is cached through the ``'short_term'`` cache region.

    Args:
        number: Maximum number of entries to return. The default, ``-1``,
            returns every domain.

    Returns:
        A list of ``(domain, count)`` tuples sorted by count, descending.
    """
    # NOTE(review): these credentials are placeholders and must be replaced
    # with real values before the script can talk to the service.
    rdd = ReaderClient('consumer-key', 'secret-key', 'username', 'password')
    bookmarks = rdd.get_bookmarks(favorite=True)

    domains = {}
    for bookmark in bookmarks:
        domain = bookmark.article.domain
        # Strip only a *leading* "www.". An unanchored search would also match
        # "www." in the middle of a host name (e.g. "mywww.com") and discard
        # everything before it, leaving just "com".
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        domains[domain] = domains.get(domain, 0) + 1

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    ranked = sorted(domains.items(), key=operator.itemgetter(1), reverse=True)
    if number == -1:
        return ranked
    return ranked[:number]
if __name__ == '__main__':
    arguments = parser.parse_args()
    # Fall back to -1 ("all domains") when the parser defines no ``number``
    # option, instead of failing with AttributeError.
    limit = getattr(arguments, 'number', -1)
    # NOTE(review): the positional ``file`` argument is parsed but not used
    # here; results are written to standard output.
    for domain, count in sorted_domains(limit):
        # Parenthesised single-argument form prints identically on Python 2 and 3.
        print("%s: %d" % (domain, count))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment