Created
February 3, 2016 06:36
-
-
Save tribela/138d615d39bc2b376ea4 to your computer and use it in GitHub Desktop.
Earth Reader: crawl old feed entries using archive.org
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
import datetime | |
import re | |
import sys | |
import requests | |
from libearth.repository import from_url | |
from libearth.session import Session | |
from libearth.stage import Stage | |
from libearth.crawler import crawl, CrawlError | |
from libearth.parser.autodiscovery import autodiscovery, FeedUrlNotFoundError | |
def get_rss_url(url):
    """Return the URL of the first feed discovered at ``url``.

    Downloads ``url`` with ``requests`` and passes the raw response body to
    ``autodiscovery``.  When ``autodiscovery`` raises
    ``FeedUrlNotFoundError`` the error is printed to standard error and the
    process exits with status 1.

    :param url: address of the document to inspect
    :returns: the ``url`` attribute of the first discovered feed link
    """
    document = requests.get(url).content
    try:
        feed_links = autodiscovery(document, url)
    except FeedUrlNotFoundError as e:
        print(e, file=sys.stderr)
        # ``sys.exit`` instead of the bare ``exit`` helper: the latter is
        # injected by the ``site`` module for interactive sessions and is
        # not guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)
    return feed_links[0].url
def importer(stage):
    """Import archived entries for every subscription found in ``stage``.

    Collects a ``feed_uri -> feed_id`` mapping from the stage's recursive
    subscriptions, prints each subscription label, then calls
    ``import_feed`` once per mapped feed.

    :param stage: stage providing ``subscriptions`` and passed on to
                  ``import_feed``
    """
    with stage:
        subscription_list = stage.subscriptions
        feed_ids_by_uri = {}
        for entry in subscription_list.recursive_subscriptions:
            feed_ids_by_uri[entry.feed_uri] = entry.feed_id
        for entry in subscription_list.recursive_subscriptions:
            print(entry.label)
    # NOTE(review): the source's indentation was lost; this loop is placed
    # after the ``with stage:`` block because ``import_feed`` opens its own
    # ``with stage:`` -- confirm against the original layout.
    for uri, fid in feed_ids_by_uri.items():
        import_feed(stage, uri, fid)
def import_feed(stage, url, feed_id):
    """Crawl archive.org snapshots of a feed and store them in ``stage``.

    Resolves the feed URL for ``url`` via ``get_rss_url``, then for every
    year from 1996 through the current year requests
    ``http://web.archive.org/web/<year>*/<feed_url>`` and collects every
    ``/web/<14 digits>/<feed_url>`` link found in the response.  All
    collected snapshot URLs are passed to ``crawl`` and each crawled result
    is assigned to ``stage.feeds[feed_id]``.  A ``CrawlError`` is printed to
    standard error instead of propagating.

    :param stage: stage whose ``feeds`` mapping receives the crawled data
    :param url: address used to discover the feed URL
    :param feed_id: key under which the feed is stored in ``stage.feeds``
    """
    feed_url = get_rss_url(url)
    print(feed_url)
    # Escape the URL before embedding it in the pattern: characters such as
    # ``.``, ``?`` and ``+`` are regex metacharacters and would otherwise
    # change what the pattern matches.
    snapshot_link = re.compile(r'/web/\d{14}/' + re.escape(feed_url))
    snapshot_paths = []
    current_year = datetime.datetime.now().year
    for year in range(1996, current_year + 1):
        # Use the decoded ``.text`` rather than the raw ``.content`` bytes:
        # a text pattern cannot be applied to bytes on Python 3.
        page = requests.get(
            'http://web.archive.org/web/{}*/{}'.format(year, feed_url)
        ).text
        snapshot_paths += snapshot_link.findall(page)
    # Build a real list: ``map`` returns a lazy iterator on Python 3, which
    # has no ``len``.
    urls = ['http://web.archive.org' + path for path in snapshot_paths]
    print(len(urls))
    generator = crawl(urls, 20)
    try:
        # Only the crawled feed data is needed; the other two tuple items
        # are ignored (and no longer shadow ``feed_url``).
        for _, feed_data, _ in generator:
            with stage:
                stage.feeds[feed_id] = feed_data
    except CrawlError as e:
        print(e, file=sys.stderr)
def main():
    """Run the importer against the repository URL given on the command line.

    Expects the repository URL as the first command-line argument, opens it
    with ``from_url``, wraps it in a ``Stage`` bound to a new ``Session``
    and passes that stage to ``importer``.  When the argument is missing, a
    usage message is written to standard error and the process exits with
    status 2.
    """
    # Guard against a missing argument instead of failing with a bare
    # IndexError traceback.
    if len(sys.argv) < 2:
        print('usage: {0} REPOSITORY_URL'.format(sys.argv[0]),
              file=sys.stderr)
        sys.exit(2)
    repo = from_url(sys.argv[1])
    stage = Stage(Session(), repo)
    importer(stage)
# Run the importer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment