Created
July 24, 2017 17:47
-
-
Save adamalesandro/3e08ce575c3e3f8bd71ae58609ff309d to your computer and use it in GitHub Desktop.
Info for coding test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Sample wayback timestamp url: http://web.archive.org/web/20141214092900/http://www.coinbase.com/about
# Targets to process: each entry pairs a page URL with the name of its parser.
URLS = [
    {"url": "http://www.coinbase.com/about", "parser": "coinbase"},
]
class WaybackConstants:
    """URL templates and the timestamp format used with web.archive.org."""

    # Timemap (JSON) endpoint template; the positional ``{0}`` slot takes the
    # target URL.
    TIMEMAP_BASE_URL = "http://web.archive.org/web/timemap/json/{0}"
    # strftime/strptime pattern for 14-digit timestamps such as 20141214092900.
    WAYBACK_DATETIME_FORMAT = "%Y%m%d%H%M%S"
    # Snapshot URL template; filled with the ``timestamp`` and ``target_url``
    # keyword arguments.
    WAYBACK_DIRECT_URL = "http://web.archive.org/web/{timestamp}/{target_url}"
def get_data_for_wayback_timemap(url=None, parser=None, timeout=30):
    """Fetch the Wayback timemap for ``url`` and iterate over its records.

    This is still a skeleton: the per-record work belongs below the
    ``BEGIN HERE`` marker inside the loop.

    Args:
        url: Target page whose timemap is requested. Required.
        parser: Name of the parser associated with the target (for example
            ``"coinbase"``). Required; not yet used by the skeleton loop.
        timeout: Seconds to wait for the timemap HTTP request.

    Returns:
        The ``wayback_history`` list. It stays empty until the loop body
        is implemented.

    Raises:
        ValueError: If ``url`` or ``parser`` is ``None``, or if the response
            body is not valid JSON.
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
    """
    # Imported locally because this module has no top-level import section.
    import requests

    # Explicit checks instead of ``assert``, which is stripped under ``-O``.
    if url is None:
        raise ValueError("url must not be None")
    if parser is None:
        raise ValueError("parser must not be None")

    timemap = requests.get(
        WaybackConstants.TIMEMAP_BASE_URL.format(url),
        timeout=timeout,  # never hang indefinitely on an unresponsive server
    )
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    timemap.raise_for_status()
    wayback_snapshots = timemap.json()

    waybacks_iterator = iter(wayback_snapshots)
    # Skip the first element (presumably a header row of column names --
    # confirm against the API). The default avoids StopIteration when the
    # timemap is empty.
    next(waybacks_iterator, None)

    wayback_history = []
    for wayback_record in waybacks_iterator:
        ### BEGIN HERE
        pass  # placeholder so the skeleton is valid Python; replace with real logic

    return wayback_history
if __name__ == '__main__':
    # When run as a script, process every configured target in URLS.
    for entry in URLS:
        get_data_for_wayback_timemap(entry["url"], entry["parser"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://web.stanford.edu/~zlotnick/TextAsData/Web_Scraping_with_Beautiful_Soup.html