Last active
September 28, 2018 18:25
-
-
Save clarete/ccd437a950252f7f30592699a8f6f2dc to your computer and use it in GitHub Desktop.
First lines of every single scraper/downloader I write in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def download(url, delay=3):
    """Use requests to retrieve the content of `url' as text.

    Sleeps `delay' seconds (3 by default) before issuing the request, so
    repeated calls are throttled and do not hammer the scraped site.
    Returns the `.text' attribute of the response.
    """
    time.sleep(delay)  # Can't bring the scraped site down :)
    # NOTE(review): no `timeout' is passed to requests.get, so a stalled
    # server can block this call indefinitely -- confirm that is acceptable.
    return requests.get(url).text
def saverequest(path, req):
    """Write `req' into the file pointed by `path' and return `req'.

    The file is opened in text mode and encoded as UTF-8, so `req' is
    expected to be text. Any existing file at `path' is overwritten.
    The handle lives inside a `with' block so it is flushed and closed
    before returning, rather than whenever the garbage collector runs.
    """
    with io.open(path, 'w', encoding='utf-8') as output:
        output.write(req)
    return req
def cachepath(url):
    "Build the on-disk cache file name for `url' (flat, under ./tmp)"
    # Every '/' and ':' becomes '-', so the URL collapses to one path segment.
    no_slashes = '-'.join(url.split('/'))
    flattened = '-'.join(no_slashes.split(':'))
    return './tmp/scraper-' + flattened
def cachedurl(url):
    """Retrieve `url' and cache its content on disk.

    Returns a `(cached, text)' tuple: `cached' is True when `text' was
    read from the existing cache file, and False when it was downloaded
    and written to the cache by this call.
    """
    name = cachepath(url)
    if os.path.exists(name):
        # Close the handle deterministically instead of leaking it.
        with io.open(name, 'r', encoding='utf-8') as cached:
            return True, cached.read()
    # Create the cache directory up front: otherwise a missing directory
    # only surfaces as a failed write after the (slow) download finished.
    directory = os.path.dirname(name)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    return False, saverequest(name, download(url))
# Example usage: cachedurl returns (cache-hit flag, page text); keep the text.
_from_cache, thing = cachedurl("http://link.c/foo")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment