Created
May 25, 2016 04:26
-
-
Save dmukhg/1b17df3c1b57b45c9735cc3d44a24537 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Crawler:
    def __init__(self, base_url, writer, predicate):
        """Create a crawler rooted at *base_url*.

        * base_url -- the first URL that will be crawled.
        * writer -- an object with a ``write(url, body)`` method (see
          BaseWriter below); it is in charge of persisting responses
          to the file system (or any other backend).
        * predicate -- a callable with signature ``predicate(url)``
          returning True/False; it decides whether a link is
          downloaded or skipped.
        """
        self.base_url = base_url
        self.writer = writer
        self.predicate = predicate

    def start(self):
        """Start the crawler.

        While lifecycle methods like this are usually best avoided
        unless a framework demands them, starting the crawler as soon
        as it is created does not seem like the right thing to do
        either.
        """
        ...  # implementation elided in the original sketch

    def _get_crawlable_URLs(self, body):
        """Read the contents of a requests response body and return
        the URLs to crawl next.  Candidate URLs are filtered through
        self.predicate."""
        ...

    def _write(self, url, body):
        """Persist the body of the response for *url* via
        self.writer."""
        ...
class BaseWriter:
    """Abstract interface for persisting crawled pages.

    We want this level of separation to be able to move to a new
    storage backend as and when required -- say, writing files to
    AWS S3 rather than the local file system -- without touching
    the crawler itself.
    """

    def __init__(self, base_url):
        """Create whatever buckets or folders are needed so that
        writing can start."""

    def write(self, url, body):
        """Create a file-like object for *url* and write *body*
        to it."""
class LocalFileSystemWriter(BaseWriter):
    """Write crawled pages to the local file system.

    The inherited ``__init__`` is expected to create folders for the
    base URL; ``write()`` creates files relative to that root and
    writes the response contents into them."""
def maxCountPredicate(max_count, nextPredicate):
    """Build a predicate that accepts at most *max_count* URLs.

    A URL only counts against the budget when *nextPredicate* accepts
    it first, which is how predicates are chained.  Once the budget is
    spent, every further URL is rejected.

    Fixes vs. the original sketch: the inner function must not take
    ``self`` (the contract is ``predicate(url)``), ``false`` is not a
    Python name (``False``), and rebinding ``max_count`` requires
    ``nonlocal`` or the first call raises UnboundLocalError.
    """
    def maxCountInner(url):
        nonlocal max_count  # mutate the enclosing counter, not a local
        if nextPredicate(url):
            max_count -= 1
            return max_count >= 0
        return False
    return maxCountInner
def sameBaseUrlPredicate(base_url):
    """Build a predicate accepting only URLs that live under *base_url*.

    Fixes vs. the original sketch: the inner function must not take
    ``self`` (the contract is ``predicate(url)``), and the str method
    is ``startswith``, not the JavaScript-style ``startsWith``.
    """
    def baseUrlInner(url):
        return url.startswith(base_url)
    return baseUrlInner
""" | |
The last two functions are kind of clever coding. We are using closures to | |
create local state for the functions and are chaining predicates. | |
I have been using javascript way too long to think about anything but closures. | |
Sorry about that. :P | |
""" | |
if __name__ == '__main__':
    # Usage: python crawler.py <base_url> <max_count>
    import sys

    if len(sys.argv) < 3:
        sys.exit("usage: crawler.py <base_url> <max_count>")
    base_url = sys.argv[1]
    max_count = int(sys.argv[2])  # argv entries are strings; the budget is a number
    # The original passed a stray `self` here; module-level code has none.
    writer = LocalFileSystemWriter(base_url)
    predicate = maxCountPredicate(max_count, sameBaseUrlPredicate(base_url))
    crawler = Crawler(base_url, writer, predicate)
    crawler.start()  # the original built the crawler but never started it
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment