The following gist is an extract of the article "Building a simple crawler". It allows crawling from a starting URL for a given number of bounces.
# Minimal usage example: build a Crawler (defined in the local `crawler`
# module, not shown here) and start crawling from a seed URL.
from crawler import Crawler
# NOTE(review): per the article's description the crawler is bounded by a
# number of "bounces" (link hops) — presumably configured on Crawler or
# crawl(); confirm against the crawler module, as no limit is passed here.
crawler = Crawler()
crawler.crawl('http://techcrunch.com/')  # seed URL; crawl() performs network I/O
| #--------------------------------------------------------------------- | |
| # Global settings | |
| #--------------------------------------------------------------------- | |
| global | |
| log 127.0.0.1 local2 | |
| chroot /var/lib/haproxy | |
| pidfile /var/run/haproxy.pid | |
| maxconn 4000 | |
| user haproxy |
| from IPython.display import HTML | |
| def hide_code(): | |
| return HTML('''<script> | |
| code_show=true; | |
| function code_toggle() { | |
| if (code_show){ | |
| $("div.input").hide(); | |
| } else { | |
| $("div.input").show(); |
| __author__ = 'uolter' | |
| """ | |
| Defines a single function, map_reduce, which takes an input | |
| dictionary i and applies the user-defined function mapper to each | |
| (input_key,input_value) pair, producing a list of intermediate | |
| keys and intermediate values. Repeated intermediate keys then | |
| have their values grouped into a list, and the user-defined | |
| function reducer is applied to the intermediate key and list of |
| __author__ = 'uolter' | |
| import map_reduce | |
| def mapper(input_key, input_value): | |
| def cut_and_clean_value(cluster): | |
| """ | |
| :param cluster: string in the format <cluster>:<value> | |
| :return: touple cluster and value. If value is NaN return 0 |
| package main | |
| import ( | |
| "encoding/json" | |
| "io/ioutil" | |
| "log" | |
| "net/http" | |
| ) | |
| type test_struct struct { |
| #!/usr/bin/env | |
| # -*- coding: utf-8 -*- | |
| import unittest | |
| """ Quicksort implementation """ | |
| def quicksort(arr): | |
| """ Quicksort a list |
| # -*- coding: utf-8 -*- | |
| import unittest | |
| index = {} | |
| class tree(object): |
| pip freeze --local | grep -v '^\-e' | cut -d = -f 1 | xargs pip install -U |
| curl https://raw.githubusercontent.com/pypa/pip/master/contrib/get-pip.py > get-pip.py; | |
| python get-pip.py; | |
| rm -f get-pip.py; | |
| # change directory here. Go in your project home dir. | |
| # cd /opt/uuid_resolver/; | |
| pip install virtualenv; | |
| virtualenv venv; | |
| # activate the virtualenv | |
| source venv/bin/activate | |
| # change here your requirements.txt location |
The following gist is an extract of the article "Building a simple crawler". It allows crawling from a starting URL for a given number of bounces.
# Minimal usage example: build a Crawler (defined in the local `crawler`
# module, not shown here) and start crawling from a seed URL.
from crawler import Crawler
# NOTE(review): per the article's description the crawler is bounded by a
# number of "bounces" (link hops) — presumably configured on Crawler or
# crawl(); confirm against the crawler module, as no limit is passed here.
crawler = Crawler()
crawler.crawl('http://techcrunch.com/')  # seed URL; crawl() performs network I/O