still dumping material in here
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
... | |
/** | |
* A container object to house our incoming HTTP request | |
* | |
* @author Matt Phillips <[email protected]> | |
* @license http://www.gnu.org/licenses/lgpl.html GNU Lesser Public License | |
*/ | |
class http_request { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Construct wget command | |
command = 'wget ' | |
command = command + '--quiet ' # turn off wget's output | |
command = command + '--tries=' + str(settings.NUMBER_RETRIES) + ' ' # number of retries (assuming no 404 or the like) | |
command = command + '--wait=' + str(settings.WAIT_BETWEEN_TRIES) + ' ' # number of seconds between requests (lighten the load on a page that has a lot of assets) | |
command = command + '--quota=' + settings.ARCHIVE_QUOTA + ' ' # only store this amount | |
command = command + '--random-wait ' # random wait between .5 seconds and --wait= | |
command = command + '--limit-rate=' + settings.ARCHIVE_LIMIT_RATE + ' ' # we'll be performing multiple archives at once. let's not download too much in one stream | |
command = command + '--adjust-extension ' # if a page is served up at .asp, adjust to .html. (this is the new --html-extension flag) | |
command = command + '--span-hosts ' # sometimes things like images are hosted at a CDN. let's span-hosts to get those |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function check_status() { | |
// Check our status service to see if we have archiving jobs pending | |
var request = $.ajax({ | |
url: status_url + newLinky.linky_id, | |
type: "GET", | |
dataType: "json", | |
cache: false | |
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
# We've now imported the two packages that will do the heavy lifting | |
# for us, requests and BeautifulSoup | |
# Let's put the URL of the page we want to scrape in a variable | |
# so that our code down below can be a little cleaner | |
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import time | |
# We've now imported the two packages that will do the heavy lifting | |
# for us, requests and BeautifulSoup | |
# This is the URL that lists the current inmates | |
# Should this URL go away, an archive is available at | |
# http://perma.cc/2HZR-N38X |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
url_to_scrape = 'http://apps2.polkcountyiowa.gov/inmatesontheweb/' | |
r = requests.get(url_to_scrape) | |
soup = BeautifulSoup(r.text) | |
inmates_links = [] | |
for table_row in soup.select(".inmatesList tr"): | |
table_cells = table_row.findAll('td') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
inmates = [] | |
for inmate_link in inmates_links[:10]: | |
r = requests.get(inmate_link) | |
soup = BeautifulSoup(r.text) | |
inmate_details = {} | |
inmate_profile_rows = soup.select("#inmateProfile tr") | |
inmate_details['age'] = inmate_profile_rows[0].findAll('td')[0].text.strip() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tally how many inmates are listed for each city.
# Reads `inmates`, an iterable of dicts that each carry a 'city' key.
inmate_cities = {}
for inmate in inmates:
    city = inmate['city']
    # .get() supplies 0 the first time a city is seen, so no separate
    # membership test is needed.
    inmate_cities[city] = inmate_cities.get(city, 0) + 1
# The parenthesised form is valid on both Python 2 and Python 3; the bare
# `print x` statement is a SyntaxError on Python 3.
print(inmate_cities)
OlderNewer