Created
January 29, 2014 21:46
-
-
Save derrickturk/8697859 to your computer and use it in GitHub Desktop.
In the game of web scraping, you win... or you die.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import request | |
from sys import argv, stdout, stderr | |
from time import sleep | |
from random import randrange | |
import re | |
def main(argv):
    """Look up pool data for each API number and print it as a TSV table.

    Args:
        argv: Command-line arguments without the program name.  A single
            argument is first tried as the path of a text file with one API
            number per line; if it cannot be opened or read it is treated
            as an API number itself.  Two or more arguments are always
            treated as API numbers.

    Any entry that is not exactly 8 characters long is reduced to its
    characters 2 through 9 (presumably stripping a 2-character prefix and a
    trailing suffix from a longer API number -- confirm against real input).

    One row per pool is written to stdout, preceded by a header row.  Each
    row starts with ``'04' + api + '0000'``.
    """
    if len(argv) == 1:
        try:
            with open(argv[0], 'r') as f:
                # Skip blank lines: they would otherwise become a query
                # with an empty API number.
                apis = [line.rstrip() for line in f if line.strip()]
        except (OSError, ValueError):
            # Not a readable text file (missing, a directory, undecodable,
            # ...): fall back to treating the argument as an API number.
            apis = argv
    else:
        apis = argv

    apis = [api if len(api) == 8 else api[2:10] for api in apis]

    output = []
    for api in apis:
        url = ('http://opi.consrv.ca.gov/opi/opi.dll/'
               'Search?UsrP_ID=100100100&WMtr_APINumber={0}'.format(api))
        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(url) as res:
            welldata = res.read().decode('ascii')
        sleep(0.1)

        record_id = '04' + api + '0000'
        if 'frameset' in welldata:
            # A frameset page describes exactly one well/pool.
            pools = [process_single(welldata)]
        else:
            # Otherwise the page is a table with one row per pool.
            pools = process_multiple(welldata)
        for pool in pools:
            output.append((record_id,) + tuple(pool))

        # Pause between API numbers (presumably to go easy on the server).
        sleep(0.1)

    stdout.write('\t'.join(('API', 'Pool', 'Pool Status', 'Well Type')) + '\n')
    stdout.write('\n'.join('\t'.join(rec) for rec in output))
def process_single(wellpage):
    """Extract pool details from a single-well (frameset) result page.

    The page's ``HeaderFrame`` source path is resolved against
    ``http://opi.consrv.ca.gov``, that document is downloaded, and the
    values of the text inputs following the ``Pool:``, ``Pool Status:`` and
    ``Well Type:`` labels are read from it.

    Args:
        wellpage: HTML text of the search result page.

    Returns:
        A ``(pool, pool_status, well_type)`` tuple of strings.

    Raises:
        AttributeError: If the header frame reference or any of the three
            labelled fields cannot be found (the regex search returns
            ``None``).
    """
    frame_src = re.search(r'HeaderFrame" src="([^"]+)"', wellpage).group(1)
    # Close the HTTP response as soon as the body has been read.
    with request.urlopen('http://opi.consrv.ca.gov' + frame_src) as res:
        headerdata = res.read().decode('ascii')

    def field(label):
        # Value of the first text input that follows "<label>:".
        return re.search(
            label + r':.*?<input type="text".*?value\s*=\s*"([^"]+)"',
            headerdata).group(1)

    return (field('Pool'), field('Pool Status'), field('Well Type'))
def process_multiple(wellpage):
    """Extract pool details from a multi-record (table) result page.

    Every ``<tr class=record>`` row contributes one record built from three
    of its cells: the cell after 18 preceding cells (pool), after 9 (pool
    status) and after 8 (well type).

    Args:
        wellpage: HTML text of the search result page.  It must contain a
            ``Page 1 of N`` marker.

    Returns:
        An iterator of ``(pool, pool_status, well_type)`` string tuples,
        one per record row.

    Raises:
        NotImplementedError: If the result spans more than one page.
        AttributeError: If the ``Page 1 of N`` marker is missing.
    """
    pages = int(re.search(r'Page 1 of (\d+)', wellpage).group(1))
    if pages > 1:
        # Raising a bare string is itself a TypeError in Python 3, so use a
        # real exception to report the unsupported case.
        raise NotImplementedError('Too many pages, for now.')

    def column(skipped):
        # Text of the cell that follows `skipped` cells in each record row.
        pattern = (r'<tr class=record>(?:<td>.*?<\/td>){%d}<td>(.*?)</td>'
                   % skipped)
        # NOTE(review): the character removed here appears as a plain space;
        # confirm it was not originally a non-breaking space / '&nbsp;'.
        return [m.group(1).replace(' ', '')
                for m in re.finditer(pattern, wellpage)]

    pools = column(18)
    pool_statii = column(9)
    well_types = column(8)
    return zip(pools, pool_statii, well_types)
if __name__ == '__main__':
    # argv[0] is the program name; at least one API number or file is needed.
    if len(argv) <= 1:
        # SystemExit with a string prints it to stderr and exits with a
        # non-zero status; quit() is a site-module helper meant for the
        # interactive interpreter and would report success.
        raise SystemExit(
            'Usage: {0} [api-file|api-list...]'.format(argv[0]))
    main(argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment