Skip to content

Instantly share code, notes, and snippets.

@derrickturk
Created January 29, 2014 21:46
Show Gist options
  • Save derrickturk/8697859 to your computer and use it in GitHub Desktop.
Save derrickturk/8697859 to your computer and use it in GitHub Desktop.
In the game of web scraping, you win... or you die.
from urllib import request
from sys import argv, stdout, stderr
from time import sleep
from random import randrange
import re
def main(argv):
    """Look up pool information for well API numbers and print it as TSV.

    ``argv`` is a list of strings.  When it holds exactly one item, that item
    is first tried as the path of a text file with one API number per line;
    if the file cannot be read, the item is treated as an API number itself.
    With any other number of items, every item is taken as an API number.

    An 8-character value is used as given; any other value is reduced to
    ``value[2:10]``.  Each number is looked up on the opi.consrv.ca.gov
    search page and one row per pool is collected as
    ``(API, Pool, Pool Status, Well Type)``, where the API column is
    ``'04' + api + '0000'``.  A header line followed by the rows is written
    to standard output, tab-separated; no trailing newline is written after
    the last row.
    """
    if len(argv) == 1:
        try:
            with open(argv[0], 'r') as f:
                # Skip blank lines so that e.g. a trailing empty line in the
                # file does not turn into a query for an empty API number.
                apis = [line.rstrip() for line in f if line.strip()]
        except (OSError, ValueError):
            # Not a readable text file: fall back to treating the single
            # argument as an API number.  Only I/O and decoding/path errors
            # are caught, so Ctrl-C and programming errors still surface.
            apis = argv
    else:
        apis = argv

    apis = [api if len(api) == 8 else api[2:10] for api in apis]

    output = []
    for api in apis:
        url = ('http://opi.consrv.ca.gov/opi/opi.dll/'
               'Search?UsrP_ID=100100100&WMtr_APINumber={0}'.format(api))
        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(url) as res:
            welldata = res.read().decode('ascii')
        # Short pause between requests (presumably to go easy on the server).
        sleep(0.1)

        well_id = '04' + api + '0000'
        if 'frameset' in welldata:
            # A frameset page is handled as a single-well result.
            output.append((well_id,) + process_single(welldata))
        else:
            # Anything else is handled as a listing with one row per pool.
            output.extend(
                (well_id,) + pool for pool in process_multiple(welldata))
        sleep(0.1)

    stdout.write('\t'.join(('API', 'Pool', 'Pool Status', 'Well Type')) + '\n')
    stdout.write('\n'.join('\t'.join(rec) for rec in output))
def process_single(wellpage):
    """Extract pool details from a single-well (frameset) result page.

    Finds the ``HeaderFrame`` source path in ``wellpage``, downloads that
    frame from opi.consrv.ca.gov, and reads the value of the text input that
    follows each of the "Pool:", "Pool Status:" and "Well Type:" labels.

    Returns a ``(pool, pool_status, well_type)`` tuple of strings.

    Raises ``AttributeError`` when the frame reference or one of the labelled
    inputs is not found, because the failed regex search yields ``None``.
    """
    frame_path = re.search(
        r'HeaderFrame" src="([^"]+)"', wellpage).group(1)
    # Use the response as a context manager so the connection is released
    # once the body has been read.
    with request.urlopen('http://opi.consrv.ca.gov' + frame_path) as res:
        headerdata = res.read().decode('ascii')

    def field(label):
        # Value of the first text input appearing after "<label>:".
        return re.search(
            label + r':.*?<input type="text".*?value\s*=\s*"([^"]+)"',
            headerdata).group(1)

    return (field('Pool'), field('Pool Status'), field('Well Type'))
def process_multiple(wellpage):
    """Extract pool details from a multi-row result listing.

    ``wellpage`` is the HTML text of the listing.  For every
    ``<tr class=record>`` row, the cells at zero-based positions 18, 9 and 8
    are taken as pool, pool status and well type respectively, with any
    ``&nbsp`` text removed.

    Returns an iterator (``zip``) of ``(pool, pool_status, well_type)``
    tuples, one per row.

    Raises ``NotImplementedError`` when the listing reports more than one
    page, since only the first page is parsed.  Raises ``AttributeError``
    when the "Page 1 of N" marker is absent from ``wellpage``.
    """
    pages = int(re.search(r'Page 1 of (\d+)', wellpage).group(1))
    if pages > 1:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception carrying the intended message instead.
        raise NotImplementedError('Too many pages, for now.')

    def column(skip):
        # Text of the cell that follows `skip` leading cells in each row.
        pattern = (
            r'<tr class=record>(?:<td>.*?<\/td>){%d}<td>(.*?)</td>' % skip)
        return [m.group(1).replace('&nbsp', '')
                for m in re.finditer(pattern, wellpage)]

    pools = column(18)
    pool_statii = column(9)
    well_types = column(8)
    return zip(pools, pool_statii, well_types)
# Script entry point: require at least one command-line argument, then hand
# everything after the program name to main().
if __name__ == '__main__':
    if len(argv) <= 1:
        print('Usage: {0} [api-file|api-list...]'.format(argv[0]), file=stderr)
        # quit() is a helper injected by the site module for interactive use
        # and may be absent; raise SystemExit directly, with a non-zero
        # status so callers can tell the invocation was invalid.
        raise SystemExit(1)
    main(argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment