-
-
Save zyocum/56043635fa6581c97b8c01fd4d06b95d to your computer and use it in GitHub Desktop.
Scraping Valid Addresses from all US ZipCodes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Scrape first 10 address results for each U.S. Zip Code from assist2sell.com""" | |
import json | |
import sys | |
from itertools import chain | |
import zipcode as zc | |
import requests | |
from progressbar import ProgressBar | |
from progressbar.widgets import Bar, SimpleProgress, Percentage, Timer, AdaptiveETA | |
from bs4 import BeautifulSoup | |
# Search-results URL template; addresses() fills the placeholder with a
# zip code via URL.format(zipcode.zip).
URL = 'https://assist2sell.com/homes/{}'
def handle(response, encoding=None):
    """Parse a successful response into soup; report and discard failures.

    Args:
        response: The ``requests`` response to inspect.
        encoding: Optional codec name that overrides the encoding reported
            by the response.

    Returns:
        A ``(url, page)`` tuple.  ``page`` is the ``html5lib``-parsed
        ``BeautifulSoup`` document when the status code is 200; for any
        other status the status line is printed to stderr and ``page`` is
        ``None``.
    """
    if response.status_code != 200:
        message = (
            f'HTTP Status {response.status_code} ({response.reason}): '
            f'{response.url}'
        )
        print(message, file=sys.stderr)
        return response.url, None
    if encoding is None:
        # response.encoding is None when the headers give no usable charset,
        # and bytes.decode(None) raises TypeError; fall back to the detected
        # encoding and finally to UTF-8.
        encoding = response.encoding or response.apparent_encoding or 'utf-8'
    # Substitute undecodable bytes instead of aborting on one malformed page.
    markup = response.content.decode(encoding, errors='replace')
    return response.url, BeautifulSoup(markup, 'html5lib')
def record(span, zipcode):
    """Assemble the output record for a single address listing.

    Args:
        span: The listing element; its ``a`` child supplies the relative
            link, its text the label, and its ``meta`` children the
            ``itemprop``/``content`` address fields.
        zipcode: The zip code object the listing was found under.

    Returns:
        A dict with the keys ``url``, ``text``, ``address`` and ``zipcode``;
        ``zipcode`` holds the object's attributes whose names do not start
        with an underscore.
    """
    link = 'https://assist2sell.com' + span.a.attrs['href']
    label = span.text.strip()

    postal = {}
    for tag in span.find_all('meta'):
        prop = tag.attrs['itemprop']
        postal[prop] = tag.attrs['content']

    public = {}
    for name, value in vars(zipcode).items():
        if name[:1] != '_':
            public[name] = value

    return {'url': link, 'text': label, 'address': postal, 'zipcode': public}
def addresses(zipcode, timeout=30):
    """Yield one record per postal address listed for ``zipcode``.

    Args:
        zipcode: Zip code object; its ``zip`` attribute is substituted into
            ``URL`` to build the request.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Yields:
        The dict built by ``record`` for every ``span`` element whose
        ``itemtype`` is the schema.org ``PostalAddress`` type.  Nothing is
        yielded when the request fails or the response is not HTTP 200.
    """
    try:
        response = requests.get(URL.format(zipcode.zip), timeout=timeout)
    except requests.RequestException as error:
        # Report and move on, like handle() does for bad status codes, so
        # one network failure does not end the whole crawl.
        print(f'Request failed for {zipcode.zip}: {error}', file=sys.stderr)
        return
    _, page = handle(response)
    if page is None:
        return
    schema = 'http://schema.org/PostalAddress'
    for span in page.find_all('span', {'itemtype': schema}):
        yield record(span, zipcode)
def zipcodes():
    """Yield every zip code, grouped by leading digit from 0 through 9."""
    # Run all ten prefix lookups before yielding anything, then stream the
    # results group by group.
    groups = [zc.islike(str(digit)) for digit in range(10)]
    for group in groups:
        yield from group
def main():
    """Scrape every zip code and print each address as one JSON line.

    Progress is shown with a bar, a counter, a percentage, the elapsed time
    and an adaptive ETA.
    """
    gauges = (Bar, SimpleProgress, Percentage, Timer, AdaptiveETA)
    widgets = []
    for gauge in gauges:
        if widgets:
            # Separate neighbouring widgets with a single space.
            widgets.append(' ')
        widgets.append(gauge())
    progress = ProgressBar(widgets=widgets)

    # Materialise the zip codes into a list before handing them to the bar.
    pending = list(zipcodes())
    for zipcode in progress(pending):
        for address in addresses(zipcode):
            line = json.dumps(address, ensure_ascii=False)
            print(line)
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__
    )
    # No options are defined, but parse_args() must still be called so that
    # -h/--help prints the description and unexpected arguments are rejected.
    parser.parse_args()
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment