Skip to content

Instantly share code, notes, and snippets.

@zyocum
Forked from theriley106/housingScrape.py
Last active February 10, 2018 16:58
Show Gist options
  • Save zyocum/56043635fa6581c97b8c01fd4d06b95d to your computer and use it in GitHub Desktop.
Save zyocum/56043635fa6581c97b8c01fd4d06b95d to your computer and use it in GitHub Desktop.
Scraping Valid Addresses from all US ZipCodes
#!/usr/bin/env python3
"""Scrape first 10 address results for each U.S. Zip Code from assist2sell.com"""
import json
import sys
from itertools import chain
import zipcode as zc
import requests
from progressbar import ProgressBar
from progressbar.widgets import Bar, SimpleProgress, Percentage, Timer, AdaptiveETA
from bs4 import BeautifulSoup
# URL template for one listing page; the `{}` placeholder is filled via
# `URL.format(...)` with a zip code's `zip` attribute before each request.
URL = 'https://assist2sell.com/homes/{}'
def handle(response, encoding=None):
    """Turn an HTTP response into a ``(url, page)`` pair, discarding bad ones.

    Args:
        response: A ``requests``-style response exposing ``status_code``,
            ``reason``, ``url``, ``content`` and ``encoding``.
        encoding: Optional codec name that overrides the response's own
            encoding when decoding the body.

    Returns:
        ``(response.url, soup)`` when the status code is 200, where ``soup``
        is the decoded body parsed by BeautifulSoup with the ``html5lib``
        parser. For any other status the code and reason are printed to
        stderr and ``(response.url, None)`` is returned.
    """
    if response.status_code != 200:
        message = (
            f'HTTP Status {response.status_code} ({response.reason}): '
            f'{response.url}'
        )
        print(message, file=sys.stderr)
        return response.url, None

    # `response.encoding` can be None (no charset announced by the server),
    # and `bytes.decode(None)` raises TypeError. Fall back step by step and
    # end on UTF-8 so a codec name is always available.
    codec = (
        encoding
        or response.encoding
        or getattr(response, 'apparent_encoding', None)
        or 'utf-8'
    )
    # Substitute undecodable bytes instead of raising: one malformed page
    # should not abort a long-running scrape.
    text = response.content.decode(codec, errors='replace')
    return response.url, BeautifulSoup(text, 'html5lib')
def record(span, zipcode):
    """Assemble the output record for one listing span.

    Args:
        span: Parsed element providing ``a.attrs['href']``, ``text`` and
            ``find_all('meta')``.
        zipcode: Object whose public instance attributes (names not starting
            with an underscore) are copied into the record.

    Returns:
        A dict with the keys ``url``, ``text``, ``address`` and ``zipcode``.
    """
    listing_url = 'https://assist2sell.com' + span.a.attrs['href']
    label = span.text.strip()

    # Map each <meta> tag's `itemprop` to its `content`; a repeated
    # `itemprop` keeps the last value seen.
    address = {}
    for meta in span.find_all('meta'):
        prop = meta.attrs['itemprop']
        address[prop] = meta.attrs['content']

    # Copy only the public attributes of the zip code object.
    zipcode_fields = {}
    for name, value in zipcode.__dict__.items():
        if name.startswith('_'):
            continue
        zipcode_fields[name] = value

    return {
        'url': listing_url,
        'text': label,
        'address': address,
        'zipcode': zipcode_fields,
    }
def addresses(zipcode, timeout=30):
    """Yield one record per postal-address span on a zip code's listing page.

    Args:
        zipcode: Object with a ``zip`` attribute; the value is substituted
            into ``URL`` and the object is passed on to ``record``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single unresponsive connection can stall the run
            indefinitely.

    Yields:
        The dict built by ``record`` for every ``<span>`` whose ``itemtype``
        is ``http://schema.org/PostalAddress``. Nothing is yielded when the
        request fails or the response is rejected by ``handle``.
    """
    target = URL.format(zipcode.zip)
    try:
        response = requests.get(target, timeout=timeout)
    except requests.RequestException as error:
        # Report and skip this zip code rather than aborting the whole scrape
        # on a transient network problem.
        print(f'Request failed ({error}): {target}', file=sys.stderr)
        return

    _, page = handle(response)
    if page is None:
        return

    schema = 'http://schema.org/PostalAddress'
    for span in page.find_all('span', {'itemtype': schema}):
        yield record(span, zipcode)
def zipcodes():
    """Yield the results of ``zc.islike`` for each single digit ``'0'``-``'9'``.

    Presumably this enumerates every known zip code by leading digit;
    confirm against the ``zipcode`` package's ``islike`` documentation.
    """
    # All ten lookups are issued up front, before the first item is yielded.
    per_digit = [zc.islike(str(digit)) for digit in range(10)]
    for matches in per_digit:
        yield from matches
def main():
    """Scrape every zip code and print each address record as a JSON line.

    Records go to stdout, one ``json.dumps`` result per line, with non-ASCII
    characters left unescaped. A progress bar wraps the outer loop.
    """
    # Bar, counter, percentage, elapsed time and ETA, separated by spaces.
    widgets = []
    for widget_class in (Bar, SimpleProgress, Percentage, Timer, AdaptiveETA):
        if widgets:
            widgets.append(' ')
        widgets.append(widget_class())
    progress = ProgressBar(widgets=widgets)

    # Materialized into a list before iteration, presumably so the progress
    # bar can determine the total count.
    every_zipcode = list(zipcodes())
    for zipcode in progress(every_zipcode):
        for address in addresses(zipcode):
            line = json.dumps(address, ensure_ascii=False)
            print(line)
if __name__ == '__main__':
    import argparse

    # The script defines no options of its own; the parser exists so that
    # -h/--help prints the module docstring and stray arguments are rejected.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__,
    )
    # The parser was previously built but never consulted, so --help silently
    # started a full scrape. Parsing makes it take effect.
    parser.parse_args()
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment