-
-
Save jhaubrich/bd9fa2914d849a074838becc6bfa151d to your computer and use it in GitHub Desktop.
Python 3 script to find real estate listings of properties up for sale on zillow.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import requests | |
import unicodecsv as csv | |
import argparse | |
import json | |
def clean(text): | |
if text: | |
return ' '.join(' '.join(text).split()) | |
return None | |
def get_headers(): | |
# Creating headers. | |
headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', | |
'accept-encoding': 'gzip, deflate, sdch, br', | |
'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4', | |
'cache-control': 'max-age=0', | |
'upgrade-insecure-requests': '1', | |
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'} | |
return headers | |
def create_url(zipcode, filter): | |
# Creating Zillow URL based on the filter. | |
if filter == "newest": | |
url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode) | |
elif filter == "cheapest": | |
url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode) | |
else: | |
url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode) | |
print(url) | |
return url | |
def save_to_file(response): | |
# saving response to `response.html` | |
with open("response.html", 'w') as fp: | |
fp.write(response.text) | |
def write_data_to_csv(data): | |
# saving scraped data to csv. | |
with open("properties-%s.csv" % (zipcode), 'wb') as csvfile: | |
fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url'] | |
writer = csv.DictWriter(csvfile, fieldnames=fieldnames) | |
writer.writeheader() | |
for row in data: | |
writer.writerow(row) | |
def get_response(url): | |
# Getting response from zillow.com. | |
for i in range(5): | |
response = requests.get(url, headers=get_headers()) | |
print("status code received:", response.status_code) | |
if response.status_code != 200: | |
# saving response to file for debugging purpose. | |
save_to_file(response) | |
continue | |
else: | |
save_to_file(response) | |
return response | |
return None | |
def get_data_from_json(raw_json_data): | |
# getting data from json (type 2 of their A/B testing page) | |
cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "") | |
properties_list = [] | |
try: | |
json_data = json.loads(cleaned_data) | |
search_results = json_data.get('searchResults').get('listResults', []) | |
for properties in search_results: | |
address = properties.get('addressWithZip') | |
property_info = properties.get('hdpData', {}).get('homeInfo') | |
city = property_info.get('city') | |
state = property_info.get('state') | |
postal_code = property_info.get('zipcode') | |
price = properties.get('price') | |
bedrooms = properties.get('beds') | |
bathrooms = properties.get('baths') | |
area = properties.get('area') | |
info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft' | |
broker = properties.get('brokerName') | |
property_url = properties.get('detailUrl') | |
title = properties.get('statusText') | |
data = {'address': address, | |
'city': city, | |
'state': state, | |
'postal_code': postal_code, | |
'price': price, | |
'facts and features': info, | |
'real estate provider': broker, | |
'url': property_url, | |
'title': title} | |
properties_list.append(data) | |
return properties_list | |
except ValueError: | |
print("Invalid json") | |
return None | |
def parse(zipcode, filter=None): | |
url = create_url(zipcode, filter) | |
response = get_response(url) | |
if not response: | |
print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.") | |
return None | |
parser = html.fromstring(response.text) | |
search_results = parser.xpath("//div[@id='search-results']//article") | |
if not search_results: | |
print("parsing from json data") | |
# identified as type 2 page | |
raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()') | |
return get_data_from_json(raw_json_data) | |
print("parsing from html page") | |
properties_list = [] | |
for properties in search_results: | |
raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()") | |
raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()") | |
raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()") | |
raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()") | |
raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()") | |
raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()") | |
raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()") | |
url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href") | |
raw_title = properties.xpath(".//h4//text()") | |
address = clean(raw_address) | |
city = clean(raw_city) | |
state = clean(raw_state) | |
postal_code = clean(raw_postal_code) | |
price = clean(raw_price) | |
info = clean(raw_info).replace(u"\xb7", ',') | |
broker = clean(raw_broker_name) | |
title = clean(raw_title) | |
property_url = "https://www.zillow.com" + url[0] if url else None | |
is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]') | |
properties = {'address': address, | |
'city': city, | |
'state': state, | |
'postal_code': postal_code, | |
'price': price, | |
'facts and features': info, | |
'real estate provider': broker, | |
'url': property_url, | |
'title': title} | |
if is_forsale: | |
properties_list.append(properties) | |
return properties_list | |
if __name__ == "__main__": | |
# Reading arguments | |
argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) | |
argparser.add_argument('zipcode', help='') | |
sortorder_help = """ | |
available sort orders are : | |
newest : Latest property details, | |
cheapest : Properties with cheapest price | |
""" | |
argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You') | |
args = argparser.parse_args() | |
zipcode = args.zipcode | |
sort = args.sort | |
print ("Fetching data for %s" % (zipcode)) | |
scraped_data = parse(zipcode, sort) | |
if scraped_data: | |
print ("Writing data to output file") | |
write_data_to_csv(scraped_data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment