Quick helper script for mining Zillow data / commute times
""" | |
quick script put together to scrape zillow listing data | |
at the very least it functions so you don't need to go back and forth to calculate commute times | |
very messy... maybe i'll revisit this someday | |
""" | |
import requests
import re
import csv
import json

from pyzillow import pyzillow
from bs4 import BeautifulSoup
from gmaps import Geocoding, Directions
# AUTH FOR ZILLOW
zwsid = "zillow_id"  # replace with your Zillow Web Services ID (ZWSID)
zillow_data = pyzillow.ZillowWrapper(zwsid)

# Initialize a Directions client for Google Maps
api2 = Directions()
""" | |
or construct URL based on different parameters... | |
beds = | |
bath = | |
town = | |
state = | |
building = | |
min_rent = | |
max_rent = | |
... | |
""" | |
dest_1 = "42.463441, -71.266918"  # location 1
dest_2 = "42.74625, -71.181849"   # location 2

# Sample URL
url = "http://www.zillow.com/homes/for_rent/Cambridge-MA/house,condo,apartment_duplex,townhouse_type/3934_rid/2-_beds/1-_baths/0-757142_price/0-2800_mp/days_sort/42.396016,-71.035795,42.360764,-71.188574_rect/12_zm/"
def extractID(to_parse):
    """ just for cleanliness: pull the street address and zipcode out of a /homedetails/ link """
    slug = re.findall(r"/homedetails/(.*)/[1-9]", to_parse)[0]
    address = slug.replace("-", " ")[:-5].strip()
    zipcode = slug.replace("-", "")[-5:]
    return (str(address), str(zipcode))
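# Example (hypothetical link, for illustration only):
#   extractID("/homedetails/123-Main-St-Cambridge-MA-02139/12345678_zpid/")
#   -> ("123 Main St Cambridge MA", "02139")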
def downURL(url):
    """ downloads individual listing links for apts matching the search criteria """
    # append an (address, zipcode) tuple for each result to id_list
    id_list = []
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.findAll("a", {"class": "hdp-link routable"})
    for result in results:
        id_list.append(extractID(str(result)))
    return id_list
def getTravelDuration(route):
    """ calculate commute time by summing the duration of each step """
    duration = 0
    for i in route:
        parsed_route = json.loads(json.dumps(i))
        # walk the JSON to get the lat,lng and time at each step of the way
        for step in parsed_route['legs'][0]['steps']:
            lat = step['start_location']['lat']
            duration += step['duration']['value']
    return duration / 60.0  # step durations are in seconds; return minutes
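# For reference, a route passed to getTravelDuration is assumed to be shaped like
# an abbreviated Google Directions response (made-up values, for illustration):
#
#   sample_route = [{"legs": [{"steps": [
#       {"start_location": {"lat": 42.36, "lng": -71.09}, "duration": {"value": 900}},
#       {"start_location": {"lat": 42.40, "lng": -71.12}, "duration": {"value": 600}},
#   ]}]}]
#   getTravelDuration(sample_route)  # -> 25.0 minutes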
def getZillowData(add):
    """ create a dictionary of property data for an (address, zipcode) tuple """
    property_data = {}
    property_details = zillow_data.get_deep_search_results(add[0], add[1])
    result = pyzillow.GetDeepSearchResults(property_details)
    property_data['zillow_id'] = result.zillow_id
    property_data['lat'] = result.latitude
    property_data['lon'] = result.longitude
    property_data['size'] = result.home_size
    property_data['bath'] = result.bathrooms
    property_data['bed'] = result.bedrooms
    property_data['details'] = result.home_detail_link
    # Extract price from the listing page
    price_getter = requests.get(result.home_detail_link)
    soup = BeautifulSoup(price_getter.text, "html.parser")
    parse_me = soup.findAll("div", {"class": "main-row home-summary-row"})
    price_text = parse_me[0].get_text() if parse_me else ""
    property_data['price'] = re.findall(r"\$[\d,]+", price_text)[0]
    # Set route parameters
    address = "%s %s" % (add[0], add[1])
    route1 = api2.directions(address, dest_1, mode="driving")
    route2 = api2.directions(address, dest_2, mode="transit")
    # Get commute times (in minutes)
    property_data['dist_1'] = getTravelDuration(route1)
    property_data['dist_2'] = getTravelDuration(route2)
    return property_data
def toCSV(url):
    """ write out results to CSV """
    prop_list = downURL(url)
    properties = []
    # Get data for each property
    for i in prop_list:
        properties.append(getZillowData(i))
    keys = properties[0].keys()
    # Write to CSV file
    with open('filename.csv', 'w', newline='') as outfile:
        dict_writer = csv.DictWriter(outfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(properties)
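# Usage sketch: run the whole pipeline against the sample search URL above and
# dump everything to filename.csv (the output name and the zwsid placeholder
# above need to be filled in before this will actually work).
if __name__ == "__main__":
    toCSV(url)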