from lxml import html
import requests
import unicodecsv as csv
import argparse
import json


def clean(text):
    if text:
        return ' '.join(' '.join(text).split())
    return None


def get_headers():
    # Creating headers.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers


def create_url(zipcode, filter):
    # Creating Zillow URL based on the filter.
    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    print(url)
    return url


def save_to_file(response):
    # saving response to `response.html`
    with open("response.html", 'w') as fp:
        fp.write(response.text)


def write_data_to_csv(data):
    # saving scraped data to csv.
    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


def get_response(url):
    # Getting response from zillow.com.
    for i in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        if response.status_code != 200:
            # saving response to file for debugging purpose.
            save_to_file(response)
            continue
        else:
            save_to_file(response)
            return response
    return None


def get_data_from_json(raw_json_data):
    # getting data from json (type 2 of their A/B testing page)
    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    properties_list = []
    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults').get('listResults', [])
        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo')
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba, {area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')
            data = {'address': address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider': broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)
        return properties_list
    except ValueError:
        print("Invalid json")
        return None


def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    if not search_results:
        print("parsing from json data")
        # identified as type 2 page
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    print("parsing from html page")
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")
        address = clean(raw_address)
        city = clean(raw_city)
        state = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info).replace(u"\xb7", ',')
        broker = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://www.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
        properties = {'address': address,
                      'city': city,
                      'state': state,
                      'postal_code': postal_code,
                      'price': price,
                      'facts and features': info,
                      'real estate provider': broker,
                      'url': property_url,
                      'title': title}
        if is_forsale:
            properties_list.append(properties)
    return properties_list


if __name__ == "__main__":
    # Reading arguments
    argparser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help = """
    available sort orders are :
    newest : Latest property details,
    cheapest : Properties with cheapest price
    """
    argparser.add_argument('sort', nargs='?', help=sortorder_help, default='Homes For You')
    args = argparser.parse_args()
    zipcode = args.zipcode
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode, sort)
    if scraped_data:
        print("Writing data to output file")
        write_data_to_csv(scraped_data)
Can someone clue me in as to why the results always cap at 200? Is Zillow limiting the return? Seems no matter what zipcode I try, only 200 results ever come back. No errors or anything.
200 is not the result count; it's the HTTP status code received from zillow.com. The script only fetches the first page of results.
Any way of getting all the pages for a given search?
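One possible approach is sketched below. It is not verified against zillow.com: the "/{n}_p/" page segment is an assumption about Zillow's URL routing, and the sketch reuses get_response() and get_data_from_json() from the gist above.

from lxml import html

def parse_all_pages(zipcode, max_pages=20):
    all_results = []
    for page in range(1, max_pages + 1):
        # Assumed URL shape; adjust if Zillow's routing has changed.
        url = "https://www.zillow.com/homes/for_sale/{0}/{1}_p/".format(zipcode, page)
        response = get_response(url)
        if not response:
            break
        parser = html.fromstring(response.text)
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        if not raw_json_data:
            break  # no embedded JSON; the page layout may have changed
        page_results = get_data_from_json(raw_json_data)
        if not page_results:
            break  # ran out of pages
        all_results.extend(page_results)
    return all_results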
Awesome code!
Please let me know how to fetch "contactPhone", or what contactPhone's XPath is.
[rob@rawbdorable zillow] (master)$ python3 zillow.py 02126
Fetching data for 02126
https://www.zillow.com/homes/for_sale/02126_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy
status code received: 200
parsing from json data
Traceback (most recent call last):
File "zillow.py", line 185, in
scraped_data = parse(zipcode, sort)
File "zillow.py", line 129, in parse
return get_data_from_json(raw_json_data)
File "zillow.py", line 74, in get_data_from_json
cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
AttributeError: 'NoneType' object has no attribute 'replace'
I'm having the same problem as robstryker. Been a Node.js dev for about a year, but first time doing scraping.
My issues are on lines 123/128:
search_results = parser.xpath("//div[@id='search-results']//article")
raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
Both of these are returning empty arrays. I can see from viewing the html that there isn't a 'search-results' div, but there is a 'grid-search-results'. Editing this has not changed the result, though; it's always an empty array. Similarly, I can find 'mobileSearchPageStore' in the html. In both places, I see the text we are trying to filter down to (address, price, etc.).
I'm not looking at the generated 'response.html' from running the command, though; this is me manually saving the webpage as html.
I added a few print statements for response.text, search_results, and raw_json_data; output below. My next thought is that the response mentions recaptcha and has a header with text saying to verify that I'm a human to continue. I'm more inclined to think I'm doing something wrong than that the method is suddenly blocked, though.
While composing this I did some more research, and I noticed that their robots.txt has a disallow line for /homes/*_rb, which seems to be exactly what we are doing. Would that be a sufficient reason, or am I reading too much into this?
PS E:\Projects\Zillow> python zillow.py 02126 newest
Fetching data for 02126
https://www.zillow.com/homes/for_sale/02126/0_singlestory/days_sort
status code received: 200
parser <html><head><meta name="robots" content="noindex, nofollow"/><script src="https://www.google.com/recaptcha/api.js"></script><link href="https://www.zillowstatic.com/vstatic/80d5e73/static/css/z-pages/captcha.css" type="text/css" rel="stylesheet" media="screen"/><script>
function handleCaptcha(response) {
var vid = getQueryString("vid"); // getQueryString is implemented below
var uuid = getQueryString("uuid");
var name = '_pxCaptcha';
var cookieValue = btoa(JSON.stringify({r:response,v:vid,u:uuid}));
var cookieParts = [name, '=', cookieValue, '; path=/'];
cookieParts.push('; domain=' + window.location.hostname);
cookieParts.push('; max-age=10');//expire after 10 seconds
document.cookie = cookieParts.join('');
var originalURL = getOriginalUrl("url");
var originalHost = window.location.host;
var newHref = window.location.protocol + "//" + originalHost;
originalURL = originalURL || '/';
newHref = newHref + originalURL;
window.location.href = newHref;
}
function getQueryString(name, url) {
if (!url) url = window.location.href;
name = name.replace(/[\[\]]/g, "\\$&");
var regex = new RegExp("[?&]" + name + "(=([^&#]*)|&|#|$)"),
results = regex.exec(url);
if (!results) return null;
if (!results[2]) return '';
return decodeURIComponent(results[2].replace(/\+/g, " "));
}
function getOriginalUrl(name) {
var url = getQueryString(name);
if (!url) return null;
var regExMatcher = new RegExp("(([^&#@]*)|&|#|$)");
var matches = regExMatcher.exec(url);
if (!matches) return null;
return matches[0];
}
document.addEventListener("DOMContentLoaded", function (e) {
var uuidVerifyRegExp = /^\{?[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\}?$/i;
document.getElementById("uuid").innerText = "UUID: " + uuidVerifyRegExp.exec(getQueryString("uuid"));
});
</script></head><body><main class="zsg-layout-content"><div class="error-content-block"><div class="error-text-content"><!-- <h1>Captcha</h1> --><h5>Please verify you're a human to continue.</h5><div id="content" class="captcha-container"><div class="g-recaptcha" data-theme="white" data-callback="handleCaptcha" data-sitekey="6Lcj-R8TAAAAABs3FrRPuQhLMbp5QrHsHufzLf7b"></div></div><img src="https://www.zillowstatic.com/static/logos/logo-65x14.png" width="65" alt="Zillow" height="14"></img></div></div></main><h4 id="uuid" class="uuid-string zsg-fineprint"></h4></body></html><!-- H:028 T:17ms S:2686 R:Sun Jul 07 16:21:49 PDT 2019 B:5.0.61033-master.e804620~delivery_ready.cd00c91 -->
search_results []
parsing from json data
raw json data []
Traceback (most recent call last):
File "zillow.py", line 188, in <module>
scraped_data = parse(zipcode, sort)
File "zillow.py", line 132, in parse
return get_data_from_json(raw_json_data)
File "zillow.py", line 74, in get_data_from_json
cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
AttributeError: 'NoneType' object has no attribute 'replace'
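On the robots.txt question above: the standard library can confirm what the disallow rules say (this only checks the stated policy; it doesn't explain the captcha):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("https://www.zillow.com/robots.txt")
rp.read()
# Prints False if /homes/*_rb is disallowed for generic crawlers.
print(rp.can_fetch("*", "https://www.zillow.com/homes/for_sale/02126_rb/"))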
@kyle-moeller, any update?
Here is how to fix the code:
Add this line to the beginning import section:
from urllib.request import Request, urlopen
Then rewrite part of the "parse" function with the below code:
def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None
    # These two new lines are added
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    # replace the parser to take the input added above
    # parser = html.fromstring(response.text)
    parser = html.fromstring(webpage)
Hi there - how can I make this iterate through more pages?
> Here is how to fix the code: add `from urllib.request import Request, urlopen` to the imports and rewrite part of the "parse" function [quoting the fix above]
Thank you! It works out!!!
I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.
> Hi there - how can I make this iterate through more pages?
I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.
Has anyone been able to figure out how to iterate this through all pages?
Can someone help me with this error?
%run scraped.py 92115 newest
Fetching data for 92115
https://www.zillow.com/homes/for_sale/92115/0_singlestory/days_sort
status code received: 200
Traceback (most recent call last):
File "C:\Users\user\Desktop\scraped.py", line 185, in
scraped_data = parse(zipcode, sort)
File "C:\Users\user\Desktop\scraped.py", line 116, in parse
response = get_response(url)
File "C:\Users\user\Desktop\scraped.py", line 67, in get_response
save_to_file(response)
File "C:\Users\user\Desktop\scraped.py", line 42, in save_to_file
fp.write(response.text)
File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to <undefined>
> I want to make two modifications: scrape all results and add additional filters. I already made the corrections above and have the code working. If anyone wants to help, send me a message.
Hi friends. Has anyone helped you with this?
File "C:\Users\user\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to
I think adding a UTF-8 parameter will fix this. Take a look at this Stack Overflow link; it worked for me.
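For reference, the fix amounts to passing an explicit encoding when writing the file (plain built-in open works on Python 3, no extra library needed):

def save_to_file(response):
    # Explicit UTF-8 avoids Windows' default cp1252 codec, which
    # can't represent characters like '\u0100'.
    with open("response.html", 'w', encoding="utf-8") as fp:
        fp.write(response.text)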
can anyone help me with this error?
https://www.zillow.com/homes/for_sale/98075/0_singlestory/days_sort
status code received: 200
parsing from json data
Traceback (most recent call last):
File "C:/Users/Downloads/zillow.py", line 191, in
scraped_data = parse(zipcode, sort)
File "C:/Users/Downloads/zillow.py", line 135, in parse
return get_data_from_json(raw_json_data)
File "C:/Users/Downloads/zillow.py", line 80, in get_data_from_json
search_results = json_data.get('searchResults').get('listResults', [])
AttributeError: 'NoneType' object has no attribute 'get'
Replace this:
search_results = json_data.get('searchResults').get('listResults', [])
with this:
search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
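A slightly more defensive variant that tolerates both JSON layouts (a sketch; it assumes these are the only two key arrangements Zillow serves):

def extract_list_results(json_data):
    # Newer pages nest results under 'cat1'; older ones expose
    # 'searchResults' at the top level. 'or {}' guards against None values.
    container = json_data.get('cat1') or json_data
    return (container.get('searchResults') or {}).get('listResults', [])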
I can't get past the captcha; anyone have any ideas?
I gave up on this, and instead used the Zillow to Excel plugin with Chromium. It is not completely automated, but was sufficient to get the data that I wanted with a few minutes of effort (for sale and sold for several zip code regions). https://chrome.google.com/webstore/detail/zillow-to-excel/aecdekdgjlncaadbdiciepplaobhcjgi?hl=en
Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!
> I want to make two modifications: scrape all results and add additional filters. [...]
> can anyone help me with this error?
> [...]
> AttributeError: 'NoneType' object has no attribute 'get'
Any luck? I'm getting the same error.
Thanks for all the comments above! I fixed the code based on your comments and it works!
I modified two parts:
First, replace row 81:
the wrong version: search_results = json_data.get('searchResults').get('listResults', [])
the correct version: search_results = json_data.get('cat1').get('searchResults').get('listResults', [])
Second, import io, and then replace
with open("response.html", 'w') as fp:
with:
with io.open("response.html", 'w', encoding="utf-8") as fp:
Note: to run the code, use "python zillow.py 60615" in the terminal, where 60615 is an example zip code.
And if you want to scrape Zillow data for multiple zip codes, use the following (requires import os):

run_cmd = 'python zillow.py '
for zc in zip_code_lst:
    z_run_cmd = run_cmd + zc
    os.system(z_run_cmd)

where zip_code_lst is a list of zip codes.
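If it helps, subprocess is the more idiomatic way to do the same thing; a small sketch (zip_code_lst and the zillow.py filename are just the examples from above):

import subprocess

zip_code_lst = ["60615", "02126"]  # example zip codes
for zc in zip_code_lst:
    # One scraper run per zip code; each run writes its own properties-<zip>.csv
    subprocess.run(["python", "zillow.py", zc])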
I was able to get past the captcha by changing the headers:

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
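If these headers alone still trip the captcha, one thing worth trying (a sketch, not guaranteed to help) is a requests.Session so cookies are reused across requests:

import requests

session = requests.Session()
session.headers.update(headers)  # the header dict above
response = session.get("https://www.zillow.com/homes/for_sale/02126_rb/")
print(response.status_code)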
> Can someone help me with this error?
> [...]
> UnicodeEncodeError: 'charmap' codec can't encode character '\u0100' in position 29381: character maps to <undefined>
You need to write in binary mode, i.e.
with open(YOURFILE, "wb") as f:
    f.write(SOMETHING)
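Applied to save_to_file, that looks like the sketch below; note that binary mode needs response.content (bytes), since response.text is a str:

def save_to_file(response):
    # Binary mode sidesteps the console/locale encoding entirely.
    with open("response.html", "wb") as fp:
        fp.write(response.content)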
> Looks like Zillow has been blocking almost all kinds of automated requests. Have tried multiple user agents, and it lets me create a soup once, but not iteratively. If someone finds a solution, please feel free to share it here!
???
Hi, can someone help me? I keep getting this error:
usage: zillow.py [-h] zipcode [sort]
zillow.py: error: the following arguments are required: zipcode
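That argparse message just means the required zipcode positional argument was left out; the script expects something like:

python zillow.py 02126 newest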