Last active
          November 11, 2019 17:24 
        
      - 
      
 - 
        
Save benwtrent/a8f83d2ccf01eb515004280d42fe9310 to your computer and use it in GitHub Desktop.  
    building out avg price prediction on a house given ashville air bnb listing data
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | { | |
| "listings-ash" : { | |
| "aliases" : { }, | |
| "mappings" : { | |
| "_meta" : { | |
| "created_by" : "ml-file-data-visualizer" | |
| }, | |
| "properties" : { | |
| "@timestamp" : { | |
| "type" : "date" | |
| }, | |
| "access" : { | |
| "type" : "text" | |
| }, | |
| "accommodates" : { | |
| "type" : "long" | |
| }, | |
| "amenities" : { | |
| "type" : "text" | |
| }, | |
| "availability_30" : { | |
| "type" : "long" | |
| }, | |
| "availability_365" : { | |
| "type" : "long" | |
| }, | |
| "availability_60" : { | |
| "type" : "long" | |
| }, | |
| "availability_90" : { | |
| "type" : "long" | |
| }, | |
| "bathrooms" : { | |
| "type" : "double" | |
| }, | |
| "bed_type" : { | |
| "type" : "keyword" | |
| }, | |
| "bedrooms" : { | |
| "type" : "long" | |
| }, | |
| "beds" : { | |
| "type" : "long" | |
| }, | |
| "calculated_host_listings_count" : { | |
| "type" : "long" | |
| }, | |
| "calculated_host_listings_count_entire_homes" : { | |
| "type" : "long" | |
| }, | |
| "calculated_host_listings_count_private_rooms" : { | |
| "type" : "long" | |
| }, | |
| "calculated_host_listings_count_shared_rooms" : { | |
| "type" : "long" | |
| }, | |
| "calendar_last_scraped" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "calendar_updated" : { | |
| "type" : "keyword" | |
| }, | |
| "cancellation_policy" : { | |
| "type" : "keyword" | |
| }, | |
| "city" : { | |
| "type" : "keyword" | |
| }, | |
| "cleaning_fee" : { | |
| "type" : "double" | |
| }, | |
| "country" : { | |
| "type" : "keyword" | |
| }, | |
| "country_code" : { | |
| "type" : "keyword" | |
| }, | |
| "description" : { | |
| "type" : "text" | |
| }, | |
| "experiences_offered" : { | |
| "type" : "keyword" | |
| }, | |
| "extra_people" : { | |
| "type" : "double" | |
| }, | |
| "first_review" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "guests_included" : { | |
| "type" : "long" | |
| }, | |
| "has_availability" : { | |
| "type" : "keyword" | |
| }, | |
| "host_about" : { | |
| "type" : "text" | |
| }, | |
| "host_acceptance_rate" : { | |
| "type" : "keyword" | |
| }, | |
| "host_has_profile_pic" : { | |
| "type" : "keyword" | |
| }, | |
| "host_id" : { | |
| "type" : "long" | |
| }, | |
| "host_identity_verified" : { | |
| "type" : "keyword" | |
| }, | |
| "host_is_superhost" : { | |
| "type" : "keyword" | |
| }, | |
| "host_listings_count" : { | |
| "type" : "long" | |
| }, | |
| "host_location" : { | |
| "type" : "keyword" | |
| }, | |
| "host_name" : { | |
| "type" : "keyword" | |
| }, | |
| "host_neighbourhood" : { | |
| "type" : "keyword" | |
| }, | |
| "host_picture_url" : { | |
| "type" : "keyword" | |
| }, | |
| "host_response_rate" : { | |
| "type" : "keyword" | |
| }, | |
| "host_response_time" : { | |
| "type" : "keyword" | |
| }, | |
| "host_since" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "host_thumbnail_url" : { | |
| "type" : "keyword" | |
| }, | |
| "host_total_listings_count" : { | |
| "type" : "long" | |
| }, | |
| "host_url" : { | |
| "type" : "keyword" | |
| }, | |
| "host_verifications" : { | |
| "type" : "text" | |
| }, | |
| "house_rules" : { | |
| "type" : "text" | |
| }, | |
| "instant_bookable" : { | |
| "type" : "keyword" | |
| }, | |
| "interaction" : { | |
| "type" : "text" | |
| }, | |
| "is_business_travel_ready" : { | |
| "type" : "keyword" | |
| }, | |
| "is_location_exact" : { | |
| "type" : "keyword" | |
| }, | |
| "jurisdiction_names" : { | |
| "type" : "keyword" | |
| }, | |
| "last_review" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "last_scraped" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "latitude" : { | |
| "type" : "double" | |
| }, | |
| "listing_id" : { | |
| "type" : "long" | |
| }, | |
| "listing_url" : { | |
| "type" : "keyword" | |
| }, | |
| "longitude" : { | |
| "type" : "double" | |
| }, | |
| "market" : { | |
| "type" : "keyword" | |
| }, | |
| "maximum_maximum_nights" : { | |
| "type" : "long" | |
| }, | |
| "maximum_minimum_nights" : { | |
| "type" : "long" | |
| }, | |
| "maximum_nights" : { | |
| "type" : "long" | |
| }, | |
| "maximum_nights_avg_ntm" : { | |
| "type" : "double" | |
| }, | |
| "minimum_maximum_nights" : { | |
| "type" : "long" | |
| }, | |
| "minimum_minimum_nights" : { | |
| "type" : "long" | |
| }, | |
| "minimum_nights" : { | |
| "type" : "long" | |
| }, | |
| "minimum_nights_avg_ntm" : { | |
| "type" : "double" | |
| }, | |
| "monthly_price" : { | |
| "type" : "double" | |
| }, | |
| "name" : { | |
| "type" : "text" | |
| }, | |
| "neighborhood_overview" : { | |
| "type" : "text" | |
| }, | |
| "neighbourhood_cleansed" : { | |
| "type" : "long" | |
| }, | |
| "notes" : { | |
| "type" : "text" | |
| }, | |
| "number_of_reviews" : { | |
| "type" : "long" | |
| }, | |
| "number_of_reviews_ltm" : { | |
| "type" : "long" | |
| }, | |
| "picture_url" : { | |
| "type" : "keyword" | |
| }, | |
| "price" : { | |
| "type" : "double" | |
| }, | |
| "property_type" : { | |
| "type" : "keyword" | |
| }, | |
| "require_guest_phone_verification" : { | |
| "type" : "keyword" | |
| }, | |
| "require_guest_profile_picture" : { | |
| "type" : "keyword" | |
| }, | |
| "requires_license" : { | |
| "type" : "keyword" | |
| }, | |
| "review_scores_accuracy" : { | |
| "type" : "long" | |
| }, | |
| "review_scores_checkin" : { | |
| "type" : "long" | |
| }, | |
| "review_scores_cleanliness" : { | |
| "type" : "long" | |
| }, | |
| "review_scores_communication" : { | |
| "type" : "long" | |
| }, | |
| "review_scores_location" : { | |
| "type" : "long" | |
| }, | |
| "review_scores_rating" : { | |
| "type" : "long" | |
| }, | |
| "review_scores_value" : { | |
| "type" : "long" | |
| }, | |
| "reviews_per_month" : { | |
| "type" : "double" | |
| }, | |
| "room_type" : { | |
| "type" : "keyword" | |
| }, | |
| "scrape_id" : { | |
| "type" : "date", | |
| "format" : "yyyyMMddHHmmss" | |
| }, | |
| "security_deposit" : { | |
| "type" : "double" | |
| }, | |
| "smart_location" : { | |
| "type" : "keyword" | |
| }, | |
| "space" : { | |
| "type" : "text" | |
| }, | |
| "square_feet" : { | |
| "type" : "long" | |
| }, | |
| "state" : { | |
| "type" : "keyword" | |
| }, | |
| "street" : { | |
| "type" : "text" | |
| }, | |
| "summary" : { | |
| "type" : "text" | |
| }, | |
| "transit" : { | |
| "type" : "text" | |
| }, | |
| "weekly_price" : { | |
| "type" : "double" | |
| }, | |
| "zipcode" : { | |
| "type" : "long" | |
| } | |
| } | |
| }, | |
| "settings" : { | |
| "index" : { | |
| "creation_date" : "1572547796565", | |
| "number_of_shards" : "1", | |
| "number_of_replicas" : "1", | |
| "uuid" : "HplZ3Q8qTai7kGpm0e-C5A", | |
| "version" : { | |
| "created" : "7040199" | |
| }, | |
| "provided_name" : "listings-ash" | |
| } | |
| } | |
| }, | |
| "listings-ash-calendar" : { | |
| "aliases" : { }, | |
| "mappings" : { | |
| "_meta" : { | |
| "created_by" : "ml-file-data-visualizer" | |
| }, | |
| "properties" : { | |
| "@timestamp" : { | |
| "type" : "date" | |
| }, | |
| "adjusted_price" : { | |
| "type" : "double" | |
| }, | |
| "available" : { | |
| "type" : "keyword" | |
| }, | |
| "date" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "listing_id" : { | |
| "type" : "long" | |
| }, | |
| "maximum_nights" : { | |
| "type" : "long" | |
| }, | |
| "minimum_nights" : { | |
| "type" : "long" | |
| }, | |
| "price" : { | |
| "type" : "double" | |
| } | |
| } | |
| }, | |
| "settings" : { | |
| "index" : { | |
| "creation_date" : "1572548275257", | |
| "number_of_shards" : "1", | |
| "number_of_replicas" : "1", | |
| "uuid" : "jrU9jPUARlWFrB0i6YjaTQ", | |
| "version" : { | |
| "created" : "7040199" | |
| }, | |
| "provided_name" : "listings-ash-calendar" | |
| } | |
| } | |
| }, | |
| "listings-ash-review" : { | |
| "aliases" : { }, | |
| "mappings" : { | |
| "_meta" : { | |
| "created_by" : "ml-file-data-visualizer" | |
| }, | |
| "properties" : { | |
| "@timestamp" : { | |
| "type" : "date" | |
| }, | |
| "comments" : { | |
| "type" : "text" | |
| }, | |
| "date" : { | |
| "type" : "date", | |
| "format" : "iso8601" | |
| }, | |
| "listing_id" : { | |
| "type" : "long" | |
| }, | |
| "review_id" : { | |
| "type" : "long" | |
| }, | |
| "reviewer_id" : { | |
| "type" : "long" | |
| }, | |
| "reviewer_name" : { | |
| "type" : "keyword" | |
| } | |
| } | |
| }, | |
| "settings" : { | |
| "index" : { | |
| "creation_date" : "1572548566924", | |
| "number_of_shards" : "1", | |
| "number_of_replicas" : "1", | |
| "uuid" : "px9n2sDvSPmU0cIDQHmXaA", | |
| "version" : { | |
| "created" : "7040199" | |
| }, | |
| "provided_name" : "listings-ash-review" | |
| } | |
| } | |
| } | |
| } | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | ####### | |
| # Preview transform merge of 3 indices on a common listing_id field | |
| ####### | |
| POST _data_frame/transforms/_preview | |
| { | |
| "source": { | |
| "index": [ | |
| "listings-calendars-ash", | |
| "listings-reviews-ash", | |
| "listings-ash-verbose-1" | |
| ] | |
| }, | |
| "pivot": { | |
| "group_by": { | |
| "listing_id": { | |
| "terms": { | |
| "field": "listing_id" | |
| } | |
| } | |
| }, | |
| "aggregations": { | |
| "max_price": { | |
| "max": { | |
| "field": "price" | |
| } | |
| }, | |
| "min_price": { | |
| "min": { | |
| "field": "price" | |
| } | |
| }, | |
| "avg_price": { | |
| "avg": { | |
| "field": "price" | |
| } | |
| }, | |
| "num_reviews": { | |
| "cardinality": { | |
| "field": "id" | |
| } | |
| }, | |
| "num_bedrooms": { | |
| "max": { | |
| "field": "bedrooms" | |
| } | |
| }, | |
| "num_beds": { | |
| "max": { | |
| "field": "beds" | |
| } | |
| }, | |
| "security_deposit": { | |
| "max": { | |
| "field": "security_deposit" | |
| } | |
| }, | |
| "num_bathrooms": { | |
| "max": { | |
| "field": "bathrooms" | |
| } | |
| }, | |
| "accommodates": { | |
| "max": { | |
| "field": "accommodates" | |
| } | |
| }, | |
| "location_geo": { | |
| "geo_centroid": { | |
| "field": "location_geo" | |
| } | |
| }, | |
| "review_score_rating": { | |
| "avg": { | |
| "field": "review_scores_rating" | |
| } | |
| }, | |
| "description": { | |
| "scripted_metric": { | |
| "init_script": "state.description = null", | |
| "map_script": "state.description = params._source.description", | |
| "combine_script": "return state.description", | |
| "reduce_script": "for (d in states) if (d != null) return d" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| ####### | |
| # Build and start transform | |
| ####### | |
| PUT _data_frame/transforms/listings-ash-pivot | |
| { | |
| "source": { | |
| "index": [ | |
| "listings-calendars-ash", | |
| "listings-reviews-ash", | |
| "listings-ash-verbose-1" | |
| ] | |
| }, | |
| "dest": { | |
| "index": "listings-ash-pivot" | |
| }, | |
| "pivot": { | |
| "group_by": { | |
| "listing_id": { | |
| "terms": { | |
| "field": "listing_id" | |
| } | |
| } | |
| }, | |
| "aggregations": { | |
| "max_price": { | |
| "max": { | |
| "field": "price" | |
| } | |
| }, | |
| "min_price": { | |
| "min": { | |
| "field": "price" | |
| } | |
| }, | |
| "avg_price": { | |
| "avg": { | |
| "field": "price" | |
| } | |
| }, | |
| "num_reviews": { | |
| "cardinality": { | |
| "field": "id" | |
| } | |
| }, | |
| "num_bedrooms": { | |
| "max": { | |
| "field": "bedrooms" | |
| } | |
| }, | |
| "num_beds": { | |
| "max": { | |
| "field": "beds" | |
| } | |
| }, | |
| "security_deposit": { | |
| "max": { | |
| "field": "security_deposit" | |
| } | |
| }, | |
| "num_bathrooms": { | |
| "max": { | |
| "field": "bathrooms" | |
| } | |
| }, | |
| "accommodates": { | |
| "max": { | |
| "field": "accommodates" | |
| } | |
| }, | |
| "location_geo": { | |
| "geo_centroid": { | |
| "field": "location_geo" | |
| } | |
| }, | |
| "review_score_rating": { | |
| "avg": { | |
| "field": "review_scores_rating" | |
| } | |
| }, | |
| "description": { | |
| "scripted_metric": { | |
| "init_script": "state.description = null", | |
| "map_script": "state.description = params._source.description", | |
| "combine_script": "return state.description", | |
| "reduce_script": "for (d in states) if (d != null) return d" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| POST _data_frame/transforms/listings-ash-pivot/_start | |
| GET _data_frame/transforms/listings-ash-pivot/_stats | |
| #### | |
| # Build outlier detection to weed out outliers on the pivoted data | |
| #### | |
| PUT _ml/data_frame/analytics/listings-ash-pivot-outliers | |
| { | |
| "source": { | |
| "index": [ | |
| "listings-ash-pivot" | |
| ] | |
| }, | |
| "dest": { | |
| "index": "listings-ash-pivot-outliers", | |
| "results_field": "ml" | |
| }, | |
| "analysis": { | |
| "outlier_detection": {} | |
| }, | |
| "analyzed_fields": { | |
| "excludes": [ | |
| "listing_id" | |
| ] | |
| } | |
| } | |
| ####### | |
| # Add a new document with my desired Air BnB parameters given our pivoted params | |
| ####### | |
| POST listings-ash-pivot-outliers/_doc | |
| { | |
| "num_bathrooms" : 2.0, | |
| "num_beds" : 4.0, | |
| "review_score_rating" : 90.0, | |
| "max_price" : 270.0, | |
| "min_price" : 100.0, | |
| "accommodates" : 4.0, | |
| "num_reviews" : 20.0, | |
| "num_bedrooms" : 2.0, | |
| "ml": { | |
| "outlier_score": 0.0 | |
| } | |
| } | |
| ###### | |
| # Build regression job to predict the average price given our pivoted features | |
| # dependent_variable = predicted field "avg_price" | |
| ###### | |
| PUT _ml/data_frame/analytics/listings-ash-pivot-outliers-pred | |
| { | |
| "source": { | |
| "index": [ | |
| "listings-ash-pivot-outliers" | |
| ], | |
| "query": { | |
| "range": { | |
| "ml.outlier_score": { | |
| "lt": 0.6 | |
| } | |
| } | |
| } | |
| }, | |
| "dest": { | |
| "index": "listings-ash-pivot-pred", | |
| "results_field": "ml_regression" | |
| }, | |
| "analysis": { | |
| "regression": { | |
| "dependent_variable": "avg_price" | |
| } | |
| }, | |
| "analyzed_fields": { | |
| "includes": [ | |
| "num_bathrooms", | |
| "num_beds", | |
| "avg_price", | |
| "review_score_rating", | |
| "max_price", | |
| "min_price", | |
| "accommodates", | |
| "num_reviews", | |
| "num_bedrooms" | |
| ] | |
| } | |
| } | |
| GET listings-ash-pivot-pred/_search | |
| { | |
| "query": { | |
| "term": { | |
| "ml_regression.is_training": { | |
| "value": false | |
| } | |
| } | |
| } | |
| } | |
| ####### | |
| # How accurate is our regression model? | |
| ####### | |
| POST _ml/data_frame/_evaluate | |
| { | |
| "index": "listings-ash-pivot-pred", | |
| "query": { | |
| "term": { | |
| "ml_regression.is_training": { | |
| "value": true | |
| } | |
| } | |
| }, | |
| "evaluation": { | |
| "regression": { | |
| "actual_field": "avg_price", | |
| "predicted_field": "ml_regression.avg_price_prediction", | |
| "metrics": { | |
| "r_squared": {}, | |
| "mean_squared_error": {} | |
| } | |
| } | |
| } | |
| } | |
| ####### | |
| # On average how far are we off? | |
| ####### | |
| POST /_scripts/painless/_execute | |
| { | |
| "script": { | |
| "source": "Math.sqrt(params.number)", | |
| "params": { | |
| "number": 348.4186538329173 | |
| } | |
| } | |
| } | 
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
Data is available here, it is a combination of the last years calendar data, listing details, and reviews: http://insideairbnb.com/get-the-data.html
I added all the data via the CSV uploader.