Created
April 19, 2018 01:11
-
-
Save camsaul/2b637f9baf0906c69c7b306b845d7c74 to your computer and use it in GitHub Desktop.
Generating real fake addresses
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns addr | |
(:require [cheshire.core :as json] | |
[clj-http.client :as http] | |
[clojure.java | |
[io :as io] | |
[jdbc :as jdbc]] | |
[clojure.pprint :as pprint] | |
[metabase.db :as mdb] | |
[metabase.util.schema :as su] | |
[schema.core :as s])) | |
;; Generating Random Addresses | |
;; | |
;; 1. Get Shapefile of US @ https://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_nation_5m.zip | |
;; | |
;; 2. Create a PostGIS-enabled Postgres DB. I named mine 'america'; modify the `db-conn` details below to name it | |
;; something else | |
;; | |
;; CREATE EXTENSION postgis; | |
;; | |
;; 3. Load shape file into DB. | |
;; | |
;; shp2pgsql -s SRID /path/to/cb_2017_us_nation_5m/cb_2017_us_nation_5m.shp | psql -d america -U cam | |
;; | |
;; 4. Generate one or more Google API keys and set them as `google-api-keys` below. The code picks one of the keys
;;    at random for each request. A single key should yield roughly 500 to 1000 addresses before hitting the daily
;;    limit; if you plan to generate more than that, make sure you generate multiple keys.
;; | |
;; 5. In a REPL run `(generate-and-save-addresses!)` | |
(def ^:private google-api-keys
  "Google API keys used for reverse geocoding; one is picked at random per request."
  ["<PUT YOUR API KEYS HERE>"])

(def ^:private dest-file
  "Path of the file the generated addresses are pretty-printed to."
  "sample_dataset/metabase/sample_dataset/addresses_2.edn")

(def ^:private num-addresses-to-generate
  "How many addresses a single run should produce."
  1000)
;; JDBC connection spec for the PostGIS-enabled database that holds the US nation shapefile.
;; Change the database name in `:subname` if yours is not called 'america'.
(def db-conn
  {:classname   "org.postgresql.Driver" ; must be in classpath
   :subprotocol "postgresql"
   ;; plain literal: the single-argument `str` wrapper was a no-op
   :subname     "//localhost:5432/america?OpenSourceSubProtocolOverride=true"})
;; Created once per REPL session (`defonce`) from `db-conn`, so re-evaluating this namespace
;; does not build another pool.
(defonce connection-pool (mdb/connection-pool db-conn))
(defn- valid-coordinate?
  "Asks the database whether the `[lat lon]` pair lies inside the geometry stored in the
  `cb_2017_us_nation_5m` table. Returns the `:st_contains` value of the first result row."
  [[lat lon]]
  (let [sql  "SELECT ST_Contains(usa.geom, ST_Point(?, ?)) FROM cb_2017_us_nation_5m usa"
        ;; ST_Point takes (x, y), so longitude is passed before latitude
        rows (jdbc/query connection-pool [sql lon lat])]
    (:st_contains (first rows))))
(def ^:private upper-left
  "`[lat lon]` of one corner of the box random candidate coordinates are drawn from."
  [71.0 -170.0])

(def ^:private lower-right
  "`[lat lon]` of the opposite corner of the box random candidate coordinates are drawn from."
  [25.1 -62.4])
(defn- rand-point-in-range
  "Returns a random number between `x` and `y`; the two bounds may be given in either order."
  [x y]
  (let [lo   (min x y)
        span (- (max x y) lo)]
    ;; scale a random fraction in [0, 1) by the width of the range, then shift to the lower bound
    (-> (rand)
        (* span)
        (+ lo))))
(defn- maybe-valid-random-coordinate
  "Returns a random `[lat lon]` pair inside the box spanned by `upper-left` and `lower-right`.
  The point is not checked against the US geometry, so it may fall outside it."
  []
  (let [[top left]     upper-left
        [bottom right] lower-right]
    [(rand-point-in-range top bottom)
     (rand-point-in-range left right)]))
(defn- valid-random-coordinate
  "Keeps drawing random candidate coordinates until one passes `valid-coordinate?`,
  then returns that `[lat lon]` pair."
  []
  ;; `repeatedly` yields an unchunked lazy seq, so candidates are generated and checked one at a time
  (->> (repeatedly maybe-valid-random-coordinate)
       (filter valid-coordinate?)
       first))
(defn- valid-zip-string?
  "Returns true when `s` is a string of exactly five ASCII digits other than \"00000\",
  and false for anything else.

  Uses a regex instead of `Integer/parseInt`, which throws on non-numeric five-character
  input (e.g. \"ABCDE\") and accepts signed input such as \"+1234\"."
  [s]
  (boolean
   (and (string? s)
        (re-matches #"[0-9]{5}" s)
        ;; the original required 0 < n, so the all-zero code stays rejected
        (not= s "00000"))))
;; Schema for a normalized address: numeric coordinates, non-blank address parts, a state
;; abbreviation from a fixed set, and a zip string that satisfies `valid-zip-string?`.
(def ^:private Address
  {:lat          Double
   :lon          Double
   :house-number su/NonBlankString
   :street       su/NonBlankString
   :city         su/NonBlankString
   ;; same set of abbreviations as before, listed alphabetically
   :state-abbrev (s/enum "AA" "AE" "AK" "AL" "AP" "AR" "AS" "AZ"
                         "CA" "CO" "CT"
                         "DC" "DE"
                         "FL" "FM"
                         "GA" "GU"
                         "HI"
                         "IA" "ID" "IL" "IN"
                         "KS" "KY"
                         "LA"
                         "MA" "MD" "ME" "MH" "MI" "MN" "MO" "MP" "MS" "MT"
                         "NC" "ND" "NE" "NH" "NJ" "NM" "NV" "NY"
                         "OH" "OK" "OR"
                         "PA" "PR" "PW"
                         "RI"
                         "SC" "SD"
                         "TN" "TX"
                         "UT"
                         "VA" "VI" "VT"
                         "WA" "WI" "WV" "WY")
   :zip          (s/constrained su/NonBlankString valid-zip-string?)})
;; Reshapes a geocoding result (as produced by `get-first-result`) into an `Address` map.
;;
;; `addr` carries `:lat`/`:lon` plus one entry per address component keyed by component type;
;; the relevant `:long_name`/`:short_name` values are picked out here.
;;
;; `^:always-validate` makes the `Address` output check run regardless of the global Schema
;; fn-validation setting. `maybe-normalize-address` relies on this function throwing for
;; incomplete results, which otherwise only happens when validation is switched on elsewhere.
(s/defn ^:private ^:always-validate normalize-address :- Address
  [{:keys [lat lon], :as addr}]
  {:lat          lat
   :lon          lon
   :house-number (get-in addr [:street_number :long_name])
   :street       (get-in addr [:route :long_name])
   :city         (get-in addr [:locality :long_name])
   :state-abbrev (get-in addr [:administrative_area_level_1 :short_name])
   :zip          (get-in addr [:postal_code :short_name])})
(defn- maybe-normalize-address
  "Returns the normalized form of `addr`, or nil when `normalize-address` throws an exception
  (for example because the result does not satisfy the `Address` schema).

  Only `Exception` is caught, so JVM `Error`s such as OutOfMemoryError propagate instead of
  being silently treated as an unusable address."
  [addr]
  (try
    (normalize-address addr)
    (catch Exception _
      nil)))
(defn- get-first-result
  "Flattens the first entry of a parsed geocoding response into a single map.

  Throws an Exception carrying the response's `:error_message` when it is non-empty.
  Otherwise returns a map with `:lat` and `:lon` taken from the first result's
  `[:geometry :location]`, plus one entry per address component, keyed by the keyword form
  of the component's first type and holding the component without its `:types`."
  [response]
  (let [error-message (:error_message response)
        top-result    (first (:results response))
        location      (get-in top-result [:geometry :location])
        component->kv (fn [component]
                        [(keyword (first (:types component)))
                         (dissoc component :types)])]
    (when (seq error-message)
      (throw (Exception. (str error-message))))
    (into {:lat (:lat location), :lon (:lng location)}
          (map component->kv)
          (:address_components top-result))))
(defn- reverse-geocode-coordinate
  "Reverse-geocodes the `[lat lon]` pair with the Google Geocoding API, using one of
  `google-api-keys` chosen at random. Returns the normalized address, or nil when the first
  result cannot be normalized. Throws when the response carries an error message
  (see `get-first-result`).

  The coordinate is formatted with `Locale/US` so the decimal separator is always a dot;
  plain `format` uses the JVM default locale and emits a decimal comma in some locales,
  which corrupts the `latlng` parameter. Parameters are passed via `:query-params` so
  clj-http URL-encodes them rather than splicing them into the URL by hand."
  [[lat lon]]
  (let [latlng (String/format java.util.Locale/US "%f,%f" (to-array [lat lon]))]
    (-> (http/get "https://maps.googleapis.com/maps/api/geocode/json"
                  {:query-params {"latlng" latlng
                                  "key"    (rand-nth google-api-keys)}})
        :body
        (json/parse-string keyword)
        get-first-result
        maybe-normalize-address)))
(defn- valid-random-coordinate-with-address
  "Draws valid random coordinates and reverse-geocodes each one until a lookup yields a
  non-nil address, then returns that address."
  []
  (loop []
    (if-let [address (reverse-geocode-coordinate (valid-random-coordinate))]
      address
      (recur))))
(defn- generate-addresses
  "Returns a lazy sequence of `num-addresses-to-generate` addresses, each produced by
  `valid-random-coordinate-with-address`."
  []
  (map (fn [_] (valid-random-coordinate-with-address))
       (range num-addresses-to-generate)))
(defn- generate-and-save-addresses!
  "Generates the addresses and pretty-prints them to `dest-file`.

  The lazy sequence is fully realized before the file is opened. Previously the file was
  opened (and truncated) first and the addresses were generated while printing, so a failure
  part-way through, such as an API error, left a partial file behind."
  []
  (let [addresses (doall (generate-addresses))]
    (binding [pprint/*print-right-margin* 120
              ;; REPL tooling may set these; nil guarantees the output is never elided
              *print-length*              nil
              *print-level*               nil]
      (with-open [writer (io/writer (io/file dest-file))]
        (pprint/pprint addresses writer)))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment