Skip to content

Instantly share code, notes, and snippets.

@ranchodeluxe
Last active January 25, 2025 21:27
Show Gist options
  • Save ranchodeluxe/643b6a1452a4cb4b16023c3c7032ba88 to your computer and use it in GitHub Desktop.
Save ranchodeluxe/643b6a1452a4cb4b16023c3c7032ba88 to your computer and use it in GitHub Desktop.
from postal.expand import expand_address
import sys
import xxhash
import pandas as pd
from sqlalchemy import create_engine, text
from datetime import datetime, timedelta
DATABASE_URI = "postgresql://kaiizenadmin:[email protected]/kaiizen"
engine = create_engine(DATABASE_URI)
def match_address_expansion(address):
expansions = expand_address(address)
expansion_hashes = [xxhash.xxh64(ex).intdigest() for ex in expansions]
query = """
SELECT *
FROM properties_address_expansions
WHERE expansion_hash = ANY(%(expansion_hashes)s)
"""
matches = pd.read_sql(query, engine, params={"expansion_hashes": expansion_hashes})
return matches
def get_matching_cherre_record(address_expansions):
"""
:param address_expansions:
:return:
"""
if address_expansions.loc[0].dwellsy_pk is None:
query = """
SELECT
DISTINCT(pae.cherre_pk),
ps.pk AS cherre_pk,
ps.addr AS properties_addr,
ps.addr_city AS properties_city,
ps.addr_state AS properties_state,
ps.addr_zip AS properties_zip,
ps.last_sale_price
FROM
properties_address_expansions pae
LEFT JOIN
properties_simple ps ON pae.cherre_pk = ps.pk
WHERE
pae.cherre_pk = %(cherre_pk)s
"""
comps = pd.read_sql(
query,
engine,
params={
"cherre_pk": int(address_expansions.loc[0].cherre_pk)
}
)
else:
query = """
SELECT
DISTINCT(pae.cherre_pk),
ps.pk AS cherre_pk,
ps.addr AS properties_addr,
ps.addr_city AS properties_city,
ps.addr_state AS properties_state,
ps.addr_zip AS properties_zip,
ps.last_sale_price,
ls.pk AS dwellsy_pk,
# ls.addr AS listings_addr,
# ls.addr_city AS listings_city,
# ls.addr_state AS listings_state,
# ls.addr_zip AS listings_zip,
ls.listing_amount,
ls.listing_deposit
FROM
properties_address_expansions pae
LEFT JOIN
properties_simple ps ON pae.cherre_pk = ps.pk
LEFT JOIN
listings_simple ls ON pae.dwellsy_pk = ls.pk
WHERE
pae.dwellsy_pk = %(dwellsy_pk)s AND
pae.cherre_pk = %(cherre_pk)s
"""
comps = pd.read_sql(
query,
engine,
params={
"dwellsy_pk": int(address_expansions.loc[0].dwellsy_pk),
"cherre_pk": int(address_expansions.loc[0].cherre_pk)
}
)
return comps
def find_comparables(dw_id: int):
"""
"""
with engine.connect() as connection:
# 1. Find Subject Property
subject_query = text("""
SELECT
pk,
last_sale_price AS "lastSalePrice",
last_sale_date AS "lastSaleDate",
bed_count AS "bedCount",
bath_count AS "bathCount",
year_built AS "yearBuilt",
sqft_lot_size AS "sqftLotSize",
addr_city AS city,
msa_name AS msa,
ST_X(coords) AS longitude,
ST_Y(coords) AS latitude
FROM properties_simple
WHERE pk = :dw_id
""")
subject_property = pd.read_sql(subject_query, connection, params={"dw_id": str(dw_id)}).iloc[0]
# 2.
is_rural = is_rural_area(subject_property["city"], subject_property["msa"])
proximity_distance_meters = 5 * 1609.34 if is_rural else 5 * 1609.34
# 3.
min_sale_price = subject_property["lastSalePrice"] * 0.85
max_sale_price = subject_property["lastSalePrice"] * 1.1
min_sale_date = datetime.now() - timedelta(days=712)
min_beds = (subject_property["bedCount"] or 1) - 1
max_beds = (subject_property["bedCount"] or 1) + 1
min_baths = (subject_property["bathCount"] or 1) - 1
max_baths = (subject_property["bathCount"] or 1) + 1
min_year_built = (subject_property["yearBuilt"] or 1700) - 5
max_year_built = (subject_property["yearBuilt"] or 1700) + 5
min_sqft = (subject_property["sqftLotSize"] or 0) - 200
max_sqft = (subject_property["sqftLotSize"] or 0) + 200
# 4.
comparables_query = text("""
SELECT
ps.pk AS "cherreId",
ps.addr AS "streetAddr",
ps.addr_city AS "city",
ps.addr_state AS "stateCode",
ps.addr_zip AS "zip",
ps.last_sale_price AS "propertyLastSalePrice",
ps.last_sale_date AS "propertyLastSaleDate",
ps.bed_count AS "propertyBedCount",
ps.bath_count AS "propertyBathCount",
ps.sqft_building AS "propertySqft",
ps.year_built AS "propertyYearBuilt",
ST_X(ps.coords) AS "propertyLongitude",
ST_Y(ps.coords) AS "propertyLatitude",
ST_DistanceSphere(
ps.coords,
ST_SetSRID(ST_MakePoint(:longitude, :latitude), 4326)
) / 1609.34 AS "distanceMiles"
FROM properties_simple ps
WHERE
ps.last_sale_price BETWEEN :min_sale_price AND :max_sale_price
AND ps.last_sale_date >= :min_sale_date
AND ps.bed_count BETWEEN :min_beds AND :max_beds
AND ps.bath_count BETWEEN :min_baths AND :max_baths
AND ps.year_built BETWEEN :min_year_built AND :max_year_built
AND ps.sqft_building BETWEEN :min_sqft AND :max_sqft
AND ST_DistanceSphere(
ps.coords,
ST_SetSRID(ST_MakePoint(:longitude, :latitude), 4326)
) <= :proximity_distance_meters
ORDER BY ps.pk, "distanceMiles"
LIMIT 50
""")
comparables = pd.read_sql(comparables_query, connection, params={
"longitude": subject_property["longitude"],
"latitude": subject_property["latitude"],
"min_sale_price": min_sale_price,
"max_sale_price": max_sale_price,
"min_sale_date": min_sale_date,
"min_beds": min_beds,
"max_beds": max_beds,
"min_baths": min_baths,
"max_baths": max_baths,
"min_year_built": min_year_built,
"max_year_built": max_year_built,
"min_sqft": min_sqft,
"max_sqft": max_sqft,
"proximity_distance_meters": proximity_distance_meters
})
# 5.
cherre_ids = comparables["cherreId"].tolist()
if not cherre_ids:
raise ValueError("No comparables found for subject property")
listings_query = text("""
SELECT DISTINCT ON (pae.cherre_pk)
pae.cherre_pk AS "cherreId",
ls.pk AS "dwellsyId",
ls.listing_amount AS "listingAmount",
ls.listing_deposit AS "listingDeposit"
FROM properties_address_expansions pae
INNER JOIN listings_simple ls ON pae.dwellsy_pk = ls.pk
WHERE pae.cherre_pk = ANY(:cherre_ids)
""")
listings = pd.read_sql(listings_query, connection, params={"cherre_ids": cherre_ids})
# 6. Merge comparables with listings
merged_data = pd.merge(comparables, listings, how="left", on="cherreId")
return merged_data.to_dict(orient="records")
def is_rural_area(city_name: str, msa_name: str) -> bool:
"""
Determines if a given city is considered rural.
:param city_name: The name of the city.
:param msa_name: The name of the Metropolitan Statistical Area (MSA).
:return: True if rural, False otherwise.
"""
return msa_name is None
if __name__ == "__main__":
all_address = """4942 W Novak Way Laveen AZ 85339
1466 E Avenida Fresca Casa Grande AZ 85122
1609 Carter Rd. Decatur GA 30032
2485 Hillside ave. Decatur GA 30032
1427 Beaverton Ave Cincinnati OH 45237
14875 Arrowhead Dr Yukon OK 73099
3430 Rocky Springs Ct Marietta GA 30062
950 Sugar Meadow Dr Sugar Hill GA 30518
5114 Crestway Dr, La Porte TX 77571
10415 Gloria St Gibsonton FL 33534
2859 Alexandria Dr SW Atlanta GA 30331""".split("\n")
for addr in all_address:
print(f"############################")
print(f"[ ADDRESS ]: {addr}")
expansions = match_address_expansion(addr)
print(expansions)
if not expansions.empty:
try:
comps = find_comparables(expansions.loc[0].cherre_pk)
print("f[ COMPS FOUND ]:")
print(comps)
except ValueError:
print(f"[ ERROR ]: no comps found for subject property: '{addr.lstrip()}'")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment