@tbbooher
Created May 27, 2025 01:30
#!/usr/bin/env python3
import json, re, unicodedata
import psycopg2
from datetime import datetime
# ——— CONFIG ———
DB_DSN = "host=localhost port=5432 user=tim dbname=bike_prices"
INPUT_FILE = "raw_data.jsonl"
SOURCE = "craigslist" # satisfies CHECK constraint
# ——— Regex helpers ———
year_rx = re.compile(r"^(19|20)\d{2}\s+")
frame_rx = re.compile(r"\b(carbon|aluminum|steel|titanium|scandium)\b", re.I)
wheel_rx = re.compile(r"\b(29|27\.5|27|26)\b")
drivetrain_rx = re.compile(r"(SLX|XT|XTR|Deore|GX|XX1)", re.I)
size_rx = re.compile(r"\b(XS|S|M|L|XL|XXL)\b", re.I)
travel_rx = re.compile(r"(\d{2,3})\s?mm", re.I)
# ——— Known “real” MTB brands ———
KNOWN_BRANDS = [
    "santa cruz","yt industries","specialized","trek","yeti","cannondale",
    "norco","pivot","canyon","giant","scott","salsa","commencal","kona",
    "juliana","polygon","marin","bmc","revel","privateer","devinci","allied",
    "alchemy","orbea","transition","fuji","felt","niner","ibis",
]
# pre-sort once (longest first) so “santa cruz” matches before “santa”
KNOWN_BRANDS = sorted(KNOWN_BRANDS, key=len, reverse=True)
# ——— Field parsers ———
def _ascii(txt: str) -> str:
    """strip accents & weird quotes so matching is easier"""
    return unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode()

def parse_brand_model(title: str):
    """return (brand, model) or (None, None) if brand not recognised"""
    t = _ascii(title).lower()
    t = year_rx.sub("", t).lstrip("-–— ").strip()  # drop leading year & dashes
    for brand in KNOWN_BRANDS:
        if t.startswith(brand):
            brand_title = brand.title()
            model_part = t[len(brand):].lstrip(" -_,")
            # take up to next 4 tokens as “model” (purely heuristic)
            model_tokens = model_part.split()[:4]
            model = " ".join(model_tokens) if model_tokens else None
            return brand_title, model
    return None, None
def parse_year(txt): m = year_rx.search(txt) ; return int(m.group()) if m else None
def parse_frame_material(txt): m = frame_rx.search(txt) ; return m.group(1).lower() if m else None
def parse_wheel_size(txt): m = wheel_rx.search(txt) ; return m.group(1) if m else None
def parse_drivetrain(txt): m = drivetrain_rx.search(txt) ; return m.group(1) if m else None
def parse_brake_type(txt):
    tl = txt.lower()
    if "hydraulic" in tl: return "hydraulic disc"
    if "disc" in tl: return "mechanical disc"
    return None

def parse_suspension(txt):
    for f in ("Fox","RockShox","Marzocchi"):
        if f.lower() in txt.lower(): return f
    return None

def parse_tires(txt):
    for t in ("minion","maxxis","schwalbe","continental","michelin"):
        if t in txt.lower(): return t.title()
    return None
def parse_size(txt): m = size_rx.search(txt) ; return m.group(1).upper() if m else None
def parse_travel(txt): m = travel_rx.search(txt) ; return f"{m.group(1)}mm" if m else None
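
# Illustrative sanity check: a hypothetical sample title run through the
# heuristics above, documenting what they are expected to return. The title
# is made up, not taken from the scraped data.
_sample = "2021 Santa Cruz Hightower carbon XT 29 size L Fox 150mm hydraulic brakes"
assert parse_brand_model(_sample) == ("Santa Cruz", "hightower carbon xt 29")
assert parse_year(_sample) == 2021
assert parse_frame_material(_sample) == "carbon"
assert parse_wheel_size(_sample) == "29"
assert parse_travel(_sample) == "150mm"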
# ——— DB ———
conn = psycopg2.connect(DB_DSN)
cur = conn.cursor()
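
# The target table is assumed to look roughly like the sketch below: the column
# names come from the INSERT statement further down, while the types and the
# CHECK constraint that the SOURCE comment refers to are guesses, not the
# author's actual schema. CREATE TABLE IF NOT EXISTS makes this a no-op when
# the real table already exists.
cur.execute("""
    CREATE TABLE IF NOT EXISTS bike_listings (
        listing_id     text PRIMARY KEY,
        city           text,
        post_date      timestamptz,
        price          numeric,
        currency       text,
        title          text,
        url            text,
        location       text,
        year           integer,
        brand          text,
        model          text,
        frame_material text,
        wheel_size     text,
        drivetrain     text,
        brake_type     text,
        suspension     text,
        tire_brand     text,
        frame_size     text,
        travel         text,
        raw_data       jsonb,
        source         text CHECK (source IN ('craigslist'))  -- guessed constraint
    );
""")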
with open(INPUT_FILE, encoding="utf-8") as fh:
    for line in fh:
        if not line.strip():
            continue  # skip blank lines in the .jsonl file
        rec = json.loads(line)
        city = rec["city"]
        scraped_at = datetime.fromisoformat(rec["scraped_at"])
        prod = rec["item"]
        offer = prod.get("offers", {})
        title = prod.get("name", "").strip()
        if not title:
            continue

        brand, model = parse_brand_model(title)
        if brand is None:
            # skip obscure / unrecognised brands
            continue

        try:
            price = float(offer.get("price", "0"))
        except (TypeError, ValueError):
            price = None

        row = dict(
            lid            = prod.get("@id"),
            city           = city,
            scraped_at     = scraped_at,
            price          = price,
            curr           = offer.get("priceCurrency", "USD"),
            title          = title,
            url            = prod.get("url", ""),
            loc            = offer.get("availableAtOrFrom", {})
                                  .get("address", {})
                                  .get("addressLocality"),
            year           = parse_year(title),
            brand          = brand,
            model          = model,
            frame_material = parse_frame_material(title),
            wheel_size     = parse_wheel_size(title),
            drivetrain     = parse_drivetrain(title),
            brake_type     = parse_brake_type(title),
            suspension     = parse_suspension(title),
            tire_brand     = parse_tires(title),
            frame_size     = parse_size(title),
            travel         = parse_travel(title),
            raw_json       = json.dumps(prod),
            source         = SOURCE,
        )
        cur.execute("""
            INSERT INTO bike_listings (
                listing_id, city, post_date, price, currency,
                title, url, location, year,
                brand, model, frame_material, wheel_size,
                drivetrain, brake_type, suspension,
                tire_brand, frame_size, travel, raw_data, source
            ) VALUES (
                %(lid)s, %(city)s, %(scraped_at)s, %(price)s, %(curr)s,
                %(title)s, %(url)s, %(loc)s, %(year)s,
                %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s,
                %(drivetrain)s, %(brake_type)s, %(suspension)s,
                %(tire_brand)s, %(frame_size)s, %(travel)s, %(raw_json)s, %(source)s
            )
            ON CONFLICT (listing_id) DO NOTHING;
        """, row)
conn.commit()
cur.close()
conn.close()
print("Analysis & import complete (only recognised brands kept).")