Created
May 27, 2025 01:30
-
-
Save tbbooher/1dc32c76c30ef7d49ce52c6de9e1f5f6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import json, re, unicodedata | |
| import psycopg2 | |
| from datetime import datetime | |
| # ——— CONFIG ——— | |
| DB_DSN = "host=localhost port=5432 user=tim dbname=bike_prices" | |
| INPUT_FILE = "raw_data.jsonl" | |
| SOURCE = "craigslist" # satisfies CHECK constraint | |
| # ——— Regex helpers ——— | |
| year_rx = re.compile(r"^(19|20)\d{2}\s+") | |
| frame_rx = re.compile(r"\b(carbon|aluminum|steel|titanium|scandium)\b", re.I) | |
| wheel_rx = re.compile(r"\b(29|27\.5|27|26)\b") | |
| drivetrain_rx = re.compile(r"(SLX|XT|XTR|Deore|GX|XX1)", re.I) | |
| size_rx = re.compile(r"\b(XS|S|M|L|XL|XXL)\b", re.I) | |
| travel_rx = re.compile(r"(\d{2,3})\s?mm", re.I) | |
| # ——— Known “real” MTB brands ——— | |
| KNOWN_BRANDS = [ | |
| "santa cruz","yt industries","specialized","trek","yeti","cannondale", | |
| "norco","pivot","canyon","giant","scott","salsa","commencal","kona", | |
| "juliana","polygon","marin","bmc","revel","privateer","devinci","allied", | |
| "alchemy","orbea","transition","fuji","felt","niner","ibis" | |
| ] | |
| # pre-sort once (longest first) so “santa cruz” matches before “santa” | |
| KNOWN_BRANDS = sorted(KNOWN_BRANDS, key=len, reverse=True) | |
| # ——— Field parsers ——— | |
| def _ascii(txt: str) -> str: | |
| """strip accents & weird quotes so matching is easier""" | |
| return unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode() | |
| def parse_brand_model(title: str): | |
| """return (brand, model) or (None, None) if brand not recognised""" | |
| t = _ascii(title).lower() | |
| t = year_rx.sub("", t).lstrip("-–— ").strip() # drop leading year & dashes | |
| for brand in KNOWN_BRANDS: | |
| if t.startswith(brand): | |
| brand_title = brand.title() | |
| model_part = t[len(brand):].lstrip(" -_,") | |
| # take up to next 4 tokens as “model” (purely heuristic) | |
| model_tokens = model_part.split()[:4] | |
| model = " ".join(model_tokens) if model_tokens else None | |
| return brand_title, model | |
| return None, None | |
| def parse_year(txt): m = year_rx.search(txt) ; return int(m.group()) if m else None | |
| def parse_frame_material(txt): m = frame_rx.search(txt) ; return m.group(1).lower() if m else None | |
| def parse_wheel_size(txt): m = wheel_rx.search(txt) ; return m.group(1) if m else None | |
| def parse_drivetrain(txt): m = drivetrain_rx.search(txt) ; return m.group(1) if m else None | |
| def parse_brake_type(txt): | |
| tl = txt.lower() | |
| if "hydraulic" in tl: return "hydraulic disc" | |
| if "disc" in tl: return "mechanical disc" | |
| return None | |
| def parse_suspension(txt): | |
| for f in ("Fox","RockShox","Marzocchi"): | |
| if f.lower() in txt.lower(): return f | |
| return None | |
| def parse_tires(txt): | |
| for t in ("minion","maxxis","schwalbe","continental","michelin"): | |
| if t in txt.lower(): return t.title() | |
| return None | |
| def parse_size(txt): m = size_rx.search(txt) ; return m.group(1).upper() if m else None | |
| def parse_travel(txt): m = travel_rx.search(txt) ; return f"{m.group(1)}mm" if m else None | |
| # ——— DB ——— | |
| conn = psycopg2.connect(DB_DSN) | |
| cur = conn.cursor() | |
| with open(INPUT_FILE, encoding="utf-8") as fh: | |
| for line in fh: | |
| rec = json.loads(line) | |
| city = rec["city"] | |
| scraped_at = datetime.fromisoformat(rec["scraped_at"]) | |
| prod = rec["item"] | |
| offer = prod.get("offers", {}) | |
| title = prod.get("name", "").strip() | |
| if not title: | |
| continue | |
| brand, model = parse_brand_model(title) | |
| if brand is None: | |
| # skip obscure / unrecognised brands | |
| continue | |
| try: | |
| price = float(offer.get("price", "0")) | |
| except ValueError: | |
| price = None | |
| row = dict( | |
| lid = prod.get("@id"), | |
| city = city, | |
| scraped_at = scraped_at, | |
| price = price, | |
| curr = offer.get("priceCurrency", "USD"), | |
| title = title, | |
| url = prod.get("url", ""), | |
| loc = offer.get("availableAtOrFrom", {}) | |
| .get("address", {}) | |
| .get("addressLocality"), | |
| year = parse_year(title), | |
| brand = brand, | |
| model = model, | |
| frame_material = parse_frame_material(title), | |
| wheel_size = parse_wheel_size(title), | |
| drivetrain = parse_drivetrain(title), | |
| brake_type = parse_brake_type(title), | |
| suspension = parse_suspension(title), | |
| tire_brand = parse_tires(title), | |
| frame_size = parse_size(title), | |
| travel = parse_travel(title), | |
| raw_json = json.dumps(prod), | |
| source = SOURCE | |
| ) | |
| cur.execute(""" | |
| INSERT INTO bike_listings ( | |
| listing_id, city, post_date, price, currency, | |
| title, url, location, year, | |
| brand, model, frame_material, wheel_size, | |
| drivetrain, brake_type, suspension, | |
| tire_brand, frame_size, travel, raw_data, source | |
| ) VALUES ( | |
| %(lid)s, %(city)s, %(scraped_at)s, %(price)s, %(curr)s, | |
| %(title)s, %(url)s, %(loc)s, %(year)s, | |
| %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s, | |
| %(drivetrain)s, %(brake_type)s, %(suspension)s, | |
| %(tire_brand)s, %(frame_size)s, %(travel)s, %(raw_json)s, %(source)s | |
| ) | |
| ON CONFLICT (listing_id) DO NOTHING; | |
| """, row) | |
| conn.commit() | |
| cur.close() | |
| conn.close() | |
| print("Analysis & import complete (only recognised brands kept).") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment