tbbooher · May 27, 2025 01:30
diff --git a/analyze_and_load_cl.py b/analyze_and_load_cl.py
 #!/usr/bin/env python3
 import json, re, unicodedata
 import psycopg2
 from datetime import datetime

 # ——— CONFIG ———
 DB_DSN      = "host=localhost port=5432 user=tim dbname=bike_prices"
 INPUT_FILE  = "raw_data.jsonl"
 SOURCE      = "craigslist"          # satisfies CHECK constraint

 # ——— Regex helpers ———
 year_rx       = re.compile(r"^(19|20)\d{2}\s+")
 frame_rx      = re.compile(r"\b(carbon|aluminum|steel|titanium|scandium)\b", re.I)
 wheel_rx      = re.compile(r"\b(29|27\.5|27|26)\b")
 drivetrain_rx = re.compile(r"(SLX|XT|XTR|Deore|GX|XX1)", re.I)
 size_rx       = re.compile(r"\b(XS|S|M|L|XL|XXL)\b", re.I)
 travel_rx     = re.compile(r"(\d{2,3})\s?mm", re.I)

 # ——— Known “real” MTB brands ———
 KNOWN_BRANDS = [
    "santa cruz","yt industries","specialized","trek","yeti","cannondale",
    "norco","pivot","canyon","giant","scott","salsa","commencal","kona",
    "juliana","polygon","marin","bmc","revel","privateer","devinci","allied",
    "alchemy","orbea","transition","fuji","felt","niner","ibis"
 ]
 # pre-sort once (longest first) so “santa cruz” matches before “santa”
 KNOWN_BRANDS = sorted(KNOWN_BRANDS, key=len, reverse=True)

 # ——— Field parsers ———
 def _ascii(txt: str) -> str:
    """strip accents & weird quotes so matching is easier"""
    return unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode()

 def parse_brand_model(title: str):
    """return (brand, model) or (None, None) if brand not recognised"""
    t = _ascii(title).lower()
    t = year_rx.sub("", t).lstrip("-–— ").strip()     # drop leading year & dashes

    for brand in KNOWN_BRANDS:
        if t.startswith(brand):
            brand_title = brand.title()
            model_part  = t[len(brand):].lstrip(" -_,")
            # take up to next 4 tokens as “model” (purely heuristic)
            model_tokens = model_part.split()[:4]
            model = " ".join(model_tokens) if model_tokens else None
            return brand_title, model
    return None, None

 def parse_year(txt):             m = year_rx.search(txt) ; return int(m.group()) if m else None
 def parse_frame_material(txt):   m = frame_rx.search(txt) ; return m.group(1).lower() if m else None
 def parse_wheel_size(txt):       m = wheel_rx.search(txt) ; return m.group(1) if m else None
 def parse_drivetrain(txt):       m = drivetrain_rx.search(txt) ; return m.group(1) if m else None
 def parse_brake_type(txt):
    tl = txt.lower()
    if "hydraulic" in tl: return "hydraulic disc"
    if "disc"      in tl: return "mechanical disc"
    return None
 def parse_suspension(txt):
    for f in ("Fox","RockShox","Marzocchi"):
        if f.lower() in txt.lower(): return f
    return None
 def parse_tires(txt):
    for t in ("minion","maxxis","schwalbe","continental","michelin"):
        if t in txt.lower(): return t.title()
    return None
 def parse_size(txt):             m = size_rx.search(txt) ; return m.group(1).upper() if m else None
 def parse_travel(txt):           m = travel_rx.search(txt) ; return f"{m.group(1)}mm" if m else None

 # ——— DB ———
 conn = psycopg2.connect(DB_DSN)
 cur  = conn.cursor()

 with open(INPUT_FILE, encoding="utf-8") as fh:
    for line in fh:
        rec = json.loads(line)
        city        = rec["city"]
        scraped_at  = datetime.fromisoformat(rec["scraped_at"])
        prod        = rec["item"]
        offer       = prod.get("offers", {})

        title = prod.get("name", "").strip()
        if not title:
            continue

        brand, model = parse_brand_model(title)
        if brand is None:
            # skip obscure / unrecognised brands
            continue

        try:
            price = float(offer.get("price", "0"))
        except ValueError:
            price = None

        row = dict(
            lid   = prod.get("@id"),
            city  = city,
            scraped_at = scraped_at,
            price = price,
            curr  = offer.get("priceCurrency", "USD"),
            title = title,
            url   = prod.get("url", ""),
            loc   = offer.get("availableAtOrFrom", {})
                     .get("address", {})
                     .get("addressLocality"),
            year           = parse_year(title),
            brand          = brand,
            model          = model,
            frame_material = parse_frame_material(title),
            wheel_size     = parse_wheel_size(title),
            drivetrain     = parse_drivetrain(title),
            brake_type     = parse_brake_type(title),
            suspension     = parse_suspension(title),
            tire_brand     = parse_tires(title),
            frame_size     = parse_size(title),
            travel         = parse_travel(title),
            raw_json       = json.dumps(prod),
            source         = SOURCE
        )

        cur.execute("""
        INSERT INTO bike_listings (
          listing_id, city, post_date, price, currency,
          title, url, location, year,
          brand, model, frame_material, wheel_size,
          drivetrain, brake_type, suspension,
          tire_brand, frame_size, travel, raw_data, source
        ) VALUES (
          %(lid)s, %(city)s, %(scraped_at)s, %(price)s, %(curr)s,
          %(title)s, %(url)s, %(loc)s, %(year)s,
          %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s,
          %(drivetrain)s, %(brake_type)s, %(suspension)s,
          %(tire_brand)s, %(frame_size)s, %(travel)s, %(raw_json)s, %(source)s
        )
        ON CONFLICT (listing_id) DO NOTHING;
        """, row)

 conn.commit()
 cur.close()
 conn.close()
 print("Analysis & import complete (only recognised brands kept).")
	#!/usr/bin/env python3
	import json, re, unicodedata
	import psycopg2
	from datetime import datetime

	# ——— CONFIG ———
	DB_DSN = "host=localhost port=5432 user=tim dbname=bike_prices"
	INPUT_FILE = "raw_data.jsonl"
	SOURCE = "craigslist" # satisfies CHECK constraint

	# ——— Regex helpers ———
	year_rx = re.compile(r"^(19\|20)\d{2}\s+")
	frame_rx = re.compile(r"\b(carbon\|aluminum\|steel\|titanium\|scandium)\b", re.I)
	wheel_rx = re.compile(r"\b(29\|27\.5\|27\|26)\b")
	drivetrain_rx = re.compile(r"(SLX\|XT\|XTR\|Deore\|GX\|XX1)", re.I)
	size_rx = re.compile(r"\b(XS\|S\|M\|L\|XL\|XXL)\b", re.I)
	travel_rx = re.compile(r"(\d{2,3})\s?mm", re.I)

	# ——— Known “real” MTB brands ———
	KNOWN_BRANDS = [
	"santa cruz","yt industries","specialized","trek","yeti","cannondale",
	"norco","pivot","canyon","giant","scott","salsa","commencal","kona",
	"juliana","polygon","marin","bmc","revel","privateer","devinci","allied",
	"alchemy","orbea","transition","fuji","felt","niner","ibis"
	]
	# pre-sort once (longest first) so “santa cruz” matches before “santa”
	KNOWN_BRANDS = sorted(KNOWN_BRANDS, key=len, reverse=True)

	# ——— Field parsers ———
	def _ascii(txt: str) -> str:
	"""strip accents & weird quotes so matching is easier"""
	return unicodedata.normalize("NFKD", txt).encode("ascii", "ignore").decode()

	def parse_brand_model(title: str):
	"""return (brand, model) or (None, None) if brand not recognised"""
	t = _ascii(title).lower()
	t = year_rx.sub("", t).lstrip("-–— ").strip() # drop leading year & dashes

	for brand in KNOWN_BRANDS:
	if t.startswith(brand):
	brand_title = brand.title()
	model_part = t[len(brand):].lstrip(" -_,")
	# take up to next 4 tokens as “model” (purely heuristic)
	model_tokens = model_part.split()[:4]
	model = " ".join(model_tokens) if model_tokens else None
	return brand_title, model
	return None, None

	def parse_year(txt): m = year_rx.search(txt) ; return int(m.group()) if m else None
	def parse_frame_material(txt): m = frame_rx.search(txt) ; return m.group(1).lower() if m else None
	def parse_wheel_size(txt): m = wheel_rx.search(txt) ; return m.group(1) if m else None
	def parse_drivetrain(txt): m = drivetrain_rx.search(txt) ; return m.group(1) if m else None
	def parse_brake_type(txt):
	tl = txt.lower()
	if "hydraulic" in tl: return "hydraulic disc"
	if "disc" in tl: return "mechanical disc"
	return None
	def parse_suspension(txt):
	for f in ("Fox","RockShox","Marzocchi"):
	if f.lower() in txt.lower(): return f
	return None
	def parse_tires(txt):
	for t in ("minion","maxxis","schwalbe","continental","michelin"):
	if t in txt.lower(): return t.title()
	return None
	def parse_size(txt): m = size_rx.search(txt) ; return m.group(1).upper() if m else None
	def parse_travel(txt): m = travel_rx.search(txt) ; return f"{m.group(1)}mm" if m else None

	# ——— DB ———
	conn = psycopg2.connect(DB_DSN)
	cur = conn.cursor()

	with open(INPUT_FILE, encoding="utf-8") as fh:
	for line in fh:
	rec = json.loads(line)
	city = rec["city"]
	scraped_at = datetime.fromisoformat(rec["scraped_at"])
	prod = rec["item"]
	offer = prod.get("offers", {})

	title = prod.get("name", "").strip()
	if not title:
	continue

	brand, model = parse_brand_model(title)
	if brand is None:
	# skip obscure / unrecognised brands
	continue

	try:
	price = float(offer.get("price", "0"))
	except ValueError:
	price = None

	row = dict(
	lid = prod.get("@id"),
	city = city,
	scraped_at = scraped_at,
	price = price,
	curr = offer.get("priceCurrency", "USD"),
	title = title,
	url = prod.get("url", ""),
	loc = offer.get("availableAtOrFrom", {})
	.get("address", {})
	.get("addressLocality"),
	year = parse_year(title),
	brand = brand,
	model = model,
	frame_material = parse_frame_material(title),
	wheel_size = parse_wheel_size(title),
	drivetrain = parse_drivetrain(title),
	brake_type = parse_brake_type(title),
	suspension = parse_suspension(title),
	tire_brand = parse_tires(title),
	frame_size = parse_size(title),
	travel = parse_travel(title),
	raw_json = json.dumps(prod),
	source = SOURCE
	)

	cur.execute("""
	INSERT INTO bike_listings (
	listing_id, city, post_date, price, currency,
	title, url, location, year,
	brand, model, frame_material, wheel_size,
	drivetrain, brake_type, suspension,
	tire_brand, frame_size, travel, raw_data, source
	) VALUES (
	%(lid)s, %(city)s, %(scraped_at)s, %(price)s, %(curr)s,
	%(title)s, %(url)s, %(loc)s, %(year)s,
	%(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s,
	%(drivetrain)s, %(brake_type)s, %(suspension)s,
	%(tire_brand)s, %(frame_size)s, %(travel)s, %(raw_json)s, %(source)s
	)
	ON CONFLICT (listing_id) DO NOTHING;
	""", row)

	conn.commit()
	cur.close()
	conn.close()
	print("Analysis & import complete (only recognised brands kept).")