Created
May 27, 2025 01:20
-
-
Save tbbooher/8ed24d4731464d8988c6e03320042578 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import re | |
| import json | |
| import psycopg2 | |
| from bs4 import BeautifulSoup | |
| # ——— CONFIG ——— | |
| DB_DSN = "host=localhost port=5432 user=tim dbname=bike_prices" | |
| CITY = "Fort Worth" | |
| INPUT_FILE = "fortworth.html" | |
| DRY_RUN = False | |
| SOURCE = "facebook" | |
| # ——— KNOWN BRANDS ——— | |
| # order matters: check "santa cruz" before "santa" or "cruz" | |
| KNOWN_BRANDS = [ | |
| "santa cruz", "specialized", "trek", "yt industries", "yeti", "cannondale", | |
| "norco", "pivot", "canyon", "giant", "scott", "salsa", "commencal", "kona", | |
| "juliana", "polygon", "marin", "bmc", "revel", "privateer", "devinci", | |
| "allied", "alchemy", "orbea", "transition", "fuji", "felt", "niner", "ibis" | |
| ] | |
| # ——— Helpers to extract specs ——— | |
| year_rx = re.compile(r"^(19|20)\d{2}\b") | |
| def strip_leading_year(s): | |
| return re.sub(r"^(19|20)\d{2}\s+", "", s) | |
| def parse_brand_model(title): | |
| t = title.lower().strip() | |
| # 1) drop a leading year | |
| t_no_year = strip_leading_year(t) | |
| # 2) try to match a known brand | |
| for b in KNOWN_BRANDS: | |
| if t_no_year.startswith(b): | |
| brand = b.title() | |
| model = t_no_year[len(b):].strip() or None | |
| return brand, model | |
| # 3) fallback: split words | |
| parts = t_no_year.split() | |
| brand = parts[0].title() if parts else None | |
| model = " ".join(parts[1:3]) if len(parts) >= 2 else None | |
| return brand, model | |
| def clean_text(v): | |
| if not isinstance(v, str): | |
| return v | |
| return " ".join(v.split()) | |
| def parse_frame_material(title): | |
| for mat in ("carbon","aluminum","steel","titanium","scandium"): | |
| if mat in title.lower(): return mat | |
| return None | |
| def parse_wheel_size(title): | |
| for size in ("29","27.5","27","26"): | |
| if size in title: return size | |
| return None | |
| def parse_drivetrain(title): | |
| m = re.search(r"(SLX|XT|XTR|Deore|GX|XX1)[\w\s\-]*", title, re.I) | |
| return m.group().strip() if m else None | |
| def parse_brake_type(title): | |
| if "hydraulic" in title.lower(): return "hydraulic disc" | |
| if "disc" in title.lower(): return "mechanical disc" | |
| return None | |
| def parse_suspension(title): | |
| for fork in ("Fox","RockShox","Marzocchi"): | |
| if fork.lower() in title.lower(): return fork | |
| return None | |
| def parse_tires(title): | |
| for t in ("minion","maxxis","schwalbe","continental","michelin"): | |
| if t in title.lower(): return t.title() | |
| return None | |
| def parse_size(title): | |
| m = re.search(r"\b(XS|S|M|L|XL|XXL)\b", title, re.I) | |
| return m.group().upper() if m else None | |
| def parse_travel(title): | |
| m = re.search(r"(\d{2,3})\s?mm", title.lower()) | |
| return f"{m.group(1)}mm" if m else None | |
| def parse_year(title): | |
| m = re.search(r"\b(19|20)\d{2}\b", title) | |
| return int(m.group()) if m else None | |
| # ——— Load HTML ——— | |
| with open(INPUT_FILE, encoding="utf-8") as f: | |
| soup = BeautifulSoup(f, "html.parser") | |
| # ——— Connect to DB (unless dry-run) ——— | |
| if not DRY_RUN: | |
| conn = psycopg2.connect(DB_DSN) | |
| cur = conn.cursor() | |
| # ——— Extract, clean, display or insert ——— | |
| for a in soup.find_all("a", href=re.compile(r"/marketplace/item/")): | |
| href = a["href"] | |
| match_id = re.search(r"/item/(\d+)", href) | |
| listing_id = clean_text(match_id.group(1)) if match_id else None | |
| # each listing has three info-blocks: price, title, location | |
| blocks = a.select("div.x1gslohp") | |
| price_txt = ( | |
| blocks[0].select_one("span[dir='auto']").get_text(strip=True) | |
| if len(blocks) > 0 else "" | |
| ) | |
| title = ( | |
| blocks[1].select_one("span[dir='auto']").get_text(strip=True) | |
| if len(blocks) > 1 else None | |
| ) | |
| location = ( | |
| blocks[2].select_one("span[dir='auto']").get_text(strip=True) | |
| if len(blocks) > 2 else None | |
| ) | |
| # parse numeric price | |
| try: | |
| price = float(price_txt.replace("$","").replace(",","")) | |
| except: | |
| price = None | |
| if not title: | |
| print(f"Skipping listing {listing_id} due to missing title") | |
| continue | |
| # parse brand/model | |
| brand, model = parse_brand_model(title) | |
| record = { | |
| "listing_id": listing_id, | |
| "city": CITY, | |
| "price": price, | |
| "currency": "USD", | |
| "title": clean_text(title), | |
| "url": href, | |
| "location": clean_text(location), | |
| "year": parse_year(title), | |
| "brand": brand, | |
| "model": model, | |
| "frame_material": parse_frame_material(title), | |
| "wheel_size": parse_wheel_size(title), | |
| "drivetrain": parse_drivetrain(title), | |
| "brake_type": parse_brake_type(title), | |
| "suspension": parse_suspension(title), | |
| "tire_brand": parse_tires(title), | |
| "frame_size": parse_size(title), | |
| "travel": parse_travel(title), | |
| "raw_data": json.dumps(str(a)), | |
| "source": SOURCE, | |
| } | |
| if DRY_RUN: | |
| # print(json.dumps(record, indent=2)) | |
| print("Title:", record["title"]) | |
| print("Price:", record["price"]) | |
| print("Location:", record["location"]) | |
| print("Brand:", record["brand"]) | |
| print("Model:", record["model"]) | |
| else: | |
| cur.execute(""" | |
| INSERT INTO bike_listings ( | |
| listing_id, city, price, currency, | |
| title, url, location, | |
| year, brand, model, frame_material, wheel_size, | |
| drivetrain, brake_type, suspension, | |
| tire_brand, frame_size, travel, | |
| raw_data, source | |
| ) VALUES ( | |
| %(listing_id)s, %(city)s, %(price)s, %(currency)s, | |
| %(title)s, %(url)s, %(location)s, | |
| %(year)s, %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s, | |
| %(drivetrain)s, %(brake_type)s, %(suspension)s, | |
| %(tire_brand)s, %(frame_size)s, %(travel)s, | |
| %(raw_data)s, %(source)s | |
| ) ON CONFLICT (listing_id) DO NOTHING; | |
| """, record) | |
| # ——— Commit & close if not dry-run ——— | |
| if not DRY_RUN: | |
| conn.commit() | |
| cur.close() | |
| conn.close() | |
| print("Dry run complete" if DRY_RUN else f"Imported to DB from {INPUT_FILE}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment