Skip to content

Instantly share code, notes, and snippets.

@tbbooher
Created May 27, 2025 01:20
Show Gist options
  • Save tbbooher/8ed24d4731464d8988c6e03320042578 to your computer and use it in GitHub Desktop.
Save tbbooher/8ed24d4731464d8988c6e03320042578 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
import json
import psycopg2
from bs4 import BeautifulSoup
# ——— CONFIG ———
DB_DSN = "host=localhost port=5432 user=tim dbname=bike_prices"
CITY = "Fort Worth"
INPUT_FILE = "fortworth.html"
DRY_RUN = False
SOURCE = "facebook"
# ——— KNOWN BRANDS ———
# order matters: check "santa cruz" before "santa" or "cruz"
KNOWN_BRANDS = [
"santa cruz", "specialized", "trek", "yt industries", "yeti", "cannondale",
"norco", "pivot", "canyon", "giant", "scott", "salsa", "commencal", "kona",
"juliana", "polygon", "marin", "bmc", "revel", "privateer", "devinci",
"allied", "alchemy", "orbea", "transition", "fuji", "felt", "niner", "ibis"
]
# ——— Helpers to extract specs ———
year_rx = re.compile(r"^(19|20)\d{2}\b")
def strip_leading_year(s):
return re.sub(r"^(19|20)\d{2}\s+", "", s)
def parse_brand_model(title):
t = title.lower().strip()
# 1) drop a leading year
t_no_year = strip_leading_year(t)
# 2) try to match a known brand
for b in KNOWN_BRANDS:
if t_no_year.startswith(b):
brand = b.title()
model = t_no_year[len(b):].strip() or None
return brand, model
# 3) fallback: split words
parts = t_no_year.split()
brand = parts[0].title() if parts else None
model = " ".join(parts[1:3]) if len(parts) >= 2 else None
return brand, model
def clean_text(v):
if not isinstance(v, str):
return v
return " ".join(v.split())
def parse_frame_material(title):
for mat in ("carbon","aluminum","steel","titanium","scandium"):
if mat in title.lower(): return mat
return None
def parse_wheel_size(title):
for size in ("29","27.5","27","26"):
if size in title: return size
return None
def parse_drivetrain(title):
m = re.search(r"(SLX|XT|XTR|Deore|GX|XX1)[\w\s\-]*", title, re.I)
return m.group().strip() if m else None
def parse_brake_type(title):
if "hydraulic" in title.lower(): return "hydraulic disc"
if "disc" in title.lower(): return "mechanical disc"
return None
def parse_suspension(title):
for fork in ("Fox","RockShox","Marzocchi"):
if fork.lower() in title.lower(): return fork
return None
def parse_tires(title):
for t in ("minion","maxxis","schwalbe","continental","michelin"):
if t in title.lower(): return t.title()
return None
def parse_size(title):
m = re.search(r"\b(XS|S|M|L|XL|XXL)\b", title, re.I)
return m.group().upper() if m else None
def parse_travel(title):
m = re.search(r"(\d{2,3})\s?mm", title.lower())
return f"{m.group(1)}mm" if m else None
def parse_year(title):
m = re.search(r"\b(19|20)\d{2}\b", title)
return int(m.group()) if m else None
# ——— Load HTML ———
with open(INPUT_FILE, encoding="utf-8") as f:
soup = BeautifulSoup(f, "html.parser")
# ——— Connect to DB (unless dry-run) ———
if not DRY_RUN:
conn = psycopg2.connect(DB_DSN)
cur = conn.cursor()
# ——— Extract, clean, display or insert ———
for a in soup.find_all("a", href=re.compile(r"/marketplace/item/")):
href = a["href"]
match_id = re.search(r"/item/(\d+)", href)
listing_id = clean_text(match_id.group(1)) if match_id else None
# each listing has three info-blocks: price, title, location
blocks = a.select("div.x1gslohp")
price_txt = (
blocks[0].select_one("span[dir='auto']").get_text(strip=True)
if len(blocks) > 0 else ""
)
title = (
blocks[1].select_one("span[dir='auto']").get_text(strip=True)
if len(blocks) > 1 else None
)
location = (
blocks[2].select_one("span[dir='auto']").get_text(strip=True)
if len(blocks) > 2 else None
)
# parse numeric price
try:
price = float(price_txt.replace("$","").replace(",",""))
except:
price = None
if not title:
print(f"Skipping listing {listing_id} due to missing title")
continue
# parse brand/model
brand, model = parse_brand_model(title)
record = {
"listing_id": listing_id,
"city": CITY,
"price": price,
"currency": "USD",
"title": clean_text(title),
"url": href,
"location": clean_text(location),
"year": parse_year(title),
"brand": brand,
"model": model,
"frame_material": parse_frame_material(title),
"wheel_size": parse_wheel_size(title),
"drivetrain": parse_drivetrain(title),
"brake_type": parse_brake_type(title),
"suspension": parse_suspension(title),
"tire_brand": parse_tires(title),
"frame_size": parse_size(title),
"travel": parse_travel(title),
"raw_data": json.dumps(str(a)),
"source": SOURCE,
}
if DRY_RUN:
# print(json.dumps(record, indent=2))
print("Title:", record["title"])
print("Price:", record["price"])
print("Location:", record["location"])
print("Brand:", record["brand"])
print("Model:", record["model"])
else:
cur.execute("""
INSERT INTO bike_listings (
listing_id, city, price, currency,
title, url, location,
year, brand, model, frame_material, wheel_size,
drivetrain, brake_type, suspension,
tire_brand, frame_size, travel,
raw_data, source
) VALUES (
%(listing_id)s, %(city)s, %(price)s, %(currency)s,
%(title)s, %(url)s, %(location)s,
%(year)s, %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s,
%(drivetrain)s, %(brake_type)s, %(suspension)s,
%(tire_brand)s, %(frame_size)s, %(travel)s,
%(raw_data)s, %(source)s
) ON CONFLICT (listing_id) DO NOTHING;
""", record)
# ——— Commit & close if not dry-run ———
if not DRY_RUN:
conn.commit()
cur.close()
conn.close()
print("Dry run complete" if DRY_RUN else f"Imported to DB from {INPUT_FILE}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment