tbbooher · May 27, 2025 01:20
diff --git a/facebook_parser.py b/facebook_parser.py
 #!/usr/bin/env python3
 import re
 import json
 import psycopg2
 from bs4 import BeautifulSoup

 # ——— CONFIG ———
 DB_DSN     = "host=localhost port=5432 user=tim dbname=bike_prices"
 CITY       = "Fort Worth"
 INPUT_FILE = "fortworth.html"
 DRY_RUN    = False
 SOURCE     = "facebook"

 # ——— KNOWN BRANDS ———
 # order matters: check "santa cruz" before "santa" or "cruz"
 KNOWN_BRANDS = [
    "santa cruz", "specialized", "trek", "yt industries", "yeti", "cannondale",
    "norco", "pivot", "canyon", "giant", "scott", "salsa", "commencal", "kona",
    "juliana", "polygon", "marin", "bmc", "revel", "privateer", "devinci",
    "allied", "alchemy", "orbea", "transition", "fuji", "felt", "niner", "ibis"
 ]

 # ——— Helpers to extract specs ———
 year_rx = re.compile(r"^(19|20)\d{2}\b")
 def strip_leading_year(s):
    return re.sub(r"^(19|20)\d{2}\s+", "", s)

 def parse_brand_model(title):
    t = title.lower().strip()
    # 1) drop a leading year
    t_no_year = strip_leading_year(t)
    # 2) try to match a known brand
    for b in KNOWN_BRANDS:
        if t_no_year.startswith(b):
            brand = b.title()
            model = t_no_year[len(b):].strip() or None
            return brand, model
    # 3) fallback: split words
    parts = t_no_year.split()
    brand = parts[0].title() if parts else None
    model = " ".join(parts[1:3]) if len(parts) >= 2 else None
    return brand, model

 def clean_text(v):
    if not isinstance(v, str):
        return v
    return " ".join(v.split())

 def parse_frame_material(title):
    for mat in ("carbon","aluminum","steel","titanium","scandium"):
        if mat in title.lower(): return mat
    return None

 def parse_wheel_size(title):
    for size in ("29","27.5","27","26"):
        if size in title: return size
    return None

 def parse_drivetrain(title):
    m = re.search(r"(SLX|XT|XTR|Deore|GX|XX1)[\w\s\-]*", title, re.I)
    return m.group().strip() if m else None

 def parse_brake_type(title):
    if "hydraulic" in title.lower(): return "hydraulic disc"
    if "disc"     in title.lower(): return "mechanical disc"
    return None

 def parse_suspension(title):
    for fork in ("Fox","RockShox","Marzocchi"):
        if fork.lower() in title.lower(): return fork
    return None

 def parse_tires(title):
    for t in ("minion","maxxis","schwalbe","continental","michelin"):
        if t in title.lower(): return t.title()
    return None

 def parse_size(title):
    m = re.search(r"\b(XS|S|M|L|XL|XXL)\b", title, re.I)
    return m.group().upper() if m else None

 def parse_travel(title):
    m = re.search(r"(\d{2,3})\s?mm", title.lower())
    return f"{m.group(1)}mm" if m else None

 def parse_year(title):
    m = re.search(r"\b(19|20)\d{2}\b", title)
    return int(m.group()) if m else None

 # ——— Load HTML ———
 with open(INPUT_FILE, encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

 # ——— Connect to DB (unless dry-run) ———
 if not DRY_RUN:
    conn = psycopg2.connect(DB_DSN)
    cur  = conn.cursor()

 # ——— Extract, clean, display or insert ———
 for a in soup.find_all("a", href=re.compile(r"/marketplace/item/")):
    href       = a["href"]
    match_id   = re.search(r"/item/(\d+)", href)
    listing_id = clean_text(match_id.group(1)) if match_id else None

    # each listing has three info-blocks: price, title, location
    blocks     = a.select("div.x1gslohp")
    price_txt  = (
        blocks[0].select_one("span[dir='auto']").get_text(strip=True)
        if len(blocks) > 0 else ""
    )
    title      = (
        blocks[1].select_one("span[dir='auto']").get_text(strip=True)
        if len(blocks) > 1 else None
    )
    location   = (
        blocks[2].select_one("span[dir='auto']").get_text(strip=True)
        if len(blocks) > 2 else None
    )

    # parse numeric price
    try:
        price = float(price_txt.replace("$","").replace(",",""))
    except:
        price = None

    if not title:
        print(f"Skipping listing {listing_id} due to missing title")
        continue

    # parse brand/model
    brand, model = parse_brand_model(title)

    record = {
        "listing_id":     listing_id,
        "city":           CITY,
        "price":          price,
        "currency":       "USD",
        "title":          clean_text(title),
        "url":            href,
        "location":       clean_text(location),
        "year":           parse_year(title),
        "brand":          brand,
        "model":          model,
        "frame_material": parse_frame_material(title),
        "wheel_size":     parse_wheel_size(title),
        "drivetrain":     parse_drivetrain(title),
        "brake_type":     parse_brake_type(title),
        "suspension":     parse_suspension(title),
        "tire_brand":     parse_tires(title),
        "frame_size":     parse_size(title),
        "travel":         parse_travel(title),
        "raw_data":       json.dumps(str(a)),
        "source":         SOURCE,
    }

    if DRY_RUN:
       # print(json.dumps(record, indent=2))
        print("Title:", record["title"])
        print("Price:", record["price"])
        print("Location:", record["location"])
        print("Brand:", record["brand"])
        print("Model:", record["model"])
    else:
        cur.execute("""
            INSERT INTO bike_listings (
              listing_id, city, price, currency,
              title, url, location,
              year, brand, model, frame_material, wheel_size,
              drivetrain, brake_type, suspension,
              tire_brand, frame_size, travel,
              raw_data, source
            ) VALUES (
              %(listing_id)s, %(city)s, %(price)s, %(currency)s,
              %(title)s, %(url)s, %(location)s,
              %(year)s, %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s,
              %(drivetrain)s, %(brake_type)s, %(suspension)s,
              %(tire_brand)s, %(frame_size)s, %(travel)s,
              %(raw_data)s, %(source)s
            ) ON CONFLICT (listing_id) DO NOTHING;
        """, record)

 # ——— Commit & close if not dry-run ———
 if not DRY_RUN:
    conn.commit()
    cur.close()
    conn.close()

 print("Dry run complete" if DRY_RUN else f"Imported to DB from {INPUT_FILE}")
	#!/usr/bin/env python3
	import re
	import json
	import psycopg2
	from bs4 import BeautifulSoup

	# ——— CONFIG ———
	DB_DSN = "host=localhost port=5432 user=tim dbname=bike_prices"
	CITY = "Fort Worth"
	INPUT_FILE = "fortworth.html"
	DRY_RUN = False
	SOURCE = "facebook"

	# ——— KNOWN BRANDS ———
	# order matters: check "santa cruz" before "santa" or "cruz"
	KNOWN_BRANDS = [
	"santa cruz", "specialized", "trek", "yt industries", "yeti", "cannondale",
	"norco", "pivot", "canyon", "giant", "scott", "salsa", "commencal", "kona",
	"juliana", "polygon", "marin", "bmc", "revel", "privateer", "devinci",
	"allied", "alchemy", "orbea", "transition", "fuji", "felt", "niner", "ibis"
	]

	# ——— Helpers to extract specs ———
	year_rx = re.compile(r"^(19\|20)\d{2}\b")
	def strip_leading_year(s):
	return re.sub(r"^(19\|20)\d{2}\s+", "", s)

	def parse_brand_model(title):
	t = title.lower().strip()
	# 1) drop a leading year
	t_no_year = strip_leading_year(t)
	# 2) try to match a known brand
	for b in KNOWN_BRANDS:
	if t_no_year.startswith(b):
	brand = b.title()
	model = t_no_year[len(b):].strip() or None
	return brand, model
	# 3) fallback: split words
	parts = t_no_year.split()
	brand = parts[0].title() if parts else None
	model = " ".join(parts[1:3]) if len(parts) >= 2 else None
	return brand, model

	def clean_text(v):
	if not isinstance(v, str):
	return v
	return " ".join(v.split())

	def parse_frame_material(title):
	for mat in ("carbon","aluminum","steel","titanium","scandium"):
	if mat in title.lower(): return mat
	return None

	def parse_wheel_size(title):
	for size in ("29","27.5","27","26"):
	if size in title: return size
	return None

	def parse_drivetrain(title):
	m = re.search(r"(SLX\|XT\|XTR\|Deore\|GX\|XX1)[\w\s\-]*", title, re.I)
	return m.group().strip() if m else None

	def parse_brake_type(title):
	if "hydraulic" in title.lower(): return "hydraulic disc"
	if "disc" in title.lower(): return "mechanical disc"
	return None

	def parse_suspension(title):
	for fork in ("Fox","RockShox","Marzocchi"):
	if fork.lower() in title.lower(): return fork
	return None

	def parse_tires(title):
	for t in ("minion","maxxis","schwalbe","continental","michelin"):
	if t in title.lower(): return t.title()
	return None

	def parse_size(title):
	m = re.search(r"\b(XS\|S\|M\|L\|XL\|XXL)\b", title, re.I)
	return m.group().upper() if m else None

	def parse_travel(title):
	m = re.search(r"(\d{2,3})\s?mm", title.lower())
	return f"{m.group(1)}mm" if m else None

	def parse_year(title):
	m = re.search(r"\b(19\|20)\d{2}\b", title)
	return int(m.group()) if m else None

	# ——— Load HTML ———
	with open(INPUT_FILE, encoding="utf-8") as f:
	soup = BeautifulSoup(f, "html.parser")

	# ——— Connect to DB (unless dry-run) ———
	if not DRY_RUN:
	conn = psycopg2.connect(DB_DSN)
	cur = conn.cursor()

	# ——— Extract, clean, display or insert ———
	for a in soup.find_all("a", href=re.compile(r"/marketplace/item/")):
	href = a["href"]
	match_id = re.search(r"/item/(\d+)", href)
	listing_id = clean_text(match_id.group(1)) if match_id else None

	# each listing has three info-blocks: price, title, location
	blocks = a.select("div.x1gslohp")
	price_txt = (
	blocks[0].select_one("span[dir='auto']").get_text(strip=True)
	if len(blocks) > 0 else ""
	)
	title = (
	blocks[1].select_one("span[dir='auto']").get_text(strip=True)
	if len(blocks) > 1 else None
	)
	location = (
	blocks[2].select_one("span[dir='auto']").get_text(strip=True)
	if len(blocks) > 2 else None
	)

	# parse numeric price
	try:
	price = float(price_txt.replace("$","").replace(",",""))
	except:
	price = None

	if not title:
	print(f"Skipping listing {listing_id} due to missing title")
	continue

	# parse brand/model
	brand, model = parse_brand_model(title)

	record = {
	"listing_id": listing_id,
	"city": CITY,
	"price": price,
	"currency": "USD",
	"title": clean_text(title),
	"url": href,
	"location": clean_text(location),
	"year": parse_year(title),
	"brand": brand,
	"model": model,
	"frame_material": parse_frame_material(title),
	"wheel_size": parse_wheel_size(title),
	"drivetrain": parse_drivetrain(title),
	"brake_type": parse_brake_type(title),
	"suspension": parse_suspension(title),
	"tire_brand": parse_tires(title),
	"frame_size": parse_size(title),
	"travel": parse_travel(title),
	"raw_data": json.dumps(str(a)),
	"source": SOURCE,
	}

	if DRY_RUN:
	# print(json.dumps(record, indent=2))
	print("Title:", record["title"])
	print("Price:", record["price"])
	print("Location:", record["location"])
	print("Brand:", record["brand"])
	print("Model:", record["model"])
	else:
	cur.execute("""
	INSERT INTO bike_listings (
	listing_id, city, price, currency,
	title, url, location,
	year, brand, model, frame_material, wheel_size,
	drivetrain, brake_type, suspension,
	tire_brand, frame_size, travel,
	raw_data, source
	) VALUES (
	%(listing_id)s, %(city)s, %(price)s, %(currency)s,
	%(title)s, %(url)s, %(location)s,
	%(year)s, %(brand)s, %(model)s, %(frame_material)s, %(wheel_size)s,
	%(drivetrain)s, %(brake_type)s, %(suspension)s,
	%(tire_brand)s, %(frame_size)s, %(travel)s,
	%(raw_data)s, %(source)s
	) ON CONFLICT (listing_id) DO NOTHING;
	""", record)

	# ——— Commit & close if not dry-run ———
	if not DRY_RUN:
	conn.commit()
	cur.close()
	conn.close()

	print("Dry run complete" if DRY_RUN else f"Imported to DB from {INPUT_FILE}")