misopog · March 27, 2025 17:46
diff --git a/csfdscraper.py b/csfdscraper.py
 import argparse, csv, requests
 from lxml import html

 # Seconds to wait for the server; without a timeout requests can block forever.
 REQUEST_TIMEOUT = 30


 def parse_rating(rating_class):
     """Return the rating token encoded in a CSS class string.

     The rating is the first whitespace-delimited token that follows the last
     ``stars-`` marker, e.g. ``"stars stars-4"`` gives ``"4"``.

     Args:
         rating_class: Value of the ``class`` attribute of the rating element.

     Returns:
         The rating as a string, or ``"0"`` when the marker is absent or
         nothing follows it.
     """
     _, marker, tail = rating_class.rpartition("stars-")
     if not marker:
         return "0"
     tokens = tail.split()
     # A bare "stars-" with nothing after it used to raise IndexError.
     return tokens[0] if tokens else "0"


 def parse_row(row):
     """Extract ``(name, rating, date)`` from one row of the ratings table.

     Args:
         row: An element exposing ``xpath()`` (a ``<tr>`` of the ratings table).

     Returns:
         A 3-tuple of strings. Name and date fall back to ``"unknown"`` and the
         rating to ``"0"`` when the corresponding node is not found.
     """
     names = row.xpath("./td[1]//a[@class='film-title-name']/text()")
     stars = row.xpath(".//span[contains(@class, 'stars-')]")
     dates = row.xpath("./td[contains(@class, 'date-only')]/text()")

     name = names[0].strip() if names else "unknown"
     rating = parse_rating(stars[0].get("class", "")) if stars else "0"
     date = dates[0].strip() if dates else "unknown"
     return name, rating, date


 def scrape_ratings(user_id, timeout=REQUEST_TIMEOUT):
     """Collect every rating listed on a user's paginated ratings pages.

     Pages are fetched one after another until a page has no ratings table or
     no "next page" link.

     Args:
         user_id: User identifier inserted into the ratings URL.
         timeout: Per-request timeout in seconds.

     Returns:
         A list of ``(name, rating, date)`` tuples in page order.

     Raises:
         requests.RequestException: On connection errors, timeouts, or an HTTP
             error status. Failing loudly avoids mistaking an error page for
             the end of the listing and writing a truncated file.
     """
     url = f"https://www.csfd.cz/uzivatel/{user_id}/hodnoceni/"
     page, results = 1, []
     while True:
         response = requests.get(
             f"{url}?page={page}" if page > 1 else url,
             headers={"User-Agent": "mozilla/5.0"},
             timeout=timeout,
         )
         response.raise_for_status()
         tree = html.fromstring(response.content)
         table = tree.xpath("//table[contains(@class, 'striped')]")
         if not table:
             break

         rows = table[0].xpath(".//tbody/tr")
         results.extend(parse_row(row) for row in rows)

         print(f"found {len(rows)} reviews on page {page}")
         if not tree.xpath("//a[@class='page-next']"):
             break
         page += 1
     return results


 def main():
     """Parse the command line, scrape the ratings and write them as CSV."""
     parser = argparse.ArgumentParser()
     parser.add_argument("user_id")
     parser.add_argument("-o", "--output", default="ratings.csv")
     args = parser.parse_args()

     print(f"scraping ratings for user: {args.user_id}")
     try:
         results = scrape_ratings(args.user_id)
     except requests.RequestException as err:
         # Exit with a readable message and non-zero status instead of a
         # traceback; no output file is written for an incomplete scrape.
         raise SystemExit(f"error: could not fetch ratings: {err}") from err

     print(f"total reviews found: {len(results)}")
     # utf-8-sig writes a BOM -- presumably so spreadsheet apps detect UTF-8.
     with open(args.output, "w", newline="", encoding="utf-8-sig") as file:
         writer = csv.writer(file)
         writer.writerow(["movie", "rating", "date"])
         writer.writerows(results)
     print(f"output written to {args.output}")


 if __name__ == "__main__":
     main()
	import argparse, csv, requests
	from lxml import html

	# Seconds to wait for the server; without a timeout requests can block forever.
	REQUEST_TIMEOUT = 30


	def parse_rating(rating_class):
	    """Return the rating token encoded in a CSS class string.

	    The rating is the first whitespace-delimited token that follows the last
	    ``stars-`` marker, e.g. ``"stars stars-4"`` gives ``"4"``.

	    Args:
	        rating_class: Value of the ``class`` attribute of the rating element.

	    Returns:
	        The rating as a string, or ``"0"`` when the marker is absent or
	        nothing follows it.
	    """
	    _, marker, tail = rating_class.rpartition("stars-")
	    if not marker:
	        return "0"
	    tokens = tail.split()
	    # A bare "stars-" with nothing after it used to raise IndexError.
	    return tokens[0] if tokens else "0"


	def parse_row(row):
	    """Extract ``(name, rating, date)`` from one row of the ratings table.

	    Args:
	        row: An element exposing ``xpath()`` (a ``<tr>`` of the ratings table).

	    Returns:
	        A 3-tuple of strings. Name and date fall back to ``"unknown"`` and the
	        rating to ``"0"`` when the corresponding node is not found.
	    """
	    names = row.xpath("./td[1]//a[@class='film-title-name']/text()")
	    stars = row.xpath(".//span[contains(@class, 'stars-')]")
	    dates = row.xpath("./td[contains(@class, 'date-only')]/text()")

	    name = names[0].strip() if names else "unknown"
	    rating = parse_rating(stars[0].get("class", "")) if stars else "0"
	    date = dates[0].strip() if dates else "unknown"
	    return name, rating, date


	def scrape_ratings(user_id, timeout=REQUEST_TIMEOUT):
	    """Collect every rating listed on a user's paginated ratings pages.

	    Pages are fetched one after another until a page has no ratings table or
	    no "next page" link.

	    Args:
	        user_id: User identifier inserted into the ratings URL.
	        timeout: Per-request timeout in seconds.

	    Returns:
	        A list of ``(name, rating, date)`` tuples in page order.

	    Raises:
	        requests.RequestException: On connection errors, timeouts, or an HTTP
	            error status. Failing loudly avoids mistaking an error page for
	            the end of the listing and writing a truncated file.
	    """
	    url = f"https://www.csfd.cz/uzivatel/{user_id}/hodnoceni/"
	    page, results = 1, []
	    while True:
	        response = requests.get(
	            f"{url}?page={page}" if page > 1 else url,
	            headers={"User-Agent": "mozilla/5.0"},
	            timeout=timeout,
	        )
	        response.raise_for_status()
	        tree = html.fromstring(response.content)
	        table = tree.xpath("//table[contains(@class, 'striped')]")
	        if not table:
	            break

	        rows = table[0].xpath(".//tbody/tr")
	        results.extend(parse_row(row) for row in rows)

	        print(f"found {len(rows)} reviews on page {page}")
	        if not tree.xpath("//a[@class='page-next']"):
	            break
	        page += 1
	    return results


	def main():
	    """Parse the command line, scrape the ratings and write them as CSV."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument("user_id")
	    parser.add_argument("-o", "--output", default="ratings.csv")
	    args = parser.parse_args()

	    print(f"scraping ratings for user: {args.user_id}")
	    try:
	        results = scrape_ratings(args.user_id)
	    except requests.RequestException as err:
	        # Exit with a readable message and non-zero status instead of a
	        # traceback; no output file is written for an incomplete scrape.
	        raise SystemExit(f"error: could not fetch ratings: {err}") from err

	    print(f"total reviews found: {len(results)}")
	    # utf-8-sig writes a BOM -- presumably so spreadsheet apps detect UTF-8.
	    with open(args.output, "w", newline="", encoding="utf-8-sig") as file:
	        writer = csv.writer(file)
	        writer.writerow(["movie", "rating", "date"])
	        writer.writerows(results)
	    print(f"output written to {args.output}")


	if __name__ == "__main__":
	    main()