Skip to content

Instantly share code, notes, and snippets.

@misopog
Created March 27, 2025 17:46
Show Gist options
  • Save misopog/d71309c8c03c4ac3fbafbc3cd61b4d29 to your computer and use it in GitHub Desktop.
Save misopog/d71309c8c03c4ac3fbafbc3cd61b4d29 to your computer and use it in GitHub Desktop.
csfd.cz review scraper
import argparse, csv, requests
from lxml import html
# Seconds to wait for csfd.cz before giving up; requests has no default
# timeout, so without this a stalled connection would block forever.
REQUEST_TIMEOUT = 30
REQUEST_HEADERS = {"User-Agent": "mozilla/5.0"}


def parse_rating(class_attr: str) -> str:
    """Extract the star count from a ``class`` attribute value.

    Looks for the last whitespace-separated token that contains
    ``stars-`` followed by at least one character and returns what
    follows the marker (e.g. ``"stars stars-4"`` -> ``"4"``).

    Returns ``"0"`` when no such token exists, including the degenerate
    ``"stars-"`` with nothing after the marker.
    """
    for token in reversed(class_attr.split()):
        _, marker, value = token.rpartition("stars-")
        if marker and value:
            return value
    return "0"


def first_text(nodes, default: str = "unknown") -> str:
    """Return the first item of an XPath text result, stripped.

    *nodes* is the list returned by an ``xpath(".../text()")`` call;
    *default* is used when that list is empty.
    """
    return (nodes[0] if nodes else default).strip()


def parse_row(row) -> tuple:
    """Turn one ``<tr>`` element of the ratings table into ``(name, rating, date)``."""
    name = first_text(row.xpath("./td[1]//a[@class='film-title-name']/text()"))
    star_spans = row.xpath(".//span[contains(@class, 'stars-')]")
    rating = parse_rating(star_spans[0].get("class", "") if star_spans else "")
    date = first_text(row.xpath("./td[contains(@class, 'date-only')]/text()"))
    return name, rating, date


def scrape_ratings(user_id: str) -> list:
    """Collect every rating listed on a csfd.cz user's rating pages.

    Pages are fetched one after another until a page has no striped
    table or no ``page-next`` link.

    Returns a list of ``(name, rating, date)`` tuples of strings.

    Raises ``requests.RequestException`` on network errors, timeouts and
    non-2xx responses, so a failed page is never mistaken for the end of
    the list.
    """
    url = f"https://www.csfd.cz/uzivatel/{user_id}/hodnoceni/"
    results = []
    page = 1
    # One session for all pages so the underlying connection is reused.
    with requests.Session() as session:
        session.headers.update(REQUEST_HEADERS)
        while True:
            # The first page is requested without a query string.
            params = {"page": page} if page > 1 else None
            response = session.get(url, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            tree = html.fromstring(response.content)
            table = tree.xpath("//table[contains(@class, 'striped')]")
            if not table:
                break
            rows = table[0].xpath(".//tbody/tr")
            results.extend(parse_row(row) for row in rows)
            print(f"found {len(rows)} reviews on page {page}")
            if not tree.xpath("//a[@class='page-next']"):
                break
            page += 1
    return results


def main() -> None:
    """Parse the command line, scrape the ratings and write them to CSV."""
    parser = argparse.ArgumentParser()
    parser.add_argument("user_id")
    parser.add_argument("-o", "--output", default="ratings.csv")
    args = parser.parse_args()

    print(f"scraping ratings for user: {args.user_id}")
    try:
        results = scrape_ratings(args.user_id)
    except requests.RequestException as err:
        # Exit non-zero with a short message instead of writing a
        # silently incomplete file.
        raise SystemExit(f"request failed: {err}") from err
    print(f"total reviews found: {len(results)}")

    # newline='' is what the csv module requires for files it writes to;
    # utf-8-sig prepends a BOM (presumably so spreadsheet applications
    # detect the encoding).
    with open(args.output, "w", newline="", encoding="utf-8-sig") as file:
        writer = csv.writer(file)
        writer.writerow(["movie", "rating", "date"])
        writer.writerows(results)
    print(f"output written to {args.output}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment