Created
March 27, 2025 17:46
-
-
Save misopog/d71309c8c03c4ac3fbafbc3cd61b4d29 to your computer and use it in GitHub Desktop.
csfd.cz review scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import csv
import sys

import requests
from lxml import html
# Ratings listing URL for one user; later pages are requested with ?page=N.
BASE_URL = "https://www.csfd.cz/uzivatel/{user_id}/hodnoceni/"
REQUEST_HEADERS = {"User-Agent": "mozilla/5.0"}
# Seconds. Without an explicit timeout, requests waits indefinitely on a
# stalled connection.
REQUEST_TIMEOUT = 30


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``user_id`` (positional) and ``output`` (``-o`` /
        ``--output``, default ``"ratings.csv"``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("user_id")
    parser.add_argument("-o", "--output", default="ratings.csv")
    return parser.parse_args(argv)


def parse_rating(class_attr):
    """Return the text following ``stars-`` in a class attribute.

    Args:
        class_attr: Value of an element's ``class`` attribute, e.g.
            ``"stars stars-4"``.

    Returns:
        The suffix of the first whitespace-separated token that starts with
        ``stars-`` (``"4"`` in the example), or ``"0"`` when there is no such
        token or its suffix is empty.
    """
    prefix = "stars-"
    for token in class_attr.split():
        if token.startswith(prefix):
            # A bare "stars-" token carries no value; treat it like a
            # missing rating instead of failing.
            return token[len(prefix):] or "0"
    return "0"


def parse_row(row):
    """Extract ``(name, rating, date)`` from one table row element.

    Missing title or date cells yield ``"unknown"``; a missing stars span
    yields a rating of ``"0"``.
    """
    titles = row.xpath("./td[1]//a[@class='film-title-name']/text()")
    name = (titles or ["unknown"])[0].strip()

    stars = row.xpath(".//span[contains(@class, 'stars-')]")
    rating = parse_rating(stars[0].get("class", "")) if stars else "0"

    dates = row.xpath("./td[contains(@class, 'date-only')]/text()")
    date = (dates or ["unknown"])[0].strip()
    return name, rating, date


def iter_ratings(user_id):
    """Yield ``(name, rating, date)`` tuples for every rating of *user_id*.

    Pages are fetched one at a time, starting at page 1, until a page has no
    ``striped`` table or no ``page-next`` link. A progress line is printed
    after each page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    url = BASE_URL.format(user_id=user_id)
    page = 1
    with requests.Session() as session:
        session.headers.update(REQUEST_HEADERS)
        while True:
            # The first page is requested without a query string, as before.
            params = {"page": page} if page > 1 else None
            response = session.get(url, params=params, timeout=REQUEST_TIMEOUT)
            # An error page has no ratings table and would otherwise look
            # exactly like "no more results".
            response.raise_for_status()

            tree = html.fromstring(response.content)
            table = tree.xpath("//table[contains(@class, 'striped')]")
            if not table:
                return

            rows = table[0].xpath(".//tbody/tr")
            for row in rows:
                yield parse_row(row)
            print(f"found {len(rows)} reviews on page {page}")

            if not tree.xpath("//a[@class='page-next']"):
                return
            page += 1


def write_csv(path, results):
    """Write *results* to *path* as CSV with a ``movie,rating,date`` header."""
    # utf-8-sig prepends a BOM; presumably so spreadsheet apps detect UTF-8.
    with open(path, "w", newline="", encoding="utf-8-sig") as file:
        writer = csv.writer(file)
        writer.writerow(["movie", "rating", "date"])
        writer.writerows(results)


def main(argv=None):
    """Scrape a user's ratings and save them to a CSV file.

    Returns:
        ``0`` on success, ``1`` if a request failed part-way. In the failure
        case the rows collected so far are still written, and a warning is
        printed to stderr so the truncation is not silent.
    """
    args = parse_args(argv)
    print(f"scraping ratings for user: {args.user_id}")

    results = []
    failed = False
    try:
        for record in iter_ratings(args.user_id):
            results.append(record)
    except requests.RequestException as exc:
        failed = True
        print(
            f"error: request failed after {len(results)} ratings, "
            f"output will be incomplete: {exc}",
            file=sys.stderr,
        )

    print(f"total reviews found: {len(results)}")
    write_csv(args.output, results)
    print(f"output written to {args.output}")
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment