Skip to content

Instantly share code, notes, and snippets.

@chesster
Created January 3, 2014 22:27
Show Gist options
  • Save chesster/8247882 to your computer and use it in GitHub Desktop.
Save chesster/8247882 to your computer and use it in GitHub Desktop.
[PL] Pobieranie tytułów filmów z Filmweb.pl
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import sys
import os
import re
import unicodecsv as csv
from BeautifulSoup import BeautifulSoup
try:
from cStringIO import StringIO
except:
from StringIO import StringIO
class FilmwebTitile(object):
    """Scrape film titles from Filmweb.pl by walking numeric film ids.

    For every id in ``[start, stop]`` the film page is downloaded and, when
    all four fields are found, one CSV row
    ``(title, original_title, year, countries)`` is appended to the output
    file.
    """

    # Passed as the ``href`` filter when collecting country links. The
    # pattern is unchanged from the original code; it is only compiled once
    # here instead of on every get_info() call.
    _COUNTRY_HREF = re.compile(r'.*countryIds*')

    def __init__(self, start=1, stop=700000):
        """Configure the id range to scan.

        Args:
            start: First film id to fetch (inclusive).
            stop: Last film id to fetch (inclusive). Defaults to 700000,
                the value that used to be hard-coded.
        """
        self.url = "http://www.filmweb.pl/Film?id=%s"
        self.start = start
        self.stop = stop

    @staticmethod
    def _child_text(parent, tag_name):
        """Return ``parent.<tag_name>.text`` or None when either is missing.

        A missing parent or child shows up as ``None``, so the attribute
        access raises AttributeError; that is the only error treated as
        "field not present".
        """
        try:
            return getattr(parent, tag_name).text
        except AttributeError:
            return None

    def get_info(self, id):
        """Fetch one film page and extract its title data.

        Args:
            id: Numeric film id substituted into ``self.url``.

        Returns:
            A tuple ``(title, original_title, year, countries)`` where
            ``countries`` is a ``|``-joined string, or None when any field
            is missing/empty or when fetching/parsing fails.
        """
        try:
            response = urllib.urlopen(self.url % id)
            try:
                html = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()

            soup = BeautifulSoup(html)
            titles = soup.find('div', attrs={'class': 'filmTitle'})

            title = self._child_text(titles, 'h1')
            original_title = self._child_text(titles, 'h2')

            # The span text is stripped of whitespace and of its first and
            # last character (presumably surrounding parentheses -- verify
            # against the live page markup).
            year_text = self._child_text(titles, 'span')
            year = year_text.strip()[1:-1] if year_text else ''

            # Any failure here is reported by the outer handler; the
            # original fallback assigned the wrong name ("country") and
            # turned it into a misleading NameError.
            links = soup.findAll('a', href=self._COUNTRY_HREF)
            countries = '|'.join([link.text for link in links])

            if title and original_title and year and countries:
                return (title, original_title, year, countries,)
            return None
        except Exception as e:
            # Best-effort scraper boundary: log and skip this id so one bad
            # page or network hiccup does not abort the whole run.
            print("[ERROR]: %s" % e)
            return None

    def go(self, db_file):
        """Scan the configured id range and append found films to *db_file*.

        Args:
            db_file: Path of the CSV file to append rows to.
        """
        with open(db_file, "a") as out:
            for film_id in range(self.start, self.stop + 1):
                row = self.get_info(film_id)
                if row:
                    print("[%s] %s" % (film_id, row[0]))
                    out.write(self.csv_row(row))

    def csv_row(self, row):
        """Serialize *row* as a single CSV line and return it as a string.

        Args:
            row: Sequence of field values for one record.

        Returns:
            The text produced by the file's ``csv`` writer for this row,
            including the writer's line terminator.
        """
        buf = StringIO()
        csv.writer(buf).writerow(row)
        return buf.getvalue()
if __name__ == '__main__':
    # Script entry point: scan the default id range and append every film
    # found to filmweb.txt in the current working directory.
    FilmwebTitile().go('filmweb.txt')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment