Skip to content

Instantly share code, notes, and snippets.

@NaPs
Created December 29, 2008 11:34
Show Gist options
  • Select an option

  • Save NaPs/41243 to your computer and use it in GitHub Desktop.

Select an option

Save NaPs/41243 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#coding=utf8
import urllib2
import re
# URL templates for allocine.fr. ``__SEARCH__`` and ``__ID__`` are plain
# placeholders that the code in this module fills in with str.replace().
# ``rub=1`` is used for the movie search and ``rub=6`` for the show search.
MOVIE_SEARCH_URL = 'http://www.allocine.fr/recherche/?motcle=__SEARCH__&rub=1'
MOVIE_DETAIL_URL = 'http://www.allocine.fr/film/fichefilm_gen_cfilm=__ID__.html'
SHOW_SEARCH_URL = 'http://www.allocine.fr/recherche/?motcle=__SEARCH__&rub=6'
SHOW_DETAIL_URL = 'http://www.allocine.fr/series/ficheserie_gen_cserie=__ID__.html'

# Patterns matching a result link on a search page (case-insensitive).
# Groups: (numeric id, digits of the "linkN" CSS class, link label - which may
# still contain HTML markup).
# The dot before "html" is escaped so that only a literal "." is accepted;
# unescaped it matched any character.
REGEX_LINKTOFILM = re.compile(r'(?i)<a href="\/film\/fichefilm_gen_cfilm=(\d+)\.html" class="link(\d+)">(.*?)<\/a>')
REGEX_LINKTOSERIE = re.compile(r'(?i)<a href="\/series\/ficheserie_gen_cserie=(\d+)\.html" class="link(\d+)">(.*?)<\/a>')
class Allocine:
    """Search helpers for the allocine.fr website.

    Each finder downloads one search-result page and scrapes the result
    links out of it with a module-level ``REGEX_LINKTO*`` pattern.
    """

    @staticmethod
    def _fetch(url):
        """Download ``url`` and return its body decoded as latin1."""
        response = urllib2.urlopen(url)
        try:
            raw = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        return raw.decode('latin1')

    @staticmethod
    def _parse_links(link_regex, page):
        """Return ``{id: label}`` for every link ``link_regex`` finds in ``page``.

        ``link_regex`` must yield (id, css class number, label) triples.
        HTML tags are removed from the label and it is stripped of
        surrounding whitespace. When an id occurs several times the last
        occurrence wins.
        """
        results = {}
        for item_id, _klass, label in link_regex.findall(page):
            results[item_id] = re.sub(r'<(.+?)>', '', label).strip()
        return results

    @staticmethod
    def find_movie(search, debug=False):
        """Search movies and return a dict mapping movie id to title.

        search -- search terms; spaces are replaced by ``+``.
        debug  -- accepted for symmetry with the detail classes; unused here.

        Ids are returned as strings of digits. Errors raised by
        ``urllib2.urlopen`` are propagated to the caller.
        """
        # NOTE(review): only spaces are escaped. Other URL-special characters
        # (``&``, ``#``, non-ASCII...) go into the query string unchanged;
        # callers may rely on passing pre-encoded terms, so this is left as is.
        query = search.replace(' ', '+')
        page = Allocine._fetch(MOVIE_SEARCH_URL.replace('__SEARCH__', query))
        return Allocine._parse_links(REGEX_LINKTOFILM, page)

    @staticmethod
    def find_show(search, debug=False):
        """Search TV shows and return a dict mapping show id to title.

        search -- search terms; spaces are replaced by ``+``.
        debug  -- accepted for symmetry with the detail classes; unused here.

        Ids are returned as strings of digits. Errors raised by
        ``urllib2.urlopen`` are propagated to the caller.
        """
        # NOTE(review): only spaces are escaped, see find_movie().
        query = search.replace(' ', '+')
        page = Allocine._fetch(SHOW_SEARCH_URL.replace('__SEARCH__', query))
        return Allocine._parse_links(REGEX_LINKTOSERIE, page)
class AllocineMovie:
    """Details of one movie, scraped from its allocine.fr detail page.

    Creating an instance downloads the page and sets one attribute per key
    of ``REGEXPS`` (``title``, ``directors``, ``nat``, ...). Each attribute
    holds the matched text with HTML tags removed, or ``None`` when the
    pattern was not found in the page.
    """

    # Attribute name -> pattern whose first group captures the raw value
    # (possibly still wrapped in tags). Accented letters in the French labels
    # are written as ``.`` wildcards (e.g. ``R.alis.``, ``Dur.e``).
    REGEXPS = {
        'title': re.compile(r'(?m)<title>(.*?)<\/title>'),
        'directors': re.compile(r'(?m)<h4>R.alis. par <a .*?>(.*?)<\/a><\/h4>'),
        'nat': re.compile(r'(?m)<h4>Film (.*?).&nbsp;</h4>'),
        'genres': re.compile(r'(?m)<h4>Genre : (.*?)</h4>'),
        'out_date': re.compile(r'(?m)<h4>Date de sortie : <b>(.*?)</b>'),
        'duree': re.compile(r'(?m)<h4>Dur.e : (.*?).&nbsp;</h4>'),
        'production_date': re.compile(r'(?m)<h4>Ann.e de production : (.*?)</h4>'),
        'original_title': re.compile(r'(?m)<h4>Titre original : <i>(.*?)</i></h4>'),
        'actors': re.compile(r'(?m)<h4>Avec (.*?) &nbsp;&nbsp;'),
        'synopsis': re.compile(r'(?m)<td valign="top" style="padding:10 0 0 0"><div align="justify"><h4>(.*?)</h4>'),
        'image': re.compile(r'(?m)<td valign="top" width="120".*?img src="(.*?)" border="0" alt="" class="affichette" />'),
        'interdit': re.compile(r'(?m)<h4 style="color: #D20000;">Interdit(.*?)</h4>'),
    }

    def __init__(self, id, debug=False):
        """Download and parse the detail page of movie ``id``.

        id    -- movie identifier as a string (it is substituted into
                 ``MOVIE_DETAIL_URL`` with str.replace()).
        debug -- when true, print the fetched URL and every extracted field.

        Errors raised by ``urllib2.urlopen`` are propagated to the caller.
        """
        url = MOVIE_DETAIL_URL.replace('__ID__', id)
        if debug:
            print('Getting %s' % url)
        response = urllib2.urlopen(url)
        try:
            raw = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        self._extract_fields(raw.decode('latin1'), debug)

    def _extract_fields(self, data, debug=False):
        """Apply every pattern of ``REGEXPS`` to ``data`` and store the results.

        Only the first match of each pattern is used. Its first group has
        HTML tags removed and is stripped; fields that do not match are set
        to ``None`` so that every attribute always exists.
        """
        for regex_name, regex in self.REGEXPS.items():
            match = regex.search(data)
            value = None
            if match:
                value = re.sub(r'<.*?>', '', match.group(1)).strip()
            setattr(self, regex_name, value)
            if debug:
                print('%s: %s' % (regex_name, value))
class AllocineShow:
    """Details of one TV show, scraped from its allocine.fr detail page.

    Creating an instance downloads the page and sets one attribute per key
    of ``REGEXPS`` (``title``, ``producters``, ``created_by``, ...). Each
    attribute holds the matched text with HTML tags removed, or ``None``
    when the pattern was not found in the page.
    """

    # Attribute name -> pattern whose first group captures the raw value
    # (possibly still wrapped in tags). Accented letters in the French labels
    # are written as ``.`` wildcards (e.g. ``Nationalit.``): the page is
    # decoded as latin1, so a UTF-8 encoded literal in a byte-string pattern
    # would not match it under Python 2. ``created_by`` follows the same
    # convention for that reason.
    REGEXPS = {
        'title': re.compile(r'(?m)<title>(.*?)<\/title>'),
        'producters': re.compile(r'(?m)<h4>Producteurs : (.*?)</h4>'),
        'created_by': re.compile(r'(?m)<h4>S.rie cr..e par <a .*?>(.*?)</a>'),
        'nat': re.compile(r'(?m)<span style=\'font-weight:bold\'>Nationalit.</span> : (.*?)</h5>'),
        'genres': re.compile(r'(?m)<span style=\'font-weight:bold\'>Genre</span> : (.*?)&nbsp;&nbsp;'),
        'duree': re.compile(r'(?m)<span style=\'font-weight:bold\'>Format</span> : (.+?).&nbsp;'),
        'original_title': re.compile(r'(?m)<h4><b>Titre original : </b></h4><h4 style="color:#D20000"><b>(.*?)</b></h4>'),
        'actors': re.compile(r'(?m)<h4>Avec : (.*?)&nbsp;&nbsp;'),
        'synopsis': re.compile(r'(?m)<h5><span style=\'font-weight:bold\'>Synopsis</span>&nbsp;&nbsp;&nbsp;.*?<br />(.*?)</h5>'),
        'image': re.compile(r'(?m)<td><div id=\'divM\' .*?><img src=\'(.*?)\' style=\'border:1px solid black;.*?>'),
    }

    def __init__(self, id, debug=False):
        """Download and parse the detail page of show ``id``.

        id    -- show identifier as a string (it is substituted into
                 ``SHOW_DETAIL_URL`` with str.replace()).
        debug -- when true, print the fetched URL and every extracted field.

        Errors raised by ``urllib2.urlopen`` are propagated to the caller.
        """
        url = SHOW_DETAIL_URL.replace('__ID__', id)
        if debug:
            print('Getting %s' % url)
        response = urllib2.urlopen(url)
        try:
            raw = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        self._extract_fields(raw.decode('latin1'), debug)

    def _extract_fields(self, data, debug=False):
        """Apply every pattern of ``REGEXPS`` to ``data`` and store the results.

        Only the first match of each pattern is used. Its first group has
        HTML tags removed and is stripped; fields that do not match are set
        to ``None`` so that every attribute always exists.
        """
        for regex_name, regex in self.REGEXPS.items():
            match = regex.search(data)
            value = None
            if match:
                value = re.sub(r'<.*?>', '', match.group(1)).strip()
            setattr(self, regex_name, value)
            if debug:
                print('%s: %s' % (regex_name, value))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment