Skip to content

Instantly share code, notes, and snippets.

@zaghaghi
Created December 8, 2012 22:23
Show Gist options
  • Save zaghaghi/4242245 to your computer and use it in GitHub Desktop.
Save zaghaghi/4242245 to your computer and use it in GitHub Desktop.
Scrape Iranian movies from sourehcinema.com
138109110000,آبي و رابي,1309,http://www.sourehcinema.com/Title/Title.aspx?id=138109110000
138109110001,انتقام برادر,1310,http://www.sourehcinema.com/Title/Title.aspx?id=138109110001
138109110002,حاجي آقا اكتور سينما,1312,http://www.sourehcinema.com/Title/Title.aspx?id=138109110002
138109110003,دختر لر,1312,http://www.sourehcinema.com/Title/Title.aspx?id=138109110003
138109110051,بوالهوس,1313,http://www.sourehcinema.com/Title/Title.aspx?id=138109110051
138212170000,شيرين و فرهاد,1313,http://www.sourehcinema.com/Title/Title.aspx?id=138212170000
138109110006,فردوسي,1313,http://www.sourehcinema.com/Title/Title.aspx?id=138109110006
138103110007,چشم هاي سياه,1315,http://www.sourehcinema.com/Title/Title.aspx?id=138103110007
138208262000,ليلي و مجنون,1316,http://www.sourehcinema.com/Title/Title.aspx?id=138208262000
138109110009,زنداني امير,1327,http://www.sourehcinema.com/Title/Title.aspx?id=138109110009
# -*- coding: utf8 -*-
import codecs
from bs4 import BeautifulSoup
from urllib import request
import re, os
def fa2en(fanum):
    """Convert a string of Persian or Arabic-Indic digits to ASCII digits.

    Persian digits (U+06F0-U+06F9) and Arabic-Indic digits (U+0660-U+0669)
    are mapped to '0'-'9'; characters that are already ASCII digits are kept
    unchanged. An empty string yields an empty string.

    Args:
        fanum: String made up only of digit characters.

    Returns:
        The same number written with ASCII digits.

    Raises:
        KeyError: If ``fanum`` contains a character that is not a digit in
            one of the three supported forms.
    """
    digit_map = {}
    for value in range(10):
        ascii_digit = str(value)
        digit_map[chr(0x06F0 + value)] = ascii_digit  # Persian digit
        digit_map[chr(0x0660 + value)] = ascii_digit  # Arabic-Indic digit
        digit_map[ascii_digit] = ascii_digit          # already ASCII
    # Direct indexing (not .get) keeps the strict behaviour: anything that is
    # not a digit still fails loudly instead of leaking into the output.
    return ''.join(digit_map[char] for char in fanum)
def extractMoviesByYear(myear, file):
    """Fetch the title listing for one year and write its movies to *file*.

    The listing page for ``myear`` is downloaded, every anchor whose ``href``
    matches the site's title-page URL pattern is collected, and one CSV row
    ``id,name,year,href`` (terminated by CRLF) is written per anchor.

    Args:
        myear: Year to request; converted with ``str()`` and placed in the
            ``Year`` query parameter.
        file: Writable text file-like object that receives the rows.

    Raises:
        urllib.error.URLError: If the listing page cannot be fetched
            (``urlopen`` also raises its ``HTTPError`` subclass on HTTP
            error statuses).
        IndexError: If a matching link's text has fewer than two
            whitespace-separated tokens.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    url = ('http://www.sourehcinema.com/Years/TitleByYear.aspx?Year='
           + str(myear) + '&FilmName=All')
    # Raw string: '\.', '\?' and '\d' are regex escapes, not string escapes.
    title_href = re.compile(
        r'http://www\.sourehcinema\.com/Title/Title\.aspx\?id=\d+')

    # FancyURLopener is deprecated/removed; urlopen is the supported API and
    # the context manager guarantees the HTTP response is closed.
    with request.urlopen(url) as response:
        # An explicit parser avoids bs4 picking whichever one is installed.
        soup = BeautifulSoup(response, 'html.parser')

    # csv.writer ends rows with '\r\n' by default and quotes any field that
    # contains a comma, so a title with a comma no longer corrupts the row.
    writer = csv.writer(file)
    for title in soup('a', {'href': title_href}):
        href = title['href']
        movie_id = int(href.split('=')[-1])
        words = title.text.split()
        # The last two tokens are not part of the name; the year is the
        # second-to-last token minus its first character (presumably an
        # opening parenthesis - confirm against the live page markup).
        name = ' '.join(words[:-2])
        year = fa2en(words[-2][1:])
        writer.writerow([movie_id, name, year, href])
# Start from a clean output file. On a first run there is nothing to delete,
# so a missing file is not an error.
try:
    os.unlink('movies.csv')
except FileNotFoundError:
    pass
for year in range(1309, 1392):
    print("Scraping Movies (" + str(year) + ")")
    # The file is reopened in append mode for every year; the with-block
    # closes it even when scraping that year raises, so rows written for
    # earlier years are already on disk.
    with codecs.open('movies.csv', 'a', 'utf-8') as file:
        extractMoviesByYear(year, file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment