Created
December 8, 2012 22:23
-
-
Save zaghaghi/4242245 to your computer and use it in GitHub Desktop.
Scrape Iranian movies from sourehcinema.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
138109110000,آبي و رابي,1309,http://www.sourehcinema.com/Title/Title.aspx?id=138109110000 | |
138109110001,انتقام برادر,1310,http://www.sourehcinema.com/Title/Title.aspx?id=138109110001 | |
138109110002,حاجي آقا اكتور سينما,1312,http://www.sourehcinema.com/Title/Title.aspx?id=138109110002 | |
138109110003,دختر لر,1312,http://www.sourehcinema.com/Title/Title.aspx?id=138109110003 | |
138109110051,بوالهوس,1313,http://www.sourehcinema.com/Title/Title.aspx?id=138109110051 | |
138212170000,شيرين و فرهاد,1313,http://www.sourehcinema.com/Title/Title.aspx?id=138212170000 | |
138109110006,فردوسي,1313,http://www.sourehcinema.com/Title/Title.aspx?id=138109110006 | |
138103110007,چشم هاي سياه,1315,http://www.sourehcinema.com/Title/Title.aspx?id=138103110007 | |
138208262000,ليلي و مجنون,1316,http://www.sourehcinema.com/Title/Title.aspx?id=138208262000 | |
138109110009,زنداني امير,1327,http://www.sourehcinema.com/Title/Title.aspx?id=138109110009 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import codecs
import csv
import os
import re
from urllib import request

from bs4 import BeautifulSoup
# Persian (U+06F0-U+06F9) and Arabic-Indic (U+0660-U+0669) digits mapped to
# ASCII.  Built from code points instead of literals so this source contains
# no right-to-left characters that editors may display in a confusing order.
_FA_EN_DIGITS = {
    chr(base + value): str(value)
    for base in (0x06F0, 0x0660)
    for value in range(10)
}


def fa2en(fanum):
    """Return *fanum* with Persian/Arabic-Indic digits replaced by ASCII digits.

    Characters that are not one of those digits (ASCII digits, punctuation,
    letters) are passed through unchanged instead of raising ``KeyError``,
    so one unexpected character in scraped text does not abort the caller.

    Args:
        fanum: String (or any iterable of single characters) to convert.

    Returns:
        A new ``str``; the empty string for empty input.
    """
    return ''.join(_FA_EN_DIGITS.get(ch, ch) for ch in fanum)
def extractMoviesByYear(myear, file):
    """Scrape the title listing of one year and write its movies to *file*.

    Fetches the sourehcinema.com "titles by year" page for ``myear``, selects
    every anchor whose ``href`` points at a title page, and writes one CSV
    row per anchor: ``id,name,year,href`` terminated by CRLF.

    The anchor text is split on whitespace: all tokens except the last two
    form the name, and the second-to-last token with its first character
    dropped is converted by ``fa2en`` to give the year.  The last token is
    ignored.

    Args:
        myear: Year to request; inserted into the ``Year`` query parameter
            via ``str()``.
        file: Writable text file object that receives the rows.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (``HTTPError``
            for HTTP error statuses).
    """
    url = ('http://www.sourehcinema.com/Years/TitleByYear.aspx?Year='
           + str(myear) + '&FilmName=All')
    # Raw string: '\.' and '\d' are invalid escapes in a normal literal.
    title_href = re.compile(
        r'http://www\.sourehcinema\.com/Title/Title\.aspx\?id=\d+')

    # urlopen replaces the deprecated FancyURLopener; the context manager
    # closes the connection once the document has been parsed.
    with request.urlopen(url) as response:
        # Naming the parser keeps results identical across machines and
        # avoids bs4's "no parser was explicitly specified" warning.
        soup = BeautifulSoup(response, 'html.parser')

    # csv.writer quotes names containing commas or quotes, which the
    # hand-built row would have written as a corrupt record.
    writer = csv.writer(file, lineterminator='\r\n')
    for title in soup('a', {'href': title_href}):
        words = title.text.split()
        if len(words) < 2:
            # Anchor without a "<name> <year> <suffix>" caption (for example
            # an image link): there is no year token to extract.
            continue
        href = title['href']
        movie_id = int(href.split('=')[-1])
        name = ' '.join(words[:-2])
        year = fa2en(words[-2][1:])
        writer.writerow([movie_id, name, year, href])
# Scrape every year in range(1309, 1392) into a fresh movies.csv.
# NOTE(review): the years are presumably Solar Hijri calendar years - confirm.
#
# Opening once in 'w' mode truncates any previous output, so no separate
# os.unlink() is needed (it raised FileNotFoundError when the file did not
# exist yet), and the with-block closes the file even if a request fails.
with codecs.open('movies.csv', 'w', 'utf-8') as csv_file:
    for year in range(1309, 1392):
        print("Scraping Movies (" + str(year) + ")")
        extractMoviesByYear(year, csv_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment