Last active
December 16, 2015 17:48
-
-
Save amardaxini/5472589 to your computer and use it in GitHub Desktop.
Write a parser that aggregates, for each actor, the number of movies they
appear in across the top 250 IMDb movies
(http://www.imdb.com/chart/top).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'net/http' | |
require 'nokogiri' | |
require 'pry' | |
require 'open-uri' | |
module Imdb
  # A cast member together with the number of movies they have been counted in.
  #
  # The class itself doubles as an in-memory registry: every record created
  # through Person.find_or_create is kept in Person.persons.
  class Person
    # Class-level instance variable backing the registry (not shared with
    # subclasses, unlike a @@class_variable).
    @persons = []

    class << self
      # @return [Array<Person>] every person registered so far
      attr_accessor :persons

      # Looks up registered persons by exact name.
      #
      # @param name [String] name to match with ==
      # @return [Array<Person>] all matches; empty when nobody matches
      def find(name)
        persons.select { |person| person.name == name }
      end

      # Counts one more movie appearance for +name+.
      #
      # A new record starts at one movie; an existing record has its
      # counter incremented by one.
      #
      # @param name [String] name of the person
      # @return [Person] the created or updated record (always a Person,
      #   never the registry array)
      def find_or_create(name)
        person = persons.find { |candidate| candidate.name == name }
        if person
          person.no_of_movies += 1
        else
          person = new(name)
          persons.push(person)
        end
        person
      end

      # Writes a "name | count" table of every registered person.
      #
      # @param path [String] file to (over)write; defaults to the location
      #   the script has always used
      # @return [Array<Person>] the registry that was written
      def to_print(path = "/tmp/person_movies")
        File.open(path, "w") do |file|
          file.write "Actor Name | No Of Movies \n"
          persons.each do |person|
            file.write "#{person.name} | #{person.no_of_movies}\n"
          end
        end
      end
    end

    attr_accessor :name, :no_of_movies

    # @param name [String] name of the person
    # @param no_of_movies [Integer] initial appearance count
    def initialize(name, no_of_movies = 1)
      @name = name
      @no_of_movies = no_of_movies
    end
  end
end
module Imdb
  # One row of the IMDb chart plus the cast scraped from the movie's own page.
  class Movie
    # Host prepended to the relative link stored in #url.
    BASE_URL = "http://www.imdb.com".freeze

    attr_accessor :rank, :rating, :name, :votes, :casts, :url, :movie_doc, :persons

    # @param rank [String] text of the rank cell
    # @param rating [String] text of the rating cell
    # @param name [String] text of the title cell
    # @param votes [String] text of the votes cell
    # @param url [String] link to the movie page, relative to BASE_URL
    def initialize(rank, rating, name, votes, url = "")
      @rank = rank
      @rating = rating
      @name = name
      @votes = votes
      @url = url
      @persons = []
      @casts = []
    end

    # Builds a Movie from one chart row.
    #
    # The first four <td> texts become rank, rating, name and votes; the
    # href of the first link in the third cell becomes the url ("" when the
    # cell or the link is missing).
    #
    # @param row [#css] a table row node
    # @return [Movie]
    def self.parse_movie_row(row)
      cells = row.css("td")
      movie = new(*cells.map(&:text).values_at(0, 1, 2, 3))
      link = cells[2]&.at_css("a[href]")
      movie.url = link ? link["href"].to_s : ""
      movie
    end

    # Downloads the movie page and records every cast row found in
    # "#titleCast .cast_list".
    #
    # Nothing is fetched when the movie has no url (nil or empty).
    #
    # @return [Array<Hash>] the casts collected so far
    def get_cast
      return @casts if url.to_s.empty?

      @movie_doc = fetch_document(BASE_URL + url)
      # The first row is skipped — presumably the table header; confirm
      # against the page markup.
      @movie_doc.css("#titleCast .cast_list tr").drop(1).each do |cast_list|
        parse_cast_lists(cast_list)
      end
      @casts
    end

    # Records one cast row: the second cell is the actor name, the fourth
    # the character. Rows without both cells are ignored.
    #
    # @param cast_list [#css] a row of the cast table
    # @return [void]
    def parse_cast_lists(cast_list)
      cells = cast_list.css("td")
      name_cell = cells[1]
      character_cell = cells[3]
      return unless name_cell && character_cell

      actor = clean_text(name_cell.text)
      character = clean_text(character_cell.text)
      @casts.push({ actor => character })
      # find_or_create is called for its counting side effect; the record is
      # then fetched through find so this line does not depend on what
      # find_or_create happens to return.
      Imdb::Person.find_or_create(actor)
      @persons.push(Imdb::Person.find(actor).first)
    end

    # @return [String] one-line summary: rank, rating, name and votes
    def movie_info
      "#{rank} | #{rating} | #{name} || #{votes}"
    end

    # @return [String] a "Casts Info" header followed by one
    #   "actor | character" line per recorded cast entry
    def get_casts_info
      lines = @casts.map { |cast| "#{cast.keys.first} | #{cast.values.first}\n" }
      "Casts Info\n" + lines.join
    end

    # Writes the summary line and the cast listing to a file.
    #
    # @param path [String] file to (over)write; defaults to the per-rank
    #   location the script has always used
    def to_print(path = "/tmp/imdb_top_20_#{rank}")
      File.open(path, "w") do |file|
        file.write movie_info
        file.write "\n"
        file.write get_casts_info
      end
    end

    # NOTE(review): @top_movies is never assigned on a Movie in this file, so
    # this previously always raised NoMethodError on nil. It now returns an
    # empty array in that case. The method looks like it was meant for the
    # chart class — confirm before relying on it.
    #
    # @return [Array<String>] movie_info of each entry in @top_movies
    def get_basic_movie_info
      Array(@top_movies).map(&:movie_info)
    end

    private

    # Removes newlines and surrounding whitespace from scraped cell text.
    def clean_text(text)
      text.delete("\n").strip
    end

    # Fetches and parses a page. URI.open is used because Kernel#open no
    # longer opens URLs on Ruby 3.0+; the block form closes the handle.
    def fetch_document(page_url)
      URI.open(page_url) { |io| Nokogiri::HTML(io) }
    end
  end
end
module Imdb
  # The IMDb chart page parsed into a list of movies.
  class TopMovies
    attr_accessor :top_movies, :doc

    # Downloads and parses the chart page, then loads its movies.
    #
    # @param imdb_top_url [String] address of the chart page
    def initialize(imdb_top_url = "http://www.imdb.com/chart/top")
      @top_movies = []
      # URI.open is used because Kernel#open no longer opens URLs on
      # Ruby 3.0+; the block form closes the handle once parsing is done.
      @doc = URI.open(imdb_top_url) { |io| Nokogiri::HTML(io) }
      load_movies
    end

    # Rebuilds top_movies from the second table under "#main", skipping that
    # table's first row (presumably the header — confirm against the page).
    #
    # When the table is missing a warning is printed and the list is left
    # empty instead of failing with a NoMethodError on nil.
    #
    # @return [Array] the loaded movies
    def load_movies
      chart = @doc.css('#main table')[1]
      if chart.nil?
        warn "Imdb::TopMovies: chart table not found under '#main table'"
        return @top_movies = []
      end

      @top_movies = chart.css('tr').drop(1).map do |row|
        Imdb::Movie.parse_movie_row(row)
      end
    end

    # Asks every loaded movie to fetch its cast (one request per movie).
    #
    # @return [Array] the loaded movies
    def get_all_casts
      @top_movies.each(&:get_cast)
    end

    # @return [Array<String>] movie_info of every loaded movie
    def get_basic_movie_info
      @top_movies.map(&:movie_info)
    end
  end
end
# Script entry point: scrape the chart and every movie's cast, then write the
# per-actor movie counts.
imdb_movies = Imdb::TopMovies.new.tap(&:get_all_casts)
Imdb::Person.to_print
# Open a Pry session; `imdb_movies` remains in scope for inspection.
binding.pry
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment