Skip to content

Instantly share code, notes, and snippets.

@amardaxini
Last active December 16, 2015 17:48
Show Gist options
  • Save amardaxini/5472589 to your computer and use it in GitHub Desktop.
Save amardaxini/5472589 to your computer and use it in GitHub Desktop.
Write a parser that aggregates each actor's name and the number of movies they appear in, across the top 250 IMDb movies (http://www.imdb.com/chart/top).
require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'pry'
require 'open-uri'
module Imdb
  # An actor together with the number of movies they have been seen in.
  #
  # The class itself doubles as an in-memory registry: every Person made
  # through +find_or_create+ is kept in +Person.persons+.
  class Person
    class << self
      # Array of every Person registered so far.
      attr_accessor :persons

      # Returns an Array of all registered persons whose name equals +name+
      # (empty when nobody matches).
      def find(name)
        persons.select { |person| person.name == name }
      end

      # Registers one more movie appearance for the actor called +name+.
      #
      # A new Person (count 1) is created and registered on first sight;
      # otherwise the existing record's count is incremented.
      #
      # Always returns the Person. Previously the "create" branch returned
      # the whole registry array (the result of Array#push) while the "found"
      # branch returned a Person.
      def find_or_create(name)
        person = persons.detect { |candidate| candidate.name == name }
        if person
          person.no_of_movies += 1
        else
          person = new(name)
          persons.push(person)
        end
        person
      end

      # Writes one "name | count" line per registered person, preceded by a
      # header line, to /tmp/person_movies (the file is overwritten).
      def to_print
        File.open("/tmp/person_movies", "w") do |file|
          file.write "Actor Name | No Of Movies \n"
          persons.each do |person|
            file.write "#{person.name} | #{person.no_of_movies}\n"
          end
        end
      end
    end
    @persons = []

    attr_accessor :name, :no_of_movies

    # name         - the actor's name.
    # no_of_movies - number of movies counted so far (defaults to 1).
    def initialize(name, no_of_movies = 1)
      @name = name
      @no_of_movies = no_of_movies
    end
  end
end
module Imdb
  # One row of the IMDb chart plus, once +get_cast+ has run, the cast scraped
  # from the movie's own page.
  class Movie
    attr_accessor :rank, :rating, :name, :votes, :casts, :url, :movie_doc, :persons

    # rank, rating, name, votes - cell texts taken from the chart row.
    # url                       - site-relative path of the movie page ("" when unknown).
    def initialize(rank, rating, name, votes, url = "")
      @rank = rank
      @rating = rating
      @name = name
      @votes = votes
      @url = url
      @persons = []
      @casts = []
    end

    # Builds a Movie from a chart table row.
    #
    # The texts of the first four <td> cells become rank, rating, name and
    # votes; the href of the first link in the third cell becomes the url.
    # The url is "" when that cell or link is missing (this used to be
    # achieved with a catch-all inline +rescue+).
    def self.parse_movie_row(row)
      cells = row.css("td")
      texts = cells.map(&:text)
      movie = new(texts[0], texts[1], texts[2], texts[3])
      link = cells[2] && cells[2].at_css("a[href]")
      movie.url = link ? link["href"].to_s : ""
      movie
    end

    # Downloads the movie page into +movie_doc+ and feeds every cast-table
    # row except the first one to +parse_cast_lists+.
    #
    # Does nothing (returns nil) when the movie has no url. The old
    # +if self.url+ test was always true for the default "" and would have
    # fetched the site root instead.
    def get_cast
      return if url.nil? || url.empty?

      @movie_doc = fetch_document("http://www.imdb.com" + url)
      @movie_doc.css("#titleCast .cast_list tr").drop(1).each do |cast_row|
        parse_cast_lists(cast_row)
      end
    end

    # Records one cast-table row: appends {actor => character} to +casts+ and
    # the actor's Person record to +persons+.
    #
    # The actor name is read from the 2nd <td> and the character from the
    # 4th; rows lacking either cell are skipped. Previously such rows were
    # dropped by a bare +rescue+, which also hid the fact that the Person was
    # pushed onto an unset variable and so never stored.
    def parse_cast_lists(cast_list)
      cells = cast_list.css("td")
      actor_cell = cells[1]
      character_cell = cells[3]
      return if actor_cell.nil? || character_cell.nil?

      actor = actor_cell.text.delete("\n").strip
      character = character_cell.text.delete("\n").strip
      @casts.push({ actor => character })

      # find_or_create does the counting; the record itself is fetched with
      # find so that a Person instance is what ends up in +persons+.
      Imdb::Person.find_or_create(actor)
      @persons.push(Imdb::Person.find(actor).first)
    end

    # One-line summary: "rank | rating | name || votes".
    def movie_info
      "#{rank} | #{rating} | #{name} || #{votes}"
    end

    # Multi-line cast listing: a "Casts Info" header followed by one
    # "actor | character" line per cast entry.
    def get_casts_info
      @casts.each_with_object(String.new("Casts Info\n")) do |cast, info|
        # +casts+ is publicly writable; anything that is not a Hash is
        # skipped rather than raising.
        next unless cast.is_a?(Hash)

        actor, character = cast.first
        info << "#{actor} | #{character}\n"
      end
    end

    # Writes +movie_info+ and the cast listing to /tmp/imdb_top_20_<rank>.
    def to_print
      File.open("/tmp/imdb_top_20_#{rank}", "w") do |file|
        file.write movie_info
        file.write "\n"
        file.write get_casts_info
      end
    end

    # Returns the +movie_info+ of every entry in @top_movies.
    #
    # NOTE(review): nothing in this class assigns @top_movies, so this looks
    # like it was meant for a collection class; confirm and move it. Until
    # then it returns [] instead of raising NoMethodError on nil.
    def get_basic_movie_info
      Array(@top_movies).map(&:movie_info)
    end

    private

    # Fetches +address+ through open-uri and returns the parsed HTML document.
    # URI#open is used because Kernel#open no longer accepts URLs on
    # Ruby 3.0+; the block form closes the response once it has been parsed.
    def fetch_document(address)
      URI.parse(address).open { |page| Nokogiri::HTML(page) }
    end
  end
end
module Imdb
  # The IMDb chart page, parsed into a list of Imdb::Movie objects.
  class TopMovies
    attr_accessor :top_movies, :doc

    # Fetches and parses the chart at +imdb_top_url+, then fills +top_movies+.
    #
    # imdb_top_url - an http(s)/ftp URL, or a path to a local HTML file.
    def initialize(imdb_top_url = "http://www.imdb.com/chart/top")
      @top_movies = []
      @doc = fetch_document(imdb_top_url)
      load_movies
    end

    # Rebuilds +top_movies+ from the second table under #main, turning every
    # row except the first one into a Movie.
    #
    # When that table is absent the list is left empty and a warning is
    # printed, instead of failing with NoMethodError on nil.
    def load_movies
      chart = @doc.css('#main table')[1]
      if chart.nil?
        warn "Imdb::TopMovies: chart table ('#main table' #2) not found; no movies loaded"
        return @top_movies = []
      end

      @top_movies = chart.css('tr').drop(1).map do |row|
        Imdb::Movie.parse_movie_row(row)
      end
    end

    # Calls +get_cast+ on every loaded movie, one after another.
    def get_all_casts
      @top_movies.each(&:get_cast)
    end

    private

    # Reads +location+ and returns the parsed HTML document.
    #
    # URLs go through URI#open because Kernel#open no longer accepts URLs on
    # Ruby 3.0+; anything else is opened as a local file, so a value such as
    # "|cmd" can no longer spawn a process. The block forms close the handle
    # after parsing.
    def fetch_document(location)
      address = location.to_s
      if address =~ %r{\A(?:https?|ftp)://}i
        URI.parse(address).open { |page| Nokogiri::HTML(page) }
      else
        File.open(address) { |page| Nokogiri::HTML(page) }
      end
    end
  end
end
# Script entry point: load the chart, collect the cast of every movie, write
# the per-actor totals via Imdb::Person.to_print, then open a Pry session
# (with +imdb_movies+ in scope) for interactive inspection.
imdb_movies = Imdb::TopMovies.new.tap(&:get_all_casts)
Imdb::Person.to_print
binding.pry
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment