iandundas · February 21, 2014 13:24 · iandundas · Feb 21, 2014
diff --git a/film_list b/film_list
 The East (2013)
 Shutter island
 Stephen Fry: The Secret Life of the Manic Depressive
 The Departed
 Alan Partridge: Alpha Papa
 Only God Forgives
 You've got mail
 The Draughtsman's Contract
 The Deal
 Winter's Bone
 A World Apart (1988)
 We need to talk about Kevin
diff --git a/Gemfile b/Gemfile
 source 'https://rubygems.org'
 gem 'nokogiri'
diff --git a/IMDB.rb b/IMDB.rb
 require 'nokogiri'
 require 'open-uri'

 @films=[]
 @not_found=[]

 # Searches IMDB for film_name and returns the href of the first search result,
 # but only when that result's title equals the requested title.
 #
 # The comparison ignores case, ignores a "(1999)"-style year in film_name, and
 # also accepts the result when it equals the requested title prefixed with "The ".
 #
 # film_name - the title to search for (String).
 #
 # Returns the href String of the first result, or nil when there are no
 # results or the first result's title does not match.
 def get_film_path(film_name)
   # Form-encode the title so characters such as '&' or '#' stay inside the q
   # parameter (URI.escape left them untouched and no longer exists in Ruby 3).
   film_search_url = "http://www.imdb.com/find?q=#{URI.encode_www_form_component(film_name)}&s=all"
   # open-uri adds #open to HTTP URIs; Kernel#open no longer accepts URLs on Ruby 3.
   doc = Nokogiri::HTML(URI.parse(film_search_url).open)

   first_result = doc.xpath('//*[@id="main"]/div/div[2]/table/tr/td[2]/a').first
   return unless first_result

   found_title = first_result.content
   # strip out (1999) before comparing
   wanted_title = film_name.sub(/\(\d+\)/, '').strip
   # also try the title with 'The ' in front
   candidates = [wanted_title, "The #{wanted_title}"]

   first_result.attr('href') if candidates.any? { |title| found_title.casecmp(title) == 0 }
 end
 # Fetches the IMDB page at path and scrapes the film's score, name and genres.
 #
 # path - site-relative path of the page, e.g. an href returned by
 #        get_film_path (String).
 #
 # Returns a Hash with:
 #   :score  - text of the score node, or nil when the page has none
 #   :name   - text of the title node, or nil when the page has none
 #   :genres - the nodes matched by the genre XPath (nodes, not Strings)
 def get_film_meta(path)
   # URI.join replaces string concatenation + URI.escape (removed in Ruby 3).
   # NOTE(review): assumes path is already a valid URI path, as hrefs taken
   # from the search page should be - confirm.
   film_page = URI.join('http://www.imdb.com', path)
   # open-uri adds #open to HTTP URIs; Kernel#open no longer accepts URLs on Ruby 3.
   doc = Nokogiri::HTML(film_page.open)

   score_node = doc.xpath('//*[@id="overview-top"]/div[3]/div[1]').first
   name_node = doc.xpath('//*[@id="overview-top"]/h1/span[1]').first

   {
     :score => score_node && score_node.content,
     # Guarded like the score: a page without the title node yields nil
     # instead of raising NoMethodError.
     :name => name_node && name_node.content,
     :genres => doc.xpath('//span[@itemprop="genre"]')
   }
 end

 # Searches IMDB for film_name and returns the titles of all search results.
 #
 # film_name - the title to search for (String).
 #
 # Returns an Array with the text of every result link (empty when the
 # search page has none).
 def find_alternatives(film_name)
   # Form-encode the title so characters such as '&' or '#' stay inside the q
   # parameter (URI.escape left them untouched and no longer exists in Ruby 3).
   film_search_url = "http://www.imdb.com/find?q=#{URI.encode_www_form_component(film_name)}&s=all"
   # open-uri adds #open to HTTP URIs; Kernel#open no longer accepts URLs on Ruby 3.
   doc = Nokogiri::HTML(URI.parse(film_search_url).open)

   doc.xpath('//*[@id="main"]/div/div[2]/table/tr/td[2]/a').map(&:content)
 end

 # Reports a film that could not be resolved: prints the alternatives returned
 # by find_alternatives and records the title in @not_found.
 #
 # film_name - the title that was looked up (String).
 #
 # Returns @not_found (the result of the push).
 def film_not_found(film_name)
   alternatives = find_alternatives(film_name)
   # A bare `p` prints nothing; `puts` emits the blank separator line.
   puts
   p " Didn't find '#{film_name}', did you mean:"
   alternatives.each { |film| p "    #{film}" }
   puts
   @not_found.push film_name
 end

 # Looks up every title listed in the 'film_list' file (one title per line).
 # Films with a score are collected in @films; everything else is reported
 # through film_not_found.
 File.readlines('film_list').each do |raw_film_name|
   raw_film_name.strip!
   # A blank line would otherwise trigger a search for an empty title.
   next if raw_film_name.empty?

   path = get_film_path(raw_film_name)
   # Only continue if the path is for a film title.
   unless path && path.include?('/title/')
     film_not_found raw_film_name
     next
   end

   meta = get_film_meta(path)
   score = meta[:score]

   p "#{meta[:name]}: #{score}"

   if score
     @films.push meta
   else
     film_not_found raw_film_name
   end
 end


 puts
 puts
 puts "<<<< RESULTS >>>> "

 # Rank the collected films, best score first. Scores are stored as scraped
 # text, so compare them as numbers; a plain string comparison puts "10"
 # below "9".
 # NOTE(review): assumes the score text is numeric, e.g. "7.1" - confirm
 # against the page markup.
 @films = @films.sort_by { |film| film[:score].to_f }
 @films.reverse_each do |film|
   # :genres holds the scraped nodes, not Strings, so take each node's text
   # (concatenating "#" with a node raises TypeError).
   genres = film[:genres] || []
   genres_string = genres.map { |genre| "##{genre.content}" }.join(' ')
   p "#{film[:name]}: #{film[:score]} #{genres_string}"
 end

 puts
 puts
 puts "=== Not Found... ==="
 @not_found.each { |film| p film }

 puts
 puts
 puts "=== FINISHED === "
	The East (2013)
	Shutter island
	Stephen Fry: The Secret Life of the Manic Depressive
	The Departed
	Alan Partridge: Alpha Papa
	Only God Forgives
	You've got mail
	The Draughtsman's Contract
	The Deal
	Winter's Bone
	A World Apart (1988)
	We need to talk about Kevin
	require 'nokogiri'
	require 'open-uri'

	@films=[]
	@not_found=[]

	# Searches IMDB for film_name and returns the href of the first search result,
	# but only when that result's title equals the requested title.
	#
	# The comparison ignores case, ignores a "(1999)"-style year in film_name, and
	# also accepts the result when it equals the requested title prefixed with "The ".
	#
	# film_name - the title to search for (String).
	#
	# Returns the href String of the first result, or nil when there are no
	# results or the first result's title does not match.
	def get_film_path(film_name)
	  # Form-encode the title so characters such as '&' or '#' stay inside the q
	  # parameter (URI.escape left them untouched and no longer exists in Ruby 3).
	  film_search_url = "http://www.imdb.com/find?q=#{URI.encode_www_form_component(film_name)}&s=all"
	  # open-uri adds #open to HTTP URIs; Kernel#open no longer accepts URLs on Ruby 3.
	  doc = Nokogiri::HTML(URI.parse(film_search_url).open)

	  first_result = doc.xpath('//*[@id="main"]/div/div[2]/table/tr/td[2]/a').first
	  return unless first_result

	  found_title = first_result.content
	  # strip out (1999) before comparing
	  wanted_title = film_name.sub(/\(\d+\)/, '').strip
	  # also try the title with 'The ' in front
	  candidates = [wanted_title, "The #{wanted_title}"]

	  first_result.attr('href') if candidates.any? { |title| found_title.casecmp(title) == 0 }
	end
	# Fetches the IMDB page at path and scrapes the film's score, name and genres.
	#
	# path - site-relative path of the page, e.g. an href returned by
	#        get_film_path (String).
	#
	# Returns a Hash with:
	#   :score  - text of the score node, or nil when the page has none
	#   :name   - text of the title node, or nil when the page has none
	#   :genres - the nodes matched by the genre XPath (nodes, not Strings)
	def get_film_meta(path)
	  # URI.join replaces string concatenation + URI.escape (removed in Ruby 3).
	  # NOTE(review): assumes path is already a valid URI path, as hrefs taken
	  # from the search page should be - confirm.
	  film_page = URI.join('http://www.imdb.com', path)
	  # open-uri adds #open to HTTP URIs; Kernel#open no longer accepts URLs on Ruby 3.
	  doc = Nokogiri::HTML(film_page.open)

	  score_node = doc.xpath('//*[@id="overview-top"]/div[3]/div[1]').first
	  name_node = doc.xpath('//*[@id="overview-top"]/h1/span[1]').first

	  {
	    :score => score_node && score_node.content,
	    # Guarded like the score: a page without the title node yields nil
	    # instead of raising NoMethodError.
	    :name => name_node && name_node.content,
	    :genres => doc.xpath('//span[@itemprop="genre"]')
	  }
	end

	# Searches IMDB for film_name and returns the titles of all search results.
	#
	# film_name - the title to search for (String).
	#
	# Returns an Array with the text of every result link (empty when the
	# search page has none).
	def find_alternatives(film_name)
	  # Form-encode the title so characters such as '&' or '#' stay inside the q
	  # parameter (URI.escape left them untouched and no longer exists in Ruby 3).
	  film_search_url = "http://www.imdb.com/find?q=#{URI.encode_www_form_component(film_name)}&s=all"
	  # open-uri adds #open to HTTP URIs; Kernel#open no longer accepts URLs on Ruby 3.
	  doc = Nokogiri::HTML(URI.parse(film_search_url).open)

	  doc.xpath('//*[@id="main"]/div/div[2]/table/tr/td[2]/a').map(&:content)
	end

	# Reports a film that could not be resolved: prints the alternatives returned
	# by find_alternatives and records the title in @not_found.
	#
	# film_name - the title that was looked up (String).
	#
	# Returns @not_found (the result of the push).
	def film_not_found(film_name)
	  alternatives = find_alternatives(film_name)
	  # A bare `p` prints nothing; `puts` emits the blank separator line.
	  puts
	  p " Didn't find '#{film_name}', did you mean:"
	  alternatives.each { |film| p " #{film}" }
	  puts
	  @not_found.push film_name
	end

	# Looks up every title listed in the 'film_list' file (one title per line).
	# Films with a score are collected in @films; everything else is reported
	# through film_not_found.
	File.readlines('film_list').each do |raw_film_name|
	  raw_film_name.strip!
	  # A blank line would otherwise trigger a search for an empty title.
	  next if raw_film_name.empty?

	  path = get_film_path(raw_film_name)
	  # Only continue if the path is for a film title.
	  unless path && path.include?('/title/')
	    film_not_found raw_film_name
	    next
	  end

	  meta = get_film_meta(path)
	  score = meta[:score]

	  p "#{meta[:name]}: #{score}"

	  if score
	    @films.push meta
	  else
	    film_not_found raw_film_name
	  end
	end


	puts
	puts
	puts "<<<< RESULTS >>>> "

	# Rank the collected films, best score first. Scores are stored as scraped
	# text, so compare them as numbers; a plain string comparison puts "10"
	# below "9".
	# NOTE(review): assumes the score text is numeric, e.g. "7.1" - confirm
	# against the page markup.
	@films = @films.sort_by { |film| film[:score].to_f }
	@films.reverse_each do |film|
	  # :genres holds the scraped nodes, not Strings, so take each node's text
	  # (concatenating "#" with a node raises TypeError).
	  genres = film[:genres] || []
	  genres_string = genres.map { |genre| "##{genre.content}" }.join(' ')
	  p "#{film[:name]}: #{film[:score]} #{genres_string}"
	end

	puts
	puts
	puts "=== Not Found... ==="
	@not_found.each { |film| p film }

	puts
	puts
	puts "=== FINISHED === "