rodloboz · April 25, 2018 10:02
diff --git a/scraper.rb b/scraper.rb
 require 'open-uri'
 require 'nokogiri'
 require 'pry-byebug'
 BASE_URL = "https://www.imdb.com"

 # Scrapes the IMDb top chart page and returns absolute URLs for the first
 # movies it lists.
 #
 # @param limit [Integer] maximum number of URLs to return (defaults to 5,
 #   the value that used to be hard-coded)
 # @return [Array<String>] BASE_URL joined with each link's path, with the
 #   query string removed
 def fetch_urls(limit: 5)
  # URI.open is the open-uri entry point; Kernel#open on a URL string is
  # deprecated since Ruby 2.7 and no longer works on Ruby 3.
  html_doc = Nokogiri::HTML(URI.open("#{BASE_URL}/chart/top").read)

  # Only the first `limit` anchors are needed, so the rest of the chart is
  # never processed.
  html_doc.search('.titleColumn a').take(limit).map do |element|
    # Keep everything before the query string. An href that has no query
    # string is kept whole instead of raising on a failed regex match.
    "#{BASE_URL}#{element['href'].split('?', 2).first}"
  end
 end

 # Scrapes one IMDb movie page and extracts its basic details.
 #
 # @param url [String, nil] absolute URL of the movie page
 # @return [Hash] with :title, :year, :storyline, :director and :cast keys,
 #   or an empty hash when url is nil or empty
 # @raise [NoMethodError] when the <h1> text does not match the expected
 #   "title (year…" layout, because the match result is nil
 def get_movie_info(url)
  # nil is treated like an empty URL rather than raising on nil.empty?.
  return {} if url.nil? || url.empty?

  # URI.open is the open-uri entry point; Kernel#open on a URL string is
  # deprecated since Ruby 2.7 and no longer works on Ruby 3. The
  # Accept-Language header asks for the English version of the page.
  html_doc = Nokogiri::HTML(URI.open(url, "Accept-Language" => "en").read)

  title_string = html_doc.search('h1').text
  match_data = title_string.match(/(?<title>.+)\((?<year>\d{4}).+[[:space:]]/)

  # Each "actors" span contributes its text up to the first comma.
  cast = html_doc.search('span[itemprop="actors"]').map do |actor|
    actor.text.strip.split(',').first
  end

  {
    # The title capture ends with the separator character that precedes
    # the opening parenthesis; [0..-2] drops it.
    title: match_data[:title][0..-2],
    year: match_data[:year].to_i,
    storyline: html_doc.search('.summary_text').text.strip,
    director: html_doc.search('span[itemprop="director"]').text.strip,
    cast: cast
  }
 end

 # get_movie_info('https://www.imdb.com/title/tt0111161/')
diff --git a/scraper_spec.rb b/scraper_spec.rb
 require_relative '../scraper.rb'

 # Checks fetch_urls against a fixed list of five movie page URLs.
 # NOTE(review): the expected list presumably mirrors the chart order at the
 # time of writing — confirm before relying on this example.
 describe '#fetch_urls' do
  it "should return an array of the top 5 urls" do
    top_five = %w[
      https://www.imdb.com/title/tt0111161/
      https://www.imdb.com/title/tt0068646/
      https://www.imdb.com/title/tt0071562/
      https://www.imdb.com/title/tt0468569/
      https://www.imdb.com/title/tt0050083/
    ]

    expect(fetch_urls).to eq(top_five)
  end
 end

 # Examples for get_movie_info: the empty-URL guard and a full scrape of one
 # movie page.
 describe '#get_movie_info' do
  # Description typo fixed ("and empty" -> "an empty"); it is shown in the
  # RSpec output.
  it "should return an empty hash if passed invalid url" do
    expect(get_movie_info("")).to eq({})
  end

  it "should return a hash with the correct movie info" do
    movie_info = get_movie_info('https://www.imdb.com/title/tt0111161/')

    # Hash equality ignores key order, so the keys need not be listed in the
    # order the method builds them.
    expect(movie_info).to eq(
      title: "The Shawshank Redemption",
      year: 1994,
      cast: ['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'],
      storyline: 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
      director: 'Frank Darabont'
    )
  end
 end
	require 'open-uri'
	require 'nokogiri'
	require 'pry-byebug'
	BASE_URL = "https://www.imdb.com"

	# Scrapes the IMDb top chart page and returns absolute URLs for the first
	# movies it lists.
	#
	# @param limit [Integer] maximum number of URLs to return (defaults to 5,
	#   the value that used to be hard-coded)
	# @return [Array<String>] BASE_URL joined with each link's path, with the
	#   query string removed
	def fetch_urls(limit: 5)
	  # URI.open is the open-uri entry point; Kernel#open on a URL string is
	  # deprecated since Ruby 2.7 and no longer works on Ruby 3.
	  html_doc = Nokogiri::HTML(URI.open("#{BASE_URL}/chart/top").read)

	  # Only the first `limit` anchors are needed, so the rest of the chart is
	  # never processed.
	  html_doc.search('.titleColumn a').take(limit).map do |element|
	    # Keep everything before the query string. An href that has no query
	    # string is kept whole instead of raising on a failed regex match.
	    "#{BASE_URL}#{element['href'].split('?', 2).first}"
	  end
	end

	# Scrapes one IMDb movie page and extracts its basic details.
	#
	# @param url [String, nil] absolute URL of the movie page
	# @return [Hash] with :title, :year, :storyline, :director and :cast keys,
	#   or an empty hash when url is nil or empty
	# @raise [NoMethodError] when the <h1> text does not match the expected
	#   "title (year…" layout, because the match result is nil
	def get_movie_info(url)
	  # nil is treated like an empty URL rather than raising on nil.empty?.
	  return {} if url.nil? || url.empty?

	  # URI.open is the open-uri entry point; Kernel#open on a URL string is
	  # deprecated since Ruby 2.7 and no longer works on Ruby 3. The
	  # Accept-Language header asks for the English version of the page.
	  html_doc = Nokogiri::HTML(URI.open(url, "Accept-Language" => "en").read)

	  title_string = html_doc.search('h1').text
	  match_data = title_string.match(/(?<title>.+)\((?<year>\d{4}).+[[:space:]]/)

	  # Each "actors" span contributes its text up to the first comma.
	  cast = html_doc.search('span[itemprop="actors"]').map do |actor|
	    actor.text.strip.split(',').first
	  end

	  {
	    # The title capture ends with the separator character that precedes
	    # the opening parenthesis; [0..-2] drops it.
	    title: match_data[:title][0..-2],
	    year: match_data[:year].to_i,
	    storyline: html_doc.search('.summary_text').text.strip,
	    director: html_doc.search('span[itemprop="director"]').text.strip,
	    cast: cast
	  }
	end

	# get_movie_info('https://www.imdb.com/title/tt0111161/')
	require_relative '../scraper.rb'

	# Checks fetch_urls against a fixed list of five movie page URLs.
	# NOTE(review): the expected list presumably mirrors the chart order at the
	# time of writing — confirm before relying on this example.
	describe '#fetch_urls' do
	  it "should return an array of the top 5 urls" do
	    top_five = %w[
	      https://www.imdb.com/title/tt0111161/
	      https://www.imdb.com/title/tt0068646/
	      https://www.imdb.com/title/tt0071562/
	      https://www.imdb.com/title/tt0468569/
	      https://www.imdb.com/title/tt0050083/
	    ]

	    expect(fetch_urls).to eq(top_five)
	  end
	end

	# Examples for get_movie_info: the empty-URL guard and a full scrape of one
	# movie page.
	describe '#get_movie_info' do
	  # Description typo fixed ("and empty" -> "an empty"); it is shown in the
	  # RSpec output.
	  it "should return an empty hash if passed invalid url" do
	    expect(get_movie_info("")).to eq({})
	  end

	  it "should return a hash with the correct movie info" do
	    movie_info = get_movie_info('https://www.imdb.com/title/tt0111161/')

	    # Hash equality ignores key order, so the keys need not be listed in the
	    # order the method builds them.
	    expect(movie_info).to eq(
	      title: "The Shawshank Redemption",
	      year: 1994,
	      cast: ['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'],
	      storyline: 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
	      director: 'Frank Darabont'
	    )
	  end
	end