quickgrid · April 14, 2016 16:27
diff --git a/imdbMovieSearchRatingExtract.sh b/imdbMovieSearchRatingExtract.sh
 #!/bin/bash

 #===============================================================================#
 # Author:	Asif Ahmed		                                        #
 # Version:	0.2					                        #
 # Site:         http://quickgrid.blogspot.com                                   #
 # Description:  IMDB Movie Search and Rating Extraction	                        #
 # Note:		This code is highly dependent on the current page structure or, #
 #	      	html design of IMDB. If it changes the code will break.         #
 #===============================================================================#


 # Percent-encode a string so it can be used as a URL query value.
 # Arguments: $1 - text to encode
 # Outputs:   the encoded text on stdout
 urlencode_query() {
 	# Work byte by byte so multi-byte characters are encoded one byte at a time
 	local LC_ALL=C
 	local text=$1 encoded="" char hex i

 	for (( i = 0; i < ${#text}; i++ )); do
 		char=${text:i:1}
 		case "$char" in
 			[a-zA-Z0-9.~_-]) encoded+=$char ;;
 			*) printf -v hex '%%%02X' "'$char"; encoded+=$hex ;;
 		esac
 	done

 	printf '%s' "$encoded"
 }

 # Read the movie name; -r keeps backslashes in the input literal
 read -r searchText

 # Without a movie name there is nothing to search for
 if [ -z "$searchText" ]; then
 	echo "No movie name given, nothing to search for." >&2
 	exit 1
 fi

 # A slash in the movie name would otherwise be taken as a directory separator
 searchPageFile="${searchText//\//_}-search.html"

 # Download the result page for the specified search. The query is
 # percent-encoded so that characters such as '&', '#' or '+' are sent as part
 # of the movie name instead of changing the meaning of the URL.
 if ! wget -O "$searchPageFile" "http://www.imdb.com/find?q=$(urlencode_query "$searchText")"; then
 	echo "Could not download the search results for: $searchText" >&2
 	exit 1
 fi

 # Keep only the lines from the one matching "Titles</h3>" up to the one matching
 # "findMoreMatches" and write them to another file to avoid link filename mismatch
 sed -e '/Titles<\/h3>/,/findMoreMatches/!d' "$searchPageFile" > "partialContentFile.txt"



 # Name of the file that collects the movie links
 writeLinksFileName="filesToDownload.txt"

 # Every extraction below reads the trimmed search result excerpt
 resultExcerpt="partialContentFile.txt"

 # Regular expressions for the three pieces of data that are pulled out
 linkPattern="\/title\/[a-zA-Z0-9]+\/"
 namePattern="(?<=>)([a-zA-Z0-9&: _-]+)(?=<\/a>[\(\) a-zA-Z0-9 _-]*\([0-9]+\))"
 yearPattern="(?<=<\/a> )(\([0-9]+\))(?= )"

 # Movie links: whole-word /title/<id>/ matches, one per output line
 grep --extended-regexp --word-regexp --only-matching "$linkPattern" "$resultExcerpt" > "$writeLinksFileName"

 # Movie names: the link text that is followed by a year in parentheses
 grep --perl-regexp --only-matching "$namePattern" "$resultExcerpt" > "movieNames.txt"

 # Movie years: the parenthesised number right after a closing link tag
 grep --perl-regexp --only-matching "$yearPattern" "$resultExcerpt" > "movieYears.txt"

 # Pair every movie name with its year and write one "name year" line per pair.
 # Arguments: $1 - file with one movie name per line
 #            $2 - file with one movie year per line
 #            $3 - output file (emptied first, even when an input is missing)
 # Pairing stops at the end of the shorter input, so a name is never combined
 # with a missing year or the other way round.
 merge_names_and_years() {
 	local namesFile=$1 yearsFile=$2 outFile=$3
 	local movieName movieYear

 	# Use different file descriptors to read from and work with two files.
 	# Both reads are joined with && because with ';' only the status of the
 	# second read would decide whether the loop keeps running.
 	while read -r -u3 movieName && read -r -u4 movieYear
 	do
 		printf '%s %s\n' "$movieName" "$movieYear"
 	done > "$outFile" 3<"$namesFile" 4<"$yearsFile"
 }

 merge_names_and_years "movieNames.txt" "movieYears.txt" "movieNameYear.txt"


 # Load the "name year" lines of a file into movieNameYear_array, turning every
 # space into an underscore so each entry can be used as a file name.
 # Arguments: $1 - file with one "name year" entry per line
 # Globals:   movieNameYear_array (reset and filled), j (number of entries)
 load_movie_file_names() {
 	local listFile=$1 entry

 	movieNameYear_array=()
 	j=0

 	# -r keeps backslashes literal; surrounding whitespace is still trimmed
 	while read -r entry
 	do
 		# Replace file name spaces with underscore
 		movieNameYear_array[j]=${entry// /_}
 		j=$(( j + 1 ))
 	done < "$listFile"
 }

 # Read from the file that was written to
 load_movie_file_names "movieNameYear.txt"



 # Each title link is expected to show up twice in the link file (the original
 # note about the duplication was left unfinished), so only every second line
 # (index 0, 2, 4, ...) is downloaded.

 moviefoldername=movies
 # -p: do not fail when the folder is left over from an earlier run
 mkdir -p "$moviefoldername"

 i=0
 k=0

 # -r keeps backslashes in the link lines literal
 while read -r line
 do
 	# Temporary fix when file name or file year was not extracted correctly:
 	# stop as soon as every extracted name has been used. A missing count is
 	# treated as zero names.
 	if [ "$k" -ge "${j:-0}" ]; then
 		break
 	fi

 	if [ $(( i % 2 )) -eq 0 ]; then

 		# Each of the resultant files are downloaded here, the rating is
 		# extracted from them later on
 		wget -O "$moviefoldername/${movieNameYear_array[k]}" "http://www.imdb.com$line"
 		k=$(( k + 1 ))

 	fi

 	i=$(( i + 1 ))

 done < "$writeLinksFileName"


 # Print the rating found in every downloaded movie page of a folder.
 # Arguments: $1 - folder that holds the downloaded movie pages
 # Outputs:   for each file a "Rating of: <file>" line, the matched rating
 #            value(s) and a separator line on stdout
 # Returns:   1 when the folder name is empty or not a directory
 print_movie_ratings() {
 	local folder=$1 pagePath fileName

 	if [ -z "$folder" ] || [ ! -d "$folder" ]; then
 		echo "Movie folder not found: $folder" >&2
 		return 1
 	fi

 	# Glob instead of parsing the output of ls, so file names that contain
 	# whitespace are handled as one name
 	for pagePath in "$folder"/*
 	do
 		# An empty folder leaves the pattern unexpanded; skip anything that
 		# is not a regular file
 		[ -f "$pagePath" ] || continue
 		fileName=${pagePath##*/}

 		# Sample rating tag block
 		#<span itemprop="ratingValue">6.4</span></strong>

 		echo "Rating of: $fileName"
 		# One or more digits, optional dot, optional decimals ("6.4", "7", "10.0")
 		grep -P -o "(?<=<span itemprop=\"ratingValue\">)([0-9]+[.]?[0-9]*)(?=<\/span><\/strong>)" "$pagePath"
 		echo "==================="

 	done
 }

 # Now print the ratings of the files in the movies directory
 print_movie_ratings "$moviefoldername"
	#!/bin/bash

	#===============================================================================#
	# Author: Asif Ahmed #
	# Version: 0.2 #
	# Site: http://quickgrid.blogspot.com #
	# Description: IMDB Movie Search and Rating Extraction #
	# Note: This code is highly dependent on the current page structure or, #
	# html design of IMDB. If it changes the code will break. #
	#===============================================================================#


	# Percent-encode a string so it can be used as a URL query value.
	# Arguments: $1 - text to encode
	# Outputs:   the encoded text on stdout
	urlencode_query() {
		# Work byte by byte so multi-byte characters are encoded one byte at a time
		local LC_ALL=C
		local text=$1 encoded="" char hex i

		for (( i = 0; i < ${#text}; i++ )); do
			char=${text:i:1}
			case "$char" in
				[a-zA-Z0-9.~_-]) encoded+=$char ;;
				*) printf -v hex '%%%02X' "'$char"; encoded+=$hex ;;
			esac
		done

		printf '%s' "$encoded"
	}

	# Read the movie name; -r keeps backslashes in the input literal
	read -r searchText

	# Without a movie name there is nothing to search for
	if [ -z "$searchText" ]; then
		echo "No movie name given, nothing to search for." >&2
		exit 1
	fi

	# A slash in the movie name would otherwise be taken as a directory separator
	searchPageFile="${searchText//\//_}-search.html"

	# Download the result page for the specified search. The query is
	# percent-encoded so that characters such as '&', '#' or '+' are sent as part
	# of the movie name instead of changing the meaning of the URL.
	if ! wget -O "$searchPageFile" "http://www.imdb.com/find?q=$(urlencode_query "$searchText")"; then
		echo "Could not download the search results for: $searchText" >&2
		exit 1
	fi

	# Keep only the lines from the one matching "Titles</h3>" up to the one matching
	# "findMoreMatches" and write them to another file to avoid link filename mismatch
	sed -e '/Titles<\/h3>/,/findMoreMatches/!d' "$searchPageFile" > "partialContentFile.txt"



	# Name of the file that collects the movie links
	writeLinksFileName="filesToDownload.txt"

	# Every extraction below reads the trimmed search result excerpt
	resultExcerpt="partialContentFile.txt"

	# Regular expressions for the three pieces of data that are pulled out
	linkPattern="\/title\/[a-zA-Z0-9]+\/"
	namePattern="(?<=>)([a-zA-Z0-9&: _-]+)(?=<\/a>[\(\) a-zA-Z0-9 _-]*\([0-9]+\))"
	yearPattern="(?<=<\/a> )(\([0-9]+\))(?= )"

	# Movie links: whole-word /title/<id>/ matches, one per output line
	grep --extended-regexp --word-regexp --only-matching "$linkPattern" "$resultExcerpt" > "$writeLinksFileName"

	# Movie names: the link text that is followed by a year in parentheses
	grep --perl-regexp --only-matching "$namePattern" "$resultExcerpt" > "movieNames.txt"

	# Movie years: the parenthesised number right after a closing link tag
	grep --perl-regexp --only-matching "$yearPattern" "$resultExcerpt" > "movieYears.txt"

	# Pair every movie name with its year and write one "name year" line per pair.
	# Arguments: $1 - file with one movie name per line
	#            $2 - file with one movie year per line
	#            $3 - output file (emptied first, even when an input is missing)
	# Pairing stops at the end of the shorter input, so a name is never combined
	# with a missing year or the other way round.
	merge_names_and_years() {
		local namesFile=$1 yearsFile=$2 outFile=$3
		local movieName movieYear

		# Use different file descriptors to read from and work with two files.
		# Both reads are joined with && because with ';' only the status of the
		# second read would decide whether the loop keeps running.
		while read -r -u3 movieName && read -r -u4 movieYear
		do
			printf '%s %s\n' "$movieName" "$movieYear"
		done > "$outFile" 3<"$namesFile" 4<"$yearsFile"
	}

	merge_names_and_years "movieNames.txt" "movieYears.txt" "movieNameYear.txt"


	# Load the "name year" lines of a file into movieNameYear_array, turning every
	# space into an underscore so each entry can be used as a file name.
	# Arguments: $1 - file with one "name year" entry per line
	# Globals:   movieNameYear_array (reset and filled), j (number of entries)
	load_movie_file_names() {
		local listFile=$1 entry

		movieNameYear_array=()
		j=0

		# -r keeps backslashes literal; surrounding whitespace is still trimmed
		while read -r entry
		do
			# Replace file name spaces with underscore
			movieNameYear_array[j]=${entry// /_}
			j=$(( j + 1 ))
		done < "$listFile"
	}

	# Read from the file that was written to
	load_movie_file_names "movieNameYear.txt"



	# Each title link is expected to show up twice in the link file (the original
	# note about the duplication was left unfinished), so only every second line
	# (index 0, 2, 4, ...) is downloaded.

	moviefoldername=movies
	# -p: do not fail when the folder is left over from an earlier run
	mkdir -p "$moviefoldername"

	i=0
	k=0

	# -r keeps backslashes in the link lines literal
	while read -r line
	do
		# Temporary fix when file name or file year was not extracted correctly:
		# stop as soon as every extracted name has been used. A missing count is
		# treated as zero names.
		if [ "$k" -ge "${j:-0}" ]; then
			break
		fi

		if [ $(( i % 2 )) -eq 0 ]; then

			# Each of the resultant files are downloaded here, the rating is
			# extracted from them later on
			wget -O "$moviefoldername/${movieNameYear_array[k]}" "http://www.imdb.com$line"
			k=$(( k + 1 ))

		fi

		i=$(( i + 1 ))

	done < "$writeLinksFileName"


	# Print the rating found in every downloaded movie page of a folder.
	# Arguments: $1 - folder that holds the downloaded movie pages
	# Outputs:   for each file a "Rating of: <file>" line, the matched rating
	#            value(s) and a separator line on stdout
	# Returns:   1 when the folder name is empty or not a directory
	print_movie_ratings() {
		local folder=$1 pagePath fileName

		if [ -z "$folder" ] || [ ! -d "$folder" ]; then
			echo "Movie folder not found: $folder" >&2
			return 1
		fi

		# Glob instead of parsing the output of ls, so file names that contain
		# whitespace are handled as one name
		for pagePath in "$folder"/*
		do
			# An empty folder leaves the pattern unexpanded; skip anything that
			# is not a regular file
			[ -f "$pagePath" ] || continue
			fileName=${pagePath##*/}

			# Sample rating tag block
			#<span itemprop="ratingValue">6.4</span></strong>

			echo "Rating of: $fileName"
			# One or more digits, optional dot, optional decimals ("6.4", "7", "10.0")
			grep -P -o "(?<=<span itemprop=\"ratingValue\">)([0-9]+[.]?[0-9]*)(?=<\/span><\/strong>)" "$pagePath"
			echo "==================="

		done
	}

	# Now print the ratings of the files in the movies directory
	print_movie_ratings "$moviefoldername"