Last active
April 14, 2016 16:27
-
-
Save quickgrid/d6e25a2e8f83a3089a03bf776f2e1e11 to your computer and use it in GitHub Desktop.
A Linux Bash shell script that searches IMDb for movies by name, downloads all relevant title (movie) pages, then extracts the rating from each page and prints it to the console.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#===============================================================================
# Author:      Asif Ahmed
# Version:     0.2
# Site:        http://quickgrid.blogspot.com
# Description: IMDB Movie Search and Rating Extraction
# Note:        This code is highly dependent on the current page structure
#              (HTML design) of IMDB. If it changes the code will break.
#===============================================================================

#######################################
# Percent-encode a string so it can be used as a URL query value.
# Arguments: $1 - raw text
# Outputs:   the encoded text on stdout
#######################################
urlencode() {
  # Work byte-wise so multi-byte characters are encoded one byte at a time.
  local LC_ALL=C
  local raw=$1 encoded='' ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch=${raw:i:1}
    case "$ch" in
      [a-zA-Z0-9.~_-]) encoded+=$ch ;;
      *)
        # "'X" makes printf use the numeric value of character X.
        printf -v ch '%%%02X' "'$ch"
        encoded+=$ch
        ;;
    esac
  done
  printf '%s\n' "$encoded"
}

# Read the movie name (one line from stdin); -r keeps backslashes literal.
read -r searchText
if [[ -z "$searchText" ]]; then
  printf 'error: no movie name given on standard input\n' >&2
  exit 1
fi

# Download the result page for the specified search.
# The query is percent-encoded so characters such as '&' or '#' in the movie
# name cannot change the meaning of the URL.
if ! wget -O "$searchText-search.html" \
  "http://www.imdb.com/find?q=$(urlencode "$searchText")"; then
  printf 'error: could not download search results for "%s"\n' "$searchText" >&2
  exit 1
fi
# Define the file where the links will be stored
writeLinksFileName="filesToDownload.txt"

#######################################
# Extract the "Titles" section of a downloaded search page and derive the
# title links, movie names and movie years from it.
# Globals:   writeLinksFileName (read) - file that receives the title links
# Arguments: $1 - path of the downloaded search result page
# Outputs:   writes partialContentFile.txt, $writeLinksFileName,
#            movieNames.txt and movieYears.txt in the current directory
# Returns:   0 after extraction, 1 if the page is missing or unreadable
#######################################
extract_search_results() {
  local page=$1 out

  # Start from empty outputs so a failed run never leaves stale data behind.
  for out in "partialContentFile.txt" "$writeLinksFileName" \
    "movieNames.txt" "movieYears.txt"; do
    : > "$out"
  done

  if [[ ! -r "$page" ]]; then
    printf 'error: search page "%s" is missing or unreadable\n' "$page" >&2
    return 1
  fi

  # Keep only the lines from the "Titles" heading through the
  # "findMoreMatches" marker; working on this partial file avoids a
  # link/filename mismatch with other parts of the page.
  sed -e '/Titles<\/h3>/,/findMoreMatches/!d' -- "$page" > "partialContentFile.txt"

  # Get the movie links from the html file
  grep -E -w -o '/title/[a-zA-Z0-9]+/' "partialContentFile.txt" > "$writeLinksFileName"

  # Get the movie names: link text that is followed by "</a> ... (year)"
  grep -P -o '(?<=>)([a-zA-Z0-9&: _-]+)(?=<\/a>[\(\) a-zA-Z0-9 _-]*\([0-9]+\))' \
    "partialContentFile.txt" > "movieNames.txt"

  # Get the movie years: "(digits)" directly after "</a> " and before a space
  grep -P -o '(?<=<\/a> )(\([0-9]+\))(?= )' "partialContentFile.txt" > "movieYears.txt"

  return 0
}

extract_search_results "$searchText-search.html"
#######################################
# Pair each movie name with its year, one "name year" line per movie.
# Pairing stops as soon as either input runs out, so a count mismatch never
# produces a line with a missing name or year.
# Arguments: $1 - file with one movie name per line
#            $2 - file with one movie year per line
#            $3 - output file (truncated first)
#######################################
join_names_years() {
  local names_file=$1 years_file=$2 out_file=$3
  local movieName movieYear

  # Delete contents of the output file
  : > "$out_file"

  # Use different file descriptors to read the two files in lock-step
  while read -r -u3 movieName && read -r -u4 movieYear; do
    printf '%s %s\n' "$movieName" "$movieYear" >> "$out_file"
  done 3< "$names_file" 4< "$years_file"
}

join_names_years "movieNames.txt" "movieYears.txt" "movieNameYear.txt"
#######################################
# Load the "name year" lines into movieNameYear_array, replacing spaces with
# underscores so each entry can be used as a file name.
# Globals:   movieNameYear_array (written) - one entry per input line
#            j (written) - number of entries loaded
# Arguments: $1 - file with one "name year" line per movie
# Returns:   0 on success, 1 if the file is missing or unreadable
#######################################
load_movie_labels() {
  local list_file=$1 line

  movieNameYear_array=()
  j=0

  if [[ ! -r "$list_file" ]]; then
    printf 'error: cannot read "%s"\n' "$list_file" >&2
    return 1
  fi

  # "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    # Replace file name spaces with underscore
    movieNameYear_array[j]=${line// /_}
    j=$(( j + 1 ))
  done < "$list_file"
}

# Read from the file that was written to
load_movie_labels "movieNameYear.txt"
# Download the title page of every movie found by the search.
# The loop fetches only every second line of the links file: the links are
# expected to be listed twice per movie (presumably once per anchor on the
# search page -- confirm against the current IMDB markup).
moviefoldername=movies
# -p: do not fail when the directory is left over from a previous run
mkdir -p -- "$moviefoldername" || exit 1

i=0   # index of the current line in the links file
k=0   # index of the next name/year label to use as a file name
while read -r line; do
  # Temporary fix when file name or file year was not extracted correctly:
  # stop once every extracted label has been used.
  if (( k >= ${j:-0} )); then
    break
  fi

  if (( i % 2 == 0 )); then
    # Each resulting file is downloaded here; the ratings are extracted
    # from these files afterwards.
    wget -O "$moviefoldername/${movieNameYear_array[k]}" "http://www.imdb.com$line"
    k=$(( k + 1 ))
  fi

  i=$(( i + 1 ))
done < "$writeLinksFileName"
#######################################
# Print the rating found in every downloaded movie page.
# Sample rating tag block:
#   <span itemprop="ratingValue">6.4</span></strong>
# Arguments: $1 - directory holding the downloaded movie pages
# Outputs:   for each regular file: "Rating of: <file name>", the extracted
#            rating value(s), then a separator line
# Returns:   1 if the directory does not exist
#######################################
print_ratings() {
  local dir=$1 path fileName

  # An empty or missing directory would otherwise make the glob scan "/".
  if [[ ! -d "$dir" ]]; then
    printf 'error: movie directory "%s" does not exist\n' "$dir" >&2
    return 1
  fi

  # Glob instead of parsing ls, so unusual file names are handled safely.
  for path in "$dir"/*; do
    [[ -f "$path" ]] || continue
    fileName=${path##*/}

    echo "Rating of: $fileName"
    # One or more digits with an optional fraction, so "10.0" is matched too.
    grep -P -o '(?<=<span itemprop="ratingValue">)[0-9]+(?:[.][0-9]+)?(?=</span></strong>)' "$path"
    echo "==================="
  done
}

# Now print the ratings of the files in the movies directory
print_ratings "$moviefoldername"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment