Teino1978-Corp · October 28, 2015 06:18
diff --git a/getter_media.sh b/getter_media.sh
 #!/bin/bash
 #script for scraping movie IDs and name ids from IMDb. This script recusively crawls IMDb pages and extracts related movie/people/character ids and stores them in txt files.
 #USAGE:	-getid  => gets movie ids into list_(imdb|name|character)_id.txt
 #		-getinfo=> gets data(XML/JSON/JSONP) and puts in store_data
 #		-print  => prints current id/info status
 #
 #Added functionality to get data from omdbapi in JSON/XML format. Tweak custom URL according to need.
 #Added functionality to get data from myapifilms in JSON/XML/JSONP format. Tweak custom URL according to need.
 #CAUTION: DON'T RUN -getid and -getinfo together in same argument rather use separate windows!!
 ########################################################################
 #CONFIG
 ########################################################################
 #get rootdir - set all paths in releavance to this dir
 filepath=`readlink -f $0`;
 rootdir=${filepath%/*};
 #echo $rootdir;

 #text file locations
 list_imdb_id="$rootdir/list_imdb_id.txt"; #media ids
 list_name_id="$rootdir/list_name_id.txt"; #people ids
 list_char_id="$rootdir/list_char_id.txt"; #character ids

 #data storage - JSON/XML
 store_format="JSON";
 store_data="$rootdir/data";

 query_imdb="www.imdb.com/";
 query_omdbapi="http://www.omdbapi.com/?r=$store_format&plot=full&tomatoes=true"; #&i=id #all info together #searches only by id/title
 query_myapifilms="http://www.myapifilms.com/imdb?actors=F&actorTrivia=0&format=$store_format&aka=1&business=1&filmography=0&movieTrivia=1&technical=1&seasons=1&trailer=1&uniqueName=1";

 ########################################################################
 #Existance Checks
 ########################################################################
 if [ -f $list_imdb_id ];
 then true; #echo "Found $list_imdb_id";
 else
 	echo "Creating $list_imdb_id";
 	touch $list_imdb_id;
 	#fill $list_imdb_id with some movie ids to start - wget imdb main page and add as many found
 	wget "www.imdb.com" -O "$rootdir/index.html";
 	cat "$rootdir/index.html" | grep -o -P "tt[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" > "$list_imdb_id"
 	rm "$rootdir/index.html";
 fi;

 if [ -f $list_name_id ];
 then true; #echo "Found $list_name_id";
 else
 	echo "Creating $list_name_id";
 	touch $list_name_id;
 	#fill $list_name_id with some name ids to start - wget imdb main page and add as many found
 	wget "www.imdb.com" -O "$rootdir/index.html";
 	cat "$rootdir/index.html" | grep -o -P "nm[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" > "$list_name_id"
 	rm "$rootdir/index.html";
 fi;

 if [ -f $list_char_id ];
 then true; #echo "Found $list_char_id";
 else
 	echo "Creating $list_char_id";
 	touch $list_char_id;
 	#fill $list_char_id with some character ids to start - wget imdb main page and add as many found
 	wget "www.imdb.com" -O "$rootdir/index.html";
 	cat "$rootdir/index.html" | grep -o -P "ch[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" > "$list_char_id";
 	rm "$rootdir/index.html";
 fi;

 if [ -d $store_data ];
 then true;	#echo "Found $store_data";
 else
 	echo "Creating $store_data";
 	mkdir $store_data;
 fi;

 #echo "Ready to run! ARGS: -getid/-getinfo/-print";
 #echo "You selected $1";

 ########################################################################
 #ARGUMENTS
 ########################################################################

 case "$1" in
 ########################################################################
 '-getid')
 	#selecting queryurl and list of ids to query
 	if [ "$2" = "title" ]; #search by titles
 	then
 		queryurl=$query_imdb"title/";
 		list_ids=$list_imdb_id;
 	elif [ "$2" = "people" ]; #search by people
 	then
 		queryurl=$query_imdb"name/";
 		list_ids=$list_name_id;
 	elif [ "$2" = "character" ]; #search by characters
 	then
 		queryurl=$query_imdb"character/";
 		list_ids=$list_char_id;
 	else 					#default = Invalid/none
 		queryurl=$query_imdb"title/";
 		list_ids=$list_imdb_id;
 	fi

 	while read line;
 	do
 		#displayinfo
 		wc -l $list_imdb_id; #KEEP TRACK HOW MANY MOVIE IDS LISTED TILL NOW.
 		wc -l $list_name_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
 		wc -l $list_char_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
 		echo "Total data:"; ls $store_data | wc -l;

 		#get some movie ids from list_imdb_id and search those pages for more, put new uniq ones back.

 		echo "Getting ids from IMDb.com...";
 		echo "QUERYURL: $queryurl$line";

 		#getting the IMDb page for id in $line from listid.txt
 		wget "$queryurl$line" --user-agent="User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" -O "$rootdir/index.html";
 		#getting * relevant ids from the page
 		cat "$rootdir/index.html" | grep -o -P "tt[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" >> temp_imdb_id; #temp holds found ids
 		cat "$rootdir/index.html" | grep -o -P "nm[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" >> temp_name_id; #temp holds found ids
 		cat "$rootdir/index.html" | grep -o -P "ch[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" >> temp_char_id; #temp holds found ids

 		#sorting/cleaning= only keeping uniq ids
 		cat "$list_imdb_id" >> temp_imdb_id; #adding list to temp and uniq-fying
 		sort temp_imdb_id | uniq > "$list_imdb_id";

 		cat "$list_name_id" >> temp_name_id;
 		sort temp_name_id | uniq > "$list_name_id";

 		cat "$list_char_id" >> temp_char_id;
 		sort temp_char_id | uniq > "$list_char_id";

 		#remove temps
 		rm temp_*;

 		#sleep random 0-30secs #reduces server load
 		sleeptime=$RANDOM;
 		let "sleeptime %=30";
 		echo "Sleeping for "$sleeptime;
 		sleep $sleeptime;

 		echo "\n\n"
 		clear;
 		#cleanup
 		rm "$rootdir/index.html"; #[this file is html from IMDb, not relevant]
 	done < "$list_ids";
 ;;
 ########################################################################
 '-getinfo')
 	#gets info by imdb_ids
 	#get movie-id from $list_imdb_id, get info for that id from api
 	while read line;
 	do
 		#queryurl=$query_omdbapi"&i=$line";
 		queryurl=$query_myapifilms"&idIMDB=$line";
 		##displayinfo
 		wc -l $list_imdb_id; #KEEP TRACK HOW MANY MOVIE IDS LISTED TILL NOW.
 		#wc -l $list_name_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
 		#wc -l $list_char_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
 		echo "Total data:"; ls $store_data | wc -l;

 		#check if file exists @XMLholder
 		if [ -f "$store_data/$line.$store_format" ];
 		then
 			echo "File:$line.$store_format already Exists!\n\n";
 		else
 			echo "Getting info from omdbapi.com...";
 			echo "QUERYURL: $queryurl";
 			#get file from imdbapi.org
 			wget --user-agent="User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" $queryurl -O "$store_data/$line.$store_format"; #custom=>imdbapi-doc
 			echo "File:$line.$store_format saved.";

 			#sleep random 0-30secs #reduces server load
 			sleeptime=$RANDOM;
 			let "sleeptime %=30";
 			echo "Sleeping for "$sleeptime
 			sleep $sleeptime;
 		fi
 		echo "\n\n"
 		clear;
 	done < "$list_imdb_id";
 ;;
 ########################################################################
 '-print')
 	#displayinfo
 	wc -l $list_imdb_id #KEEP TRACK HOW MANY MOVIE IDS LISTED TILL NOW.
 	wc -l $list_name_id #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
 	wc -l $list_char_id #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
 	echo "Total data:"; ls $store_data | wc -l
 ;;
 ########################################################################
 *)
 	echo "None/Invalid argument. Please check usage."
 ;;
 ########################################################################
 esac
	#!/bin/bash
	#script for scraping movie IDs and name ids from IMDb. This script recusively crawls IMDb pages and extracts related movie/people/character ids and stores them in txt files.
	#USAGE: -getid => gets movie ids into list_(imdb\|name\|character)_id.txt
	# -getinfo=> gets data(XML/JSON/JSONP) and puts in store_data
	# -print => prints current id/info status
	#
	#Added functionality to get data from omdbapi in JSON/XML format. Tweak custom URL according to need.
	#Added functionality to get data from myapifilms in JSON/XML/JSONP format. Tweak custom URL according to need.
	#CAUTION: DON'T RUN -getid and -getinfo together in same argument rather use separate windows!!
	########################################################################
	#CONFIG
	########################################################################
	#get rootdir - set all paths in releavance to this dir
	filepath=`readlink -f $0`;
	rootdir=${filepath%/*};
	#echo $rootdir;

	#text file locations
	list_imdb_id="$rootdir/list_imdb_id.txt"; #media ids
	list_name_id="$rootdir/list_name_id.txt"; #people ids
	list_char_id="$rootdir/list_char_id.txt"; #character ids

	#data storage - JSON/XML
	store_format="JSON";
	store_data="$rootdir/data";

	query_imdb="www.imdb.com/";
	query_omdbapi="http://www.omdbapi.com/?r=$store_format&plot=full&tomatoes=true"; #&i=id #all info together #searches only by id/title
	query_myapifilms="http://www.myapifilms.com/imdb?actors=F&actorTrivia=0&format=$store_format&aka=1&business=1&filmography=0&movieTrivia=1&technical=1&seasons=1&trailer=1&uniqueName=1";

	########################################################################
	#Existance Checks
	########################################################################
	if [ -f $list_imdb_id ];
	then true; #echo "Found $list_imdb_id";
	else
	echo "Creating $list_imdb_id";
	touch $list_imdb_id;
	#fill $list_imdb_id with some movie ids to start - wget imdb main page and add as many found
	wget "www.imdb.com" -O "$rootdir/index.html";
	cat "$rootdir/index.html" \| grep -o -P "tt[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" > "$list_imdb_id"
	rm "$rootdir/index.html";
	fi;

	if [ -f $list_name_id ];
	then true; #echo "Found $list_name_id";
	else
	echo "Creating $list_name_id";
	touch $list_name_id;
	#fill $list_name_id with some name ids to start - wget imdb main page and add as many found
	wget "www.imdb.com" -O "$rootdir/index.html";
	cat "$rootdir/index.html" \| grep -o -P "nm[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" > "$list_name_id"
	rm "$rootdir/index.html";
	fi;

	if [ -f $list_char_id ];
	then true; #echo "Found $list_char_id";
	else
	echo "Creating $list_char_id";
	touch $list_char_id;
	#fill $list_char_id with some character ids to start - wget imdb main page and add as many found
	wget "www.imdb.com" -O "$rootdir/index.html";
	cat "$rootdir/index.html" \| grep -o -P "ch[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" > "$list_char_id";
	rm "$rootdir/index.html";
	fi;

	if [ -d $store_data ];
	then true; #echo "Found $store_data";
	else
	echo "Creating $store_data";
	mkdir $store_data;
	fi;

	#echo "Ready to run! ARGS: -getid/-getinfo/-print";
	#echo "You selected $1";

	########################################################################
	#ARGUMENTS
	########################################################################

	case "$1" in
	########################################################################
	'-getid')
	#selecting queryurl and list of ids to query
	if [ "$2" = "title" ]; #search by titles
	then
	queryurl=$query_imdb"title/";
	list_ids=$list_imdb_id;
	elif [ "$2" = "people" ]; #search by people
	then
	queryurl=$query_imdb"name/";
	list_ids=$list_name_id;
	elif [ "$2" = "character" ]; #search by characters
	then
	queryurl=$query_imdb"character/";
	list_ids=$list_char_id;
	else #default = Invalid/none
	queryurl=$query_imdb"title/";
	list_ids=$list_imdb_id;
	fi

	while read line;
	do
	#displayinfo
	wc -l $list_imdb_id; #KEEP TRACK HOW MANY MOVIE IDS LISTED TILL NOW.
	wc -l $list_name_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
	wc -l $list_char_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
	echo "Total data:"; ls $store_data \| wc -l;

	#get some movie ids from list_imdb_id and search those pages for more, put new uniq ones back.

	echo "Getting ids from IMDb.com...";
	echo "QUERYURL: $queryurl$line";

	#getting the IMDb page for id in $line from listid.txt
	wget "$queryurl$line" --user-agent="User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" -O "$rootdir/index.html";
	#getting * relevant ids from the page
	cat "$rootdir/index.html" \| grep -o -P "tt[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" >> temp_imdb_id; #temp holds found ids
	cat "$rootdir/index.html" \| grep -o -P "nm[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" >> temp_name_id; #temp holds found ids
	cat "$rootdir/index.html" \| grep -o -P "ch[0123456789][0123456789][0123456789][0123456789][0123456789][0123456789][0123456789]" >> temp_char_id; #temp holds found ids

	#sorting/cleaning= only keeping uniq ids
	cat "$list_imdb_id" >> temp_imdb_id; #adding list to temp and uniq-fying
	sort temp_imdb_id \| uniq > "$list_imdb_id";

	cat "$list_name_id" >> temp_name_id;
	sort temp_name_id \| uniq > "$list_name_id";

	cat "$list_char_id" >> temp_char_id;
	sort temp_char_id \| uniq > "$list_char_id";

	#remove temps
	rm temp_*;

	#sleep random 0-30secs #reduces server load
	sleeptime=$RANDOM;
	let "sleeptime %=30";
	echo "Sleeping for "$sleeptime;
	sleep $sleeptime;

	echo "\n\n"
	clear;
	#cleanup
	rm "$rootdir/index.html"; #[this file is html from IMDb, not relevant]
	done < "$list_ids";
	;;
	########################################################################
	'-getinfo')
	#gets info by imdb_ids
	#get movie-id from $list_imdb_id, get info for that id from api
	while read line;
	do
	#queryurl=$query_omdbapi"&i=$line";
	queryurl=$query_myapifilms"&idIMDB=$line";
	##displayinfo
	wc -l $list_imdb_id; #KEEP TRACK HOW MANY MOVIE IDS LISTED TILL NOW.
	#wc -l $list_name_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
	#wc -l $list_char_id; #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
	echo "Total data:"; ls $store_data \| wc -l;

	#check if file exists @XMLholder
	if [ -f "$store_data/$line.$store_format" ];
	then
	echo "File:$line.$store_format already Exists!\n\n";
	else
	echo "Getting info from omdbapi.com...";
	echo "QUERYURL: $queryurl";
	#get file from imdbapi.org
	wget --user-agent="User-Agent: Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0" $queryurl -O "$store_data/$line.$store_format"; #custom=>imdbapi-doc
	echo "File:$line.$store_format saved.";

	#sleep random 0-30secs #reduces server load
	sleeptime=$RANDOM;
	let "sleeptime %=30";
	echo "Sleeping for "$sleeptime
	sleep $sleeptime;
	fi
	echo "\n\n"
	clear;
	done < "$list_imdb_id";
	;;
	########################################################################
	'-print')
	#displayinfo
	wc -l $list_imdb_id #KEEP TRACK HOW MANY MOVIE IDS LISTED TILL NOW.
	wc -l $list_name_id #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
	wc -l $list_char_id #KEEP TRACK HOW MANY people IDS LISTED TILL NOW.
	echo "Total data:"; ls $store_data \| wc -l
	;;
	########################################################################
	*)
	echo "None/Invalid argument. Please check usage."
	;;
	########################################################################
	esac