philcryer · February 25, 2011 16:00
diff --git a/oicr.sh b/oicr.sh
 #!/bin/bash
 #
 # oicr.sh - fetch a DjVu book by record id and dump per-page text and XML.
 #
 # Usage: oicr.sh IA_BOOK_TITLE
 #
 # Steps:
 #   1. Use ./<id>.djvu if it already exists, otherwise download it with wget
 #      from $BASEURL/<id>/<id>.djvu.
 #   2. Copy it to <id>/src and run djvmcvt -i on it, which writes the
 #      document's component files plus index.djvu into <id>/djvu.
 #   3. For every component file except index.djvu, write the djvused
 #      'output-all' dump to <id>/ocr/<page>.txt and the djvutoxml output to
 #      <id>/xml/<page>.xml.
 #
 # Exit status: 0 on success, 2 on a usage error, 1 when any step failed.

 # A missing argument is an error: report on stderr and exit non-zero so
 # callers can detect the misuse.
 if [[ -z "$1" ]]; then
 	echo "Usage: oicr.sh IA_BOOK_TITLE" >&2
 	exit 2
 fi

 # sample record ids: catalogueoflepid02briti electronicnaviga00unit halfhoursinfarno00newy nachrichtsblattd3234190012deut

 BOOK=$1
 BASEURL=http://cluster.biodiversitylibrary.org/n
 #BASEURL=http://www.archive.org/download

 mkdir -p -- "$BOOK/ocr" "$BOOK/xml" "$BOOK/djvu" "$BOOK/src" || exit 1

 if [[ -f "$BOOK.djvu" ]]; then
 	echo "$BOOK.djvu found, contining without download..."
 else
 	# --cut-dirs=2 together with --no-host-directories drops the two leading
 	# URL path components, so the file is saved as ./<id>.djvu.
 	if ! wget --user-agent="Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)" --tries=20 --span-hosts --recursive --level=1 --continue --no-parent --no-host-directories --reject html --cut-dirs=2 --execute robots=off "$BASEURL/$BOOK/$BOOK.djvu"; then
 		echo "oicr.sh: download of $BASEURL/$BOOK/$BOOK.djvu failed" >&2
 		exit 1
 	fi
 fi

 cp -- "$BOOK.djvu" "$BOOK/src" || exit 1
 if ! djvmcvt -i "$BOOK/src/$BOOK.djvu" "$BOOK/djvu" index.djvu; then
 	echo "oicr.sh: djvmcvt failed for $BOOK/src/$BOOK.djvu" >&2
 	exit 1
 fi

 # Iterate over the glob instead of parsing ls output: the page name is taken
 # from the file name itself, so record ids containing '_' (or ending in
 # "index") are handled correctly.
 status=0
 for page_file in "$BOOK"/djvu/*.djvu; do
 	# An unmatched glob stays literal; skip it.
 	[[ -e "$page_file" ]] || continue

 	page_name=${page_file##*/}
 	page_name=${page_name%.djvu}
 	# index.djvu is the document index written by djvmcvt, not a page.
 	[[ "$page_name" == index ]] && continue

 	printf 'writing %s\tocr...' "$page_name"
 	if ! djvused "$page_file" -e 'output-all' > "$BOOK/ocr/$page_name.txt"; then
 		echo "oicr.sh: djvused failed for $page_file" >&2
 		status=1
 	fi
 	printf 'done\t\txml...'
 	if ! djvutoxml "$page_file" > "$BOOK/xml/$page_name.xml"; then
 		echo "oicr.sh: djvutoxml failed for $page_file" >&2
 		status=1
 	fi
 done

 echo "done"
 # Non-zero when at least one page could not be converted.
 exit "$status"
	#!/bin/bash
	#
	# oicr.sh - fetch a DjVu book by record id and dump per-page text and XML.
	#
	# Usage: oicr.sh IA_BOOK_TITLE
	#
	# Steps:
	#   1. Use ./<id>.djvu if it already exists, otherwise download it with wget
	#      from $BASEURL/<id>/<id>.djvu.
	#   2. Copy it to <id>/src and run djvmcvt -i on it, which writes the
	#      document's component files plus index.djvu into <id>/djvu.
	#   3. For every component file except index.djvu, write the djvused
	#      'output-all' dump to <id>/ocr/<page>.txt and the djvutoxml output to
	#      <id>/xml/<page>.xml.
	#
	# Exit status: 0 on success, 2 on a usage error, 1 when any step failed.

	# A missing argument is an error: report on stderr and exit non-zero so
	# callers can detect the misuse.
	if [[ -z "$1" ]]; then
		echo "Usage: oicr.sh IA_BOOK_TITLE" >&2
		exit 2
	fi

	# sample record ids: catalogueoflepid02briti electronicnaviga00unit halfhoursinfarno00newy nachrichtsblattd3234190012deut

	BOOK=$1
	BASEURL=http://cluster.biodiversitylibrary.org/n
	#BASEURL=http://www.archive.org/download

	mkdir -p -- "$BOOK/ocr" "$BOOK/xml" "$BOOK/djvu" "$BOOK/src" || exit 1

	if [[ -f "$BOOK.djvu" ]]; then
		echo "$BOOK.djvu found, contining without download..."
	else
		# --cut-dirs=2 together with --no-host-directories drops the two leading
		# URL path components, so the file is saved as ./<id>.djvu.
		if ! wget --user-agent="Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)" --tries=20 --span-hosts --recursive --level=1 --continue --no-parent --no-host-directories --reject html --cut-dirs=2 --execute robots=off "$BASEURL/$BOOK/$BOOK.djvu"; then
			echo "oicr.sh: download of $BASEURL/$BOOK/$BOOK.djvu failed" >&2
			exit 1
		fi
	fi

	cp -- "$BOOK.djvu" "$BOOK/src" || exit 1
	if ! djvmcvt -i "$BOOK/src/$BOOK.djvu" "$BOOK/djvu" index.djvu; then
		echo "oicr.sh: djvmcvt failed for $BOOK/src/$BOOK.djvu" >&2
		exit 1
	fi

	# Iterate over the glob instead of parsing ls output: the page name is taken
	# from the file name itself, so record ids containing '_' (or ending in
	# "index") are handled correctly.
	status=0
	for page_file in "$BOOK"/djvu/*.djvu; do
		# An unmatched glob stays literal; skip it.
		[[ -e "$page_file" ]] || continue

		page_name=${page_file##*/}
		page_name=${page_name%.djvu}
		# index.djvu is the document index written by djvmcvt, not a page.
		[[ "$page_name" == index ]] && continue

		printf 'writing %s ocr...' "$page_name"
		if ! djvused "$page_file" -e 'output-all' > "$BOOK/ocr/$page_name.txt"; then
			echo "oicr.sh: djvused failed for $page_file" >&2
			status=1
		fi
		printf 'done xml...'
		if ! djvutoxml "$page_file" > "$BOOK/xml/$page_name.xml"; then
			echo "oicr.sh: djvutoxml failed for $page_file" >&2
			status=1
		fi
	done

	echo "done"
	# Non-zero when at least one page could not be converted.
	exit "$status"
No results found