philcryer · October 6, 2010 02:30
diff --git a/oicr b/oicr
 #!/bin/bash
 #
 # oicr: fetch an Internet Archive DjVu book and export every page.
 #
 # Usage: oicr.sh IA_BOOK_TITLE
 #
 # Working in the current directory, the script:
 #   1. creates IA_BOOK_TITLE/{txt,xml,djvu,src};
 #   2. downloads IA_BOOK_TITLE.djvu with wget unless that file already exists;
 #   3. copies it into src/ and runs `djvmcvt -i` to write index.djvu plus the
 #      component files into djvu/;
 #   4. for each djvu/IA_BOOK_TITLE_<page>.djvu writes a `djvused output-all`
 #      dump to txt/ and the `djvutoxml` output to xml/.
 #
 # Exit status: 0 on success; 1 on a usage error, a failed download/copy/split,
 # or when at least one page could not be exported.

 set -u

 # Print a message on stderr and abort with status 1.
 die() {
 	printf '%s\n' "$*" >&2
 	exit 1
 }

 [[ -n "${1:-}" ]] || die "Usage: oicr.sh IA_BOOK_TITLE"

 # sample record ids: catalogueoflepid02briti electronicnaviga00unit halfhoursinfarno00newy nachrichtsblattd3234190012deut

 BOOK=${1}
 #BASEURL=http://cluster.biodiversitylibrary.org
 BASEURL=http://www.archive.org/download
 USER_AGENT="Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"

 mkdir -p -- "${BOOK}/txt" "${BOOK}/xml" "${BOOK}/djvu" "${BOOK}/src" \
 	|| die "cannot create output directories under ${BOOK}"

 if [[ -f "${BOOK}.djvu" ]]; then
 	echo "${BOOK}.djvu found, continuing without download..."
 else
 	# Without the source file every later step would fail, so stop here.
 	wget --user-agent="${USER_AGENT}" \
 		--tries=20 --span-hosts --recursive --level=1 --continue \
 		--no-parent --no-host-directories --reject html --cut-dirs=2 \
 		--execute robots=off \
 		"${BASEURL}/${BOOK}/${BOOK}.djvu" \
 		|| die "download of ${BOOK}.djvu failed"
 fi

 #mv $1.djvu $BOOK/src
 cp -- "${BOOK}.djvu" "${BOOK}/src" \
 	|| die "cannot copy ${BOOK}.djvu to ${BOOK}/src"
 djvmcvt -i "${BOOK}/src/${BOOK}.djvu" "${BOOK}/djvu" index.djvu \
 	|| die "djvmcvt failed for ${BOOK}/src/${BOOK}.djvu"

 # Glob the page files directly instead of parsing `ls` output: splitting the
 # whole path on '_' and '.' yields wrong page ids as soon as the identifier
 # itself contains one of those characters. index.djvu never matches the
 # "<BOOK>_*.djvu" pattern, so it needs no separate filter.
 failures=0
 for page_file in "${BOOK}/djvu/${BOOK}_"*.djvu; do
 	# An unmatched glob is left as the literal pattern; skip it.
 	[[ -f "${page_file}" ]] || continue

 	# Page id = file name without the "<BOOK>_" prefix and ".djvu" suffix.
 	PAGE_NUM=${page_file##*/}
 	PAGE_NUM=${PAGE_NUM#"${BOOK}_"}
 	PAGE_NUM=${PAGE_NUM%.djvu}

 	printf 'writing %s page %s\t\ttxt...' "${BOOK}" "${PAGE_NUM}"
 	if djvused "${page_file}" -e 'output-all' > "${BOOK}/txt/${BOOK}_${PAGE_NUM}.txt"; then
 		printf 'done'
 	else
 		printf 'failed'
 		failures=$((failures + 1))
 	fi

 	printf '\t\txml...'
 	if djvutoxml "${page_file}" > "${BOOK}/xml/${BOOK}_${PAGE_NUM}.xml"; then
 		printf 'done\n'
 	else
 		printf 'failed\n'
 		failures=$((failures + 1))
 	fi
 done

 # Keep going past bad pages, but do not report overall success for them.
 (( failures == 0 )) || die "${failures} page export(s) failed for ${BOOK}"

 echo "done"
 exit 0
	#!/bin/bash
	#
	# oicr: fetch an Internet Archive DjVu book and export every page.
	#
	# Usage: oicr.sh IA_BOOK_TITLE
	#
	# Working in the current directory, the script:
	#   1. creates IA_BOOK_TITLE/{txt,xml,djvu,src};
	#   2. downloads IA_BOOK_TITLE.djvu with wget unless that file already exists;
	#   3. copies it into src/ and runs `djvmcvt -i` to write index.djvu plus the
	#      component files into djvu/;
	#   4. for each djvu/IA_BOOK_TITLE_<page>.djvu writes a `djvused output-all`
	#      dump to txt/ and the `djvutoxml` output to xml/.
	#
	# Exit status: 0 on success; 1 on a usage error, a failed download/copy/split,
	# or when at least one page could not be exported.

	set -u

	# Print a message on stderr and abort with status 1.
	die() {
		printf '%s\n' "$*" >&2
		exit 1
	}

	[[ -n "${1:-}" ]] || die "Usage: oicr.sh IA_BOOK_TITLE"

	# sample record ids: catalogueoflepid02briti electronicnaviga00unit halfhoursinfarno00newy nachrichtsblattd3234190012deut

	BOOK=${1}
	#BASEURL=http://cluster.biodiversitylibrary.org
	BASEURL=http://www.archive.org/download
	USER_AGENT="Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"

	mkdir -p -- "${BOOK}/txt" "${BOOK}/xml" "${BOOK}/djvu" "${BOOK}/src" \
		|| die "cannot create output directories under ${BOOK}"

	if [[ -f "${BOOK}.djvu" ]]; then
		echo "${BOOK}.djvu found, continuing without download..."
	else
		# Without the source file every later step would fail, so stop here.
		wget --user-agent="${USER_AGENT}" \
			--tries=20 --span-hosts --recursive --level=1 --continue \
			--no-parent --no-host-directories --reject html --cut-dirs=2 \
			--execute robots=off \
			"${BASEURL}/${BOOK}/${BOOK}.djvu" \
			|| die "download of ${BOOK}.djvu failed"
	fi

	#mv $1.djvu $BOOK/src
	cp -- "${BOOK}.djvu" "${BOOK}/src" \
		|| die "cannot copy ${BOOK}.djvu to ${BOOK}/src"
	djvmcvt -i "${BOOK}/src/${BOOK}.djvu" "${BOOK}/djvu" index.djvu \
		|| die "djvmcvt failed for ${BOOK}/src/${BOOK}.djvu"

	# Glob the page files directly instead of parsing `ls` output: splitting the
	# whole path on '_' and '.' yields wrong page ids as soon as the identifier
	# itself contains one of those characters. index.djvu never matches the
	# "<BOOK>_*.djvu" pattern, so it needs no separate filter.
	failures=0
	for page_file in "${BOOK}/djvu/${BOOK}_"*.djvu; do
		# An unmatched glob is left as the literal pattern; skip it.
		[[ -f "${page_file}" ]] || continue

		# Page id = file name without the "<BOOK>_" prefix and ".djvu" suffix.
		PAGE_NUM=${page_file##*/}
		PAGE_NUM=${PAGE_NUM#"${BOOK}_"}
		PAGE_NUM=${PAGE_NUM%.djvu}

		printf 'writing %s page %s txt...' "${BOOK}" "${PAGE_NUM}"
		if djvused "${page_file}" -e 'output-all' > "${BOOK}/txt/${BOOK}_${PAGE_NUM}.txt"; then
			printf 'done'
		else
			printf 'failed'
			failures=$((failures + 1))
		fi

		printf ' xml...'
		if djvutoxml "${page_file}" > "${BOOK}/xml/${BOOK}_${PAGE_NUM}.xml"; then
			printf 'done\n'
		else
			printf 'failed\n'
			failures=$((failures + 1))
		fi
	done

	# Keep going past bad pages, but do not report overall success for them.
	(( failures == 0 )) || die "${failures} page export(s) failed for ${BOOK}"

	echo "done"
	exit 0
No results found