stratawing · February 13, 2012 21:27
diff --git a/XLSX to ES b/XLSX to ES

 unzip -Uo 10k_records.xlsx xl/worksheets/sheet1.xml xl/sharedStrings.xml -d proc
 	cat proc/xl/sharedStrings.xml \
 	| sed -e 's/[^[:print:][:blank:][:space:]]/[?]/g' -e 's/|/-delim-/g' \
 	| mawk 'BEGIN {RS = "</t></si><si><t>|<si><t>|</t></si></sst>|</t></si><si><t xml:space=\"preserve\">"
 			FS = "999ABCDE"
                        OFS = "|"}
                        { print NR - 2, $1 }' \
 	| LANG=C sort -f -t "|" -k1 \
 	| grep -v "<sst xmlns=\|<?xml version" \
 	> keys
 	cat proc/xl/worksheets/sheet1.xml \
 	| mawk 'BEGIN {RS = "<c r=\""
 			FS = "><v>|</v></c>|\" s=\""
 			}
 			{
 			col = substr($1,1,1)
 			row = substr($1,2)
 			if($3 == "")
 				print $3, $1, col, row > "garbage"
 			else
 			if($2 ~ /.*t=\"s\".*/)
 			 	printf "%s|%s|%s\n", $3, row, col  > "strings"
 			else
 			if(col ~ /[ABCEF]/)
 			 	printf "%s|%s|%s\n", row, col, $3  > "nonstrings"
 			else
 			if(col ~ /[DGHI]/)
 			     	printf "%s|%s|%.6g\n", row, col, $3 > "nonstrings"
 			else print $3, $1, col, row > "garbage"
 			}
 		END {
 		     close("garbage")
 		     close("nonstrings")
 		     close("strings")
 		}'
 sort nonstrings > nssort
 LANG=C sort -f -t "|" -k1 strings \
 	| LANG=C join -i -e 'missing' -t '|' -1 1 -2 1 - keys \
 	| cut -d'|' -f2,3,4 \
 	| sort \
 	| sort -m - nssort \
 	> finout
 sort -g finout -k1,1n \
 	| tee finsort \
 	| cut -d'|' -f1 \
 	| sed '1d' \
 	> sortout
 paste -d'|' sortout finsort \
 	| mawk '{FS = "|"
 			if($1 != $2)
 			 	printf "%s|%s|%s\n-EOR-\n", $2, $3, $4 
 			else
 			 	printf "%s|%s|%s\n", $2, $3, $4
 		}' \
 	| tr '\n' '|' \
 	| sed -e 's/-EOR-/\n/g' \
 	| mawk '{FS = "|"
 		if(NR > 2 && $0 ~ /[^|.$]/)
 			printf "{ \"index\" : { \"_index\" : \"cerberus\", \"_type\" : \"results\"}}\n {\"upload_record_id\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\"}\n",  $2,$3,$4,$6,$7,$9,$10,$12,$13,$15,$16,$18,$19,$21,$22,$24,$25,$27,$28
 		next
 		}' \
 	| sed -e 's/,\"\":\"\"//g' \
 	| curl -XPUT 'http://localhost:9200/_bulk' -o bulk_out --data-binary @-
 rm -r proc finout nssort garbage nonstrings strings keys finsort sortout

	unzip -Uo 10k_records.xlsx xl/worksheets/sheet1.xml xl/sharedStrings.xml -d proc
	cat proc/xl/sharedStrings.xml \
	\| sed -e 's/[^[:print:][:blank:][:space:]]/[?]/g' -e 's/\|/-delim-/g' \
	\| mawk 'BEGIN {RS = "</t></si><si><t>\|<si><t>\|</t></si></sst>\|</t></si><si><t xml:space=\"preserve\">"
	FS = "999ABCDE"
	OFS = "\|"}
	{ print NR - 2, $1 }' \
	\| LANG=C sort -f -t "\|" -k1 \
	\| grep -v "<sst xmlns=\\|<?xml version" \
	> keys
	cat proc/xl/worksheets/sheet1.xml \
	\| mawk 'BEGIN {RS = "<c r=\""
	FS = "><v>\|</v></c>\|\" s=\""
	}
	{
	col = substr($1,1,1)
	row = substr($1,2)
	if($3 == "")
	print $3, $1, col, row > "garbage"
	else
	if($2 ~ /.t=\"s\"./)
	printf "%s\|%s\|%s\n", $3, row, col > "strings"
	else
	if(col ~ /[ABCEF]/)
	printf "%s\|%s\|%s\n", row, col, $3 > "nonstrings"
	else
	if(col ~ /[DGHI]/)
	printf "%s\|%s\|%.6g\n", row, col, $3 > "nonstrings"
	else print $3, $1, col, row > "garbage"
	}
	END {
	close("garbage")
	close("nonstrings")
	close("strings")
	}'
	sort nonstrings > nssort
	LANG=C sort -f -t "\|" -k1 strings \
	\| LANG=C join -i -e 'missing' -t '\|' -1 1 -2 1 - keys \
	\| cut -d'\|' -f2,3,4 \
	\| sort \
	\| sort -m - nssort \
	> finout
	sort -g finout -k1,1n \
	\| tee finsort \
	\| cut -d'\|' -f1 \
	\| sed '1d' \
	> sortout
	paste -d'\|' sortout finsort \
	\| mawk '{FS = "\|"
	if($1 != $2)
	printf "%s\|%s\|%s\n-EOR-\n", $2, $3, $4
	else
	printf "%s\|%s\|%s\n", $2, $3, $4
	}' \
	\| tr '\n' '\|' \
	\| sed -e 's/-EOR-/\n/g' \
	\| mawk '{FS = "\|"
	if(NR > 2 && $0 ~ /[^\|.$]/)
	printf "{ \"index\" : { \"_index\" : \"cerberus\", \"_type\" : \"results\"}}\n {\"upload_record_id\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\",\"%s\":\"%s\"}\n", $2,$3,$4,$6,$7,$9,$10,$12,$13,$15,$16,$18,$19,$21,$22,$24,$25,$27,$28
	next
	}' \
	\| sed -e 's/,\"\":\"\"//g' \
	\| curl -XPUT 'http://localhost:9200/_bulk' -o bulk_out --data-binary @-
	rm -r proc finout nssort garbage nonstrings strings keys finsort sortout
No results found