kanaka · December 30, 2015 12:18
diff --git a/tokenizer.sh b/tokenizer.sh
 #!/bin/bash
 #
 # tokenizer.sh - split Lisp-style source text into tokens, one per line.
 #
 # Usage: tokenizer.sh [FILE]
 #   Reads FILE (standard input when no operand is given) and prints every
 #   token on its own line. A token is one of:
 #     - a double-quoted string (no escape handling; at least one character
 #       is required between the quotes, so "" is reported as an error),
 #     - a single "(" or ")",
 #     - a run of characters that are not whitespace, quotes or parentheses.
 #   Whitespace between tokens is discarded.
 #
 # Exit status: 0 on success, 1 when the input cannot be read, 2 when the
 # text at the current position matches no token; in that case up to 50
 # characters of the offending text are reported on stderr.

 # Print the tokens found in the string $1, one per line.
 # Calls `exit 2` on unmatched text; run it in a subshell to trap that.
 tokenize() {
     local -r text=$1
     local -r text_len=${#text}
     # The input is consumed through a short working buffer, refilled
     # chunk_size characters at a time, so the regex match and the substring
     # trimming below never operate on the whole input at once.
     local -r chunk_size=500
     # Kept in a variable so that no shell quoting leaks into the regex.
     local -r token_re='^("[^"]+")|^([()])|^([^[:space:]"()]+)|^[[:space:]]+'
     local buf='' match
     local -i offset=0

     while (( ${#buf} > 0 || offset < text_len )); do
         match=''
         [[ $buf =~ $token_re ]] && match=${BASH_REMATCH[0]}

         # No match, or a match that reaches the end of the buffer, may just
         # be a token cut in half by the chunk boundary: pull in more input
         # and try again before emitting anything or declaring an error.
         if (( offset < text_len && (${#match} == 0 || ${#match} == ${#buf}) )); then
             buf+=${text:offset:chunk_size}
             (( offset += chunk_size ))
             continue
         fi

         if [[ -z $match ]]; then
             printf 'Error at: %s\n' "${buf:0:50}" >&2
             exit 2
         fi

         buf=${buf:${#match}}
         # Whitespace-only matches are separators, not tokens. printf is used
         # because echo would swallow tokens such as -n or -e.
         if [[ $match == *[![:space:]]* ]]; then
             printf '%s\n' "$match"
         fi
     done
 }

 # "${@:1:1}" expands to the first operand when there is one and to nothing
 # otherwise, so cat still falls back to standard input without an operand,
 # while file names containing spaces or glob characters stay intact.
 wholefile=$(cat -- "${@:1:1}") || exit 1
 tokenize "${wholefile}"

 # much faster (GNU sed; wraps every match, whitespace runs included, in <...>):
 # sed 's/\("[^"][^"]*"\)\|\([()]\)\|\([^ "()][^ "()]*\)\|  */<\1.\2.\3>\n/g' "$1"
	#!/bin/bash
	#
	# tokenizer.sh - print the tokens of Lisp-style source text, one per line.
	#
	# Usage: tokenizer.sh [FILE]
	#   FILE is read in full (standard input is used when no operand is given).
	#   Recognised tokens:
	#     - double-quoted strings, without escape handling and with at least
	#       one character between the quotes ("" is reported as an error),
	#     - "(" and ")" as single-character tokens,
	#     - runs of characters other than whitespace, quotes and parentheses.
	#   Whitespace only separates tokens and is never printed.
	#
	# Exit status: 0 on success, 1 if the input cannot be read, 2 if the text
	# at the current position matches no token (up to 50 characters of it are
	# printed to stderr after "Error at: ").

	# Print the tokens found in the string $1, one per line.
	# Calls `exit 2` on unmatched text; run it in a subshell to trap that.
	tokenize() {
		local -r text=$1
		local -r text_len=${#text}
		# Input is fed through a short working buffer, chunk_size characters
		# at a time, so matching and trimming never touch the whole input.
		local -r chunk_size=500
		# The alternation bars must be plain "|": inside [[ =~ ]] a
		# backslash-escaped "\|" is matched as a literal pipe character, which
		# makes the pattern fail on ordinary input. Holding the regex in a
		# single-quoted variable keeps shell quoting out of it altogether.
		local -r token_re='^("[^"]+")|^([()])|^([^[:space:]"()]+)|^[[:space:]]+'
		local buf='' match
		local -i offset=0

		while (( ${#buf} > 0 || offset < text_len )); do
			match=''
			[[ $buf =~ $token_re ]] && match=${BASH_REMATCH[0]}

			# A missing match, or one that runs to the end of the buffer, can
			# be a token split by the chunk boundary; fetch more input and
			# retry before printing it or giving up.
			if (( offset < text_len && (${#match} == 0 || ${#match} == ${#buf}) )); then
				buf+=${text:offset:chunk_size}
				(( offset += chunk_size ))
				continue
			fi

			if [[ -z $match ]]; then
				printf 'Error at: %s\n' "${buf:0:50}" >&2
				exit 2
			fi

			buf=${buf:${#match}}
			# Skip pure-whitespace matches; printf (not echo) so that tokens
			# like -n or -e are printed verbatim.
			if [[ $match == *[![:space:]]* ]]; then
				printf '%s\n' "$match"
			fi
		done
	}

	# "${@:1:1}" is the first operand if present and nothing otherwise: cat
	# reads standard input when no file is named, and a file name containing
	# spaces or glob characters is passed through unchanged.
	wholefile=$(cat -- "${@:1:1}") || exit 1
	tokenize "${wholefile}"

	# much faster (GNU sed; wraps every match, whitespace runs included, in <...>):
	# sed 's/\("[^"][^"]*"\)\|\([()]\)\|\([^ "()][^ "()]*\)\|  */<\1.\2.\3>\n/g' "$1"
No results found