greggirwin · July 29, 2017 20:16
diff --git a/rawk.r b/rawk.r
 REBOL [
    Title:      "AWK in REBOL"
    File:       %rawk.r
    Author:     "Gregg Irwin"
    Version:    0.0.8
    Comment: {
        Inspired by AWK. The core design is very appealing to me: a data
        driven app that processes text files automatically iterating over
        files and lines, and splitting lines up into fields. I don't care
        for much of the AWK syntax, though, so I'm not shooting for
        compatibility with that.
    }
    History: [
        0.0.1 [12-sep-2001 {Initial release}]
        0.0.2 [22-mar-2002
            {Added try block around file reading code in case a file
            is bad, or not a file (e.g. a directory).}
            {Added next-line, next-file, and exit routines.}
            {__ _ _fnr and _filename are no longer cleared when exec
            is complete. Now that you can exit, you might want to
            interrogate them to see what the final results were.}
        ]
        0.0.3 [10-Apr-2002
            {The exec 'files parameter has been renamed to 'sources.}
            {The sources block may now contain file!, string!, or block!
            values. Strings will be parsed and blocks iterated. If a
            source is not a file!, _filename will be the type of source.}
        ]
        0.0.4 [7-May-2002
            {Added LINE and FIELDS words which map to __ and _ respectively.}
        ]
        0.0.5 [14-Oct-2006
            {Major update. Uses file-list now, eliminated old 'lib model too.}
            {Changed names, so the public function is now 'rawk.}
            {Sources now needs to be either a file-list spec or a block of files.}
        ]
        0.0.6 [31-Oct-2006
            {Added option to pass in a file or url as the program. It will
            be LOADed from the target in those cases.}
        ]
        0.0.7 [13-Dec-2006
            {After much thought, I reversed the sources and program param order in the RAWK func.}
        ]
        0.0.8 [20-Apr-2007
            {RAWK func now returns just the most recent exec time}
        ]
    ]
 ]

 ; AWK statements must either be on separate lines, terminated by
 ; a semicolon, or both. If we force patterns to be a paren!, or
 ; actions to be a double-block [[...]], we can identify them
 ; easily, but that's not as friendly to use. We could also make
 ; each "statement" a block, with the last value being the action,
 ; if it's a block!; any other final value indicates the default 
 ; action should be used. That means the main rawk program will
 ; be a double block if there's only one statement in it.

 rawk-dialect-ctx: context [
    ; Helpers for a friendlier RAWK program dialect: PARSE rules meant to
    ; pick pattern/action pairs out of a program block, and functions that
    ; split a block into sub-blocks at its new-line markers.

    ; Values captured by the parse rules below.
    =pattern: =action: none

    ; NOTE(review): presumably the defaults for a statement that omits its
    ; pattern or its action; nothing in this context uses them yet.
    def-pattern: 'every-line
    def-action:  [print __]

    ; NOTE(review): pattern= is an empty rule, so it matches without
    ; consuming input and never sets =pattern. Looks like a placeholder.
    pattern=: []
    action=:  [set =action block!]
    program=: [
        some [
            (=pattern: =action: none)
            pattern= action= | pattern= | action=
        ]
    ]

    find-new-line: func [
        {Returns the first position at or after BLOCK's current index that
        carries a new-line marker; the tail if only the tail is marked;
        NONE if no marker is found.}
        block [block!]
    ] [
        forall block [if new-line? block [return block]]
        either new-line? tail block [tail block] [none]
    ]

    parse-new-lines: func [
        "Parse a block into sub-blocks where new-lines occur"
        block /local result res last-pos next-pos
    ] [
        ; we need to do this, or the original gets modified, because,
        ; as we copy out sub-blocks, we remove line markers.
        block: copy block
        ; A marker on the tail (closing bracket on its own line) does not
        ; start another sub-block. For a non-empty block the loop below
        ; never clears it, so find-new-line would keep answering with the
        ; tail and the loop would never end. Drop it on our copy up front.
        new-line tail block off
        result: copy []
        last-pos: block
        next-pos: find-new-line block
        while [next-pos] [
            ; BACK LAST-POS is the marker position found on the previous
            ; pass (or the head on the first pass); everything from there
            ; up to the next marker is one sub-block.
            if not empty? res: copy/part new-line back last-pos off next-pos [
                append/only result res
            ]
            last-pos: next next-pos
            next-pos: find-new-line last-pos
        ]
        ; Whatever follows the last marker is the final sub-block.
        append/only result copy new-line back last-pos off
        result
    ]

 ]

 rawk-ctx: context [
    ; State and helper words visible to a RAWK program, plus the global
    ; RAWK function (exported with SET) that runs a program over sources.
    ; Programs are bound to this context, so they can use these words
    ; directly.

    usage: {
        line    "current unparsed line/record"
        __      "alias for 'line"
        fields  "current parsed line/record"
        _       "alias for 'fields"
        _nr     "total number of lines/records read so far"
        _fnr    "record number in current file"
        _nf     "number of fields in current record"
        _filename "name of file currently being processed"
        _fs     "field separator"
        _rs     "record separator"
        _exec-times "exec call run times"
    }

    ;-- Simple Words

    every-line: true ; replaces the 'all rule.
    line:   __: none ; current unparsed line/record
    fields:  _: none ; current parsed line/record
    _nr:  0          ; total number of lines/records read so far
    _fnr: 0          ; record number in current file
    _nf:  does [length? _] ; num-fields in current record
    _filename: none  ; name of file, or type, currently being processed
    _fs:  none       ; field separator
    _rs:  "^/"       ; record separator
 ;    _ofs: " "       ; output field separator
 ;    _ors: newline   ; output record separator
    _exec-times: copy [] ; exec call run times
    ;!! Don't mess with these directly. Use the next-line, next-file,
    ;   and exit functions.
    _next-line: false   ; flag for breaking to next line read
    _next-file: false   ; flag for breaking to next file
    _exit:      false   ; flag for stopping further data reads


    ;-- Words that do something

    field: func [
        "Returns field N of the current record (PICK on _)."
        n [integer!]
    ] [pick _ n]

    all-blocks?: func [
        "True if SERIES is a non-empty block containing only any-block! values."
        series [block! any-string!]
    ] [
        all [block? series   parse series [some [any-block!]]]
    ]

    file-list?: func [
        "True if SERIES is a non-empty block containing only file! and url! values."
        series [block! any-string!]
    ] [
        all [block? series   parse series [some [file! | url!]]]
    ]

    next-line: func [
        "Stop rule evaluations for current line, and read the next line."
    ][
        _next-line: true
    ]

    next-file: func [
        "Stop rule evaluations for current file, and read the next file."
    ][
        _next-file: true
    ]

    ; Note: this shadows the native EXIT for code bound to this context.
    exit: func [
        {Stop reading data. END and END_FILE actions will still fire.
        If you don't want them to fire, use system/words/exit.}
    ][
        _exit: true
    ]

    _extract-std-action: func [
        {FOR INTERNAL USE ONLY! Removes RULE and the value following it from
         PROG (modifying PROG) and returns that value. Returns NONE, and
         leaves PROG alone, if RULE is not in PROG.}
        prog [block!]
        rule [word!]
        /local action
    ][
        if found? find prog rule [
            action: prog/:rule
            remove/part find prog rule 2
            return action
        ]
    ]

    set 'rawk func [
        {Execute the RAWK PROGRAM for each record of each source in SOURCES.
         This is what you call to use RAWK. Returns the run time of this
         call, which is also appended to _exec-times.}
        sources [block! any-string!] "Block of file!/url! values, block of blocks, or a spec passed to file-list"
        program [block! file! url!] "Rule/action pairs; a file! or url! is LOADed first"
        /deep "Recurse through sub-directories when looking for files"
        /local prog std-actions start-time lines err
    ][
        ; Start every run from a clean state.
        __: _: _filename: line: fields: none
        _fnr: _nr: 0
        _exit: false

        ; Load the file list if it isn't a pre-populated list of
        ; files they're giving us.
        ; NOTE(review): file-list is not defined in this file; it is
        ; assumed to return a block of sources, or none on failure.
        if all [not file-list? sources  not all-blocks? sources] [
            sources: any [
                either deep [file-list/deep sources] [file-list sources]
                ; Fallback; if file-list failed, and it wasn't a block
                ; of blocks, we'll assume that items are either file
                ; names (to be loaded), strings (to be parsed), or blocks
                ; to be acted on directly. We'll use series? to check,
                ; since that covers all the string and block types.
                remove-each item copy sources [not series? item]
            ]
        ]

        if any [file? program  url? program] [program: load program]

        ; If we use bind here, they can just use _/1 in their
        ; rules, rather than rawk-ctx/_/1.
        ; We're going to mod the prog block here, so we let
        ; bind/copy do a deep copy for us.
        prog: bind/copy program 'self

        ; Extract the special rules from the program for later use.
        ; All that's left in our program block are the rules that
        ; we need to evaluate for each line we process.
        std-actions: reduce [
            'begin      _extract-std-action prog 'begin
            'end        _extract-std-action prog 'end
            'begin-file _extract-std-action prog 'begin-file
            'end-file   _extract-std-action prog 'end-file
        ]

        ; Reset total'ing line counter
        _nr: 0
        ; Do BEGIN action
        do std-actions/begin

        ; Use a full date+time stamp (not just time-of-day) so that a run
        ; which crosses midnight still yields a positive duration.
        start-time: now/precise
        foreach source sources [
            ; Clear file break flag
            _next-file: false
            ; Reset per-file line counter
            _fnr: 0
            ; Set the current filename being processed
            ; NOTE(review): url! sources are read like files below, but
            ; _filename still gets their type here, as documented for
            ; non-file sources.
            _filename: either file? source [source] [type? source]
            ; Do BEGIN-FILE action
            do std-actions/begin-file

            ;?? Should I be using /direct here for large files?
            ;   It slows us down a little bit, but might be worth it.
            ; The TRY covers both reading the source and running the
            ; program's rules and actions on it.
            if error? set/any 'err try [
                ; 10-Apr-2002 Now you can pass in files, strings, or blocks.
                lines: either any [file? source  url? source] [
                    read/lines/with source _rs
                ][
                    either any-string? source [parse/all source _rs] [source]
                ]
                foreach line* lines [
                    ; Clear line break flag
                    _next-line: false
                    ; Increment line/record counters
                    _fnr: add _fnr 1
                    _nr: add _nr 1
                    ; Set references to the current line in both parsed
                    ; and unparsed forms.
                    __: line: line*
                    ;!! might want to allow them to set _fs to " " as well.
                    _: fields: either _fs [parse/all line* _fs] [parse line* _fs]

                    ; *** THE CORE ***
                    ; Evaluate every rule in the program and 'do its associated
                    ; action if necessary. The 'every-line rule matches every
                    ; line in a file.
                    foreach [rule action] prog [
                        if do rule [
                            do action
                            if any [_next-line _next-file _exit] [break]
                        ]
                    ]

                    ; If they use next-file, or exit, in their action, it will
                    ; make us break out of our read loop on the current file.
                    if any [_next-file _exit] [break]
                ]
            ][
                print ["RAWK - error reading" _filename "at line" _fnr newline mold __ mold disarm err]
            ]
            do std-actions/end-file ; Do END-FILE action
            if _exit [break]
        ]
        do std-actions/end  ; Do END action

        ; 22-mar-2002 Data fields no longer cleared.
        ; Clear out data that's not useful out of context.
        ; Don't clear _nr, because that could be useful.
        ;__: _: _fnr: _filename: none

        ; Log the time we spent working; DIFFERENCE of two dates is a time!
        append _exec-times difference now/precise start-time
        last _exec-times
    ]

 ]
	REBOL [
	Title: "AWK in REBOL"
	File: %rawk.r
	Author: "Gregg Irwin"
	Version: 0.0.8
	Comment: {
	Inspired by AWK. The core design is very appealing to me: a data
	driven app that processes text files automatically iterating over
	files and lines, and splitting lines up into fields. I don't care
	for much of the AWK syntax, though, so I'm not shooting for
	compatibility with that.
	}
	History: [
	0.0.1 [12-sep-2001 {Initial release}]
	0.0.2 [22-mar-2002
	{Added try block around file reading code in case a file
	is bad, or not a file (e.g. a directory).}
	{Added next-line, next-file, and exit routines.}
	{__ _ _fnr and _filename are no longer cleared when exec
	is complete. Now that you can exit, you might want to
	interrogate them to see what the final results were.}
	]
	0.0.3 [10-Apr-2002
	{The exec 'files parameter has been renamed to 'sources.}
	{The sources block may now contain file!, string!, or block!
	values. Strings will be parsed and blocks iterated. If a
	source is not a file!, _filename will be the type of source.}
	]
	0.0.4 [7-May-2002
	{Added LINE and FIELDS words which map to __ and _ respectively.}
	]
	0.0.5 [14-Oct-2006
	{Major update. Uses file-list now, eliminated old 'lib model too.}
	{Changed names, so the public function is now 'rawk.}
	{Sources now needs to be either a file-list spec or a block of files.}
	]
	0.0.6 [31-Oct-2006
	{Added option to pass in a file or url as the program. It will
	be LOADed from the target in those cases.}
	]
	0.0.7 [13-Dec-2006
	{After much thought, I reversed the sources and program param order in the RAWK func.}
	]
	0.0.8 [20-Apr-2007
	{RAWK func now returns just the most recent exec time}
	]
	]
	]

	; AWK statements must either be on separate lines, terminated by
	; a semicolon, or both. If we force patterns to be a paren!, or
	; actions to be a double-block [[...]], we can identify them
	; easily, but that's not as friendly to use. We could also make
	; each "statement" a block, with the last value being the action,
	; if it's a block!; any other final value indicates the default
	; action should be used. That means the main rawk program will
	; be a double block if there's only one statement in it.

	rawk-dialect-ctx: context [
	    ; Helpers for a friendlier RAWK program dialect: PARSE rules meant to
	    ; pick pattern/action pairs out of a program block, and functions that
	    ; split a block into sub-blocks at its new-line markers.

	    ; Values captured by the parse rules below.
	    =pattern: =action: none

	    ; NOTE(review): presumably the defaults for a statement that omits its
	    ; pattern or its action; nothing in this context uses them yet.
	    def-pattern: 'every-line
	    def-action: [print __]

	    ; NOTE(review): pattern= is an empty rule, so it matches without
	    ; consuming input and never sets =pattern. Looks like a placeholder.
	    pattern=: []
	    action=: [set =action block!]
	    ; The alternation bars had been escaped as "\|", which is not the
	    ; PARSE alternation operator; they are plain "|" again here.
	    program=: [
	        some [
	            (=pattern: =action: none)
	            pattern= action= | pattern= | action=
	        ]
	    ]

	    find-new-line: func [
	        {Returns the first position at or after BLOCK's current index that
	        carries a new-line marker; the tail if only the tail is marked;
	        NONE if no marker is found.}
	        block [block!]
	    ] [
	        forall block [if new-line? block [return block]]
	        either new-line? tail block [tail block] [none]
	    ]

	    parse-new-lines: func [
	        "Parse a block into sub-blocks where new-lines occur"
	        block /local result res last-pos next-pos
	    ] [
	        ; we need to do this, or the original gets modified, because,
	        ; as we copy out sub-blocks, we remove line markers.
	        block: copy block
	        ; A marker on the tail (closing bracket on its own line) does not
	        ; start another sub-block. For a non-empty block the loop below
	        ; never clears it, so find-new-line would keep answering with the
	        ; tail and the loop would never end. Drop it on our copy up front.
	        new-line tail block off
	        result: copy []
	        last-pos: block
	        next-pos: find-new-line block
	        while [next-pos] [
	            ; BACK LAST-POS is the marker position found on the previous
	            ; pass (or the head on the first pass); everything from there
	            ; up to the next marker is one sub-block.
	            if not empty? res: copy/part new-line back last-pos off next-pos [
	                append/only result res
	            ]
	            last-pos: next next-pos
	            next-pos: find-new-line last-pos
	        ]
	        ; Whatever follows the last marker is the final sub-block.
	        append/only result copy new-line back last-pos off
	        result
	    ]

	]

	rawk-ctx: context [
	    ; State and helper words visible to a RAWK program, plus the global
	    ; RAWK function (exported with SET) that runs a program over sources.
	    ; Programs are bound to this context, so they can use these words
	    ; directly.

	    usage: {
	line "current unparsed line/record"
	__ "alias for 'line"
	fields "current parsed line/record"
	_ "alias for 'fields"
	_nr "total number of lines/records read so far"
	_fnr "record number in current file"
	_nf "number of fields in current record"
	_filename "name of file currently being processed"
	_fs "field separator"
	_rs "record separator"
	_exec-times "exec call run times"
	}

	    ;-- Simple Words

	    every-line: true ; replaces the 'all rule.
	    line: __: none ; current unparsed line/record
	    fields: _: none ; current parsed line/record
	    _nr: 0 ; total number of lines/records read so far
	    _fnr: 0 ; record number in current file
	    _nf: does [length? _] ; num-fields in current record
	    _filename: none ; name of file, or type, currently being processed
	    _fs: none ; field separator
	    _rs: "^/" ; record separator
	    ; _ofs: " " ; output field separator
	    ; _ors: newline ; output record separator
	    _exec-times: copy [] ; exec call run times
	    ;!! Don't mess with these directly. Use the next-line, next-file,
	    ; and exit functions.
	    _next-line: false ; flag for breaking to next line read
	    _next-file: false ; flag for breaking to next file
	    _exit: false ; flag for stopping further data reads


	    ;-- Words that do something

	    field: func [
	        "Returns field N of the current record (PICK on _)."
	        n [integer!]
	    ] [pick _ n]

	    all-blocks?: func [
	        "True if SERIES is a non-empty block containing only any-block! values."
	        series [block! any-string!]
	    ] [
	        all [block? series parse series [some [any-block!]]]
	    ]

	    ; The alternation bar had been escaped as "\|", which is not the
	    ; PARSE alternation operator; it is a plain "|" again here.
	    file-list?: func [
	        "True if SERIES is a non-empty block containing only file! and url! values."
	        series [block! any-string!]
	    ] [
	        all [block? series parse series [some [file! | url!]]]
	    ]

	    next-line: func [
	        "Stop rule evaluations for current line, and read the next line."
	    ][
	        _next-line: true
	    ]

	    next-file: func [
	        "Stop rule evaluations for current file, and read the next file."
	    ][
	        _next-file: true
	    ]

	    ; Note: this shadows the native EXIT for code bound to this context.
	    exit: func [
	        {Stop reading data. END and END_FILE actions will still fire.
	        If you don't want them to fire, use system/words/exit.}
	    ][
	        _exit: true
	    ]

	    _extract-std-action: func [
	        {FOR INTERNAL USE ONLY! Removes RULE and the value following it from
	        PROG (modifying PROG) and returns that value. Returns NONE, and
	        leaves PROG alone, if RULE is not in PROG.}
	        prog [block!]
	        rule [word!]
	        /local action
	    ][
	        if found? find prog rule [
	            action: prog/:rule
	            remove/part find prog rule 2
	            return action
	        ]
	    ]

	    set 'rawk func [
	        {Execute the RAWK PROGRAM for each record of each source in SOURCES.
	        This is what you call to use RAWK. Returns the run time of this
	        call, which is also appended to _exec-times.}
	        sources [block! any-string!] "Block of file!/url! values, block of blocks, or a spec passed to file-list"
	        program [block! file! url!] "Rule/action pairs; a file! or url! is LOADed first"
	        /deep "Recurse through sub-directories when looking for files"
	        /local prog std-actions start-time lines err
	    ][
	        ; Start every run from a clean state.
	        __: _: _filename: line: fields: none
	        _fnr: _nr: 0
	        _exit: false

	        ; Load the file list if it isn't a pre-populated list of
	        ; files they're giving us.
	        ; NOTE(review): file-list is not defined in this file; it is
	        ; assumed to return a block of sources, or none on failure.
	        if all [not file-list? sources not all-blocks? sources] [
	            sources: any [
	                either deep [file-list/deep sources] [file-list sources]
	                ; Fallback; if file-list failed, and it wasn't a block
	                ; of blocks, we'll assume that items are either file
	                ; names (to be loaded), strings (to be parsed), or blocks
	                ; to be acted on directly. We'll use series? to check,
	                ; since that covers all the string and block types.
	                remove-each item copy sources [not series? item]
	            ]
	        ]

	        if any [file? program url? program] [program: load program]

	        ; If we use bind here, they can just use _/1 in their
	        ; rules, rather than rawk-ctx/_/1.
	        ; We're going to mod the prog block here, so we let
	        ; bind/copy do a deep copy for us.
	        prog: bind/copy program 'self

	        ; Extract the special rules from the program for later use.
	        ; All that's left in our program block are the rules that
	        ; we need to evaluate for each line we process.
	        std-actions: reduce [
	            'begin _extract-std-action prog 'begin
	            'end _extract-std-action prog 'end
	            'begin-file _extract-std-action prog 'begin-file
	            'end-file _extract-std-action prog 'end-file
	        ]

	        ; Reset total'ing line counter
	        _nr: 0
	        ; Do BEGIN action
	        do std-actions/begin

	        ; Use a full date+time stamp (not just time-of-day) so that a run
	        ; which crosses midnight still yields a positive duration.
	        start-time: now/precise
	        foreach source sources [
	            ; Clear file break flag
	            _next-file: false
	            ; Reset per-file line counter
	            _fnr: 0
	            ; Set the current filename being processed
	            ; NOTE(review): url! sources are read like files below, but
	            ; _filename still gets their type here, as documented for
	            ; non-file sources.
	            _filename: either file? source [source] [type? source]
	            ; Do BEGIN-FILE action
	            do std-actions/begin-file

	            ;?? Should I be using /direct here for large files?
	            ; It slows us down a little bit, but might be worth it.
	            ; The TRY covers both reading the source and running the
	            ; program's rules and actions on it.
	            if error? set/any 'err try [
	                ; 10-Apr-2002 Now you can pass in files, strings, or blocks.
	                lines: either any [file? source url? source] [
	                    read/lines/with source _rs
	                ][
	                    either any-string? source [parse/all source _rs] [source]
	                ]
	                foreach line* lines [
	                    ; Clear line break flag
	                    _next-line: false
	                    ; Increment line/record counters
	                    _fnr: add _fnr 1
	                    _nr: add _nr 1
	                    ; Set references to the current line in both parsed
	                    ; and unparsed forms.
	                    __: line: line*
	                    ;!! might want to allow them to set _fs to " " as well.
	                    _: fields: either _fs [parse/all line* _fs] [parse line* _fs]

	                    ; *** THE CORE ***
	                    ; Evaluate every rule in the program and 'do its associated
	                    ; action if necessary. The 'every-line rule matches every
	                    ; line in a file.
	                    foreach [rule action] prog [
	                        if do rule [
	                            do action
	                            if any [_next-line _next-file _exit] [break]
	                        ]
	                    ]

	                    ; If they use next-file, or exit, in their action, it will
	                    ; make us break out of our read loop on the current file.
	                    if any [_next-file _exit] [break]
	                ]
	            ][
	                print ["RAWK - error reading" _filename "at line" _fnr newline mold __ mold disarm err]
	            ]
	            do std-actions/end-file ; Do END-FILE action
	            if _exit [break]
	        ]
	        do std-actions/end ; Do END action

	        ; 22-mar-2002 Data fields no longer cleared.
	        ; Clear out data that's not useful out of context.
	        ; Don't clear _nr, because that could be useful.
	        ;__: _: _fnr: _filename: none

	        ; Log the time we spent working; DIFFERENCE of two dates is a time!
	        append _exec-times difference now/precise start-time
	        last _exec-times
	    ]

	]