Created
July 29, 2017 20:16
-
-
Save greggirwin/6397bdfe8e9e7dd9f3d543bc4b0e570c to your computer and use it in GitHub Desktop.
RAWK in R2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REBOL [ | |
Title: "AWK in REBOL" | |
File: %rawk.r | |
Author: "Gregg Irwin" | |
Version: 0.0.8 | |
Comment: { | |
Inspired by AWK. The core design is very appealing to me: a data | |
driven app that processes text files automatically iterating over | |
files and lines, and splitting lines up into fields. I don't care | |
for much of the AWK syntax, though, so I'm not shooting for | |
compatibility with that. | |
} | |
History: [ | |
0.0.1 [12-sep-2001 {Initial release}] | |
0.0.2 [22-mar-2002 | |
{Added try block around file reading code in case a file | |
is bad, or not a file (e.g. a directory).} | |
{Added next-line, next-file, and exit routines.} | |
{__ _ _fnr and _filename are no longer cleared when exec | |
is complete. Now that you can exit, you might want to | |
interrogate them to see what the final results were.} | |
] | |
0.0.3 [10-Apr-2002 | |
{The exec 'files parameter has been renamed to 'sources.} | |
{The sources block may now contain file!, string!, or block! | |
values. Strings will be parsed and blocks iterated. If a | |
source is not a file!, _filename will be the type of source.} | |
] | |
0.0.4 [7-May-2002 | |
{Added LINE and FIELDS words which map to __ and _ respectively.} | |
] | |
0.0.5 [14-Oct-2006 | |
{Major update. Uses file-list now, eliminated old 'lib model too.} | |
{Changed names, so the public function is now 'rawk.} | |
{Sources now needs to be either a file-list spec or a block of files.} | |
] | |
0.0.6 [31-Oct-2006 | |
{Added option to pass in a file or url as the program. It will | |
be LOADed from the target in those cases.} | |
] | |
0.0.7 [13-Dec-2006 | |
{After much thought, I reversed the sources and program param order in the RAWK func.} | |
] | |
0.0.8 [20-Apr-2007 | |
{RAWK func now returns just the most recent exec time} | |
] | |
] | |
] | |
; AWK statements must either be on separate lines, terminated by
; a semicolon, or both. If we force patterns to be a paren!, or
; actions to be a double-block [[...]], we can identify them
; easily, but that's not as friendly to use. We could also make
; each "statement" a block, with the last value being the action,
; if it's a block!; any other final value indicates the default
; action should be used. That means the main rawk program will
; be a double block if there's only one statement in it.
rawk-dialect-ctx: context [
    ; Values captured while a RAWK program is parsed with the rules below.
    =pattern: =action: none

    ; Defaults for statements that omit their pattern or their action.
    def-pattern: 'every-line
    def-action: [print __]

    ;-- Parse rules
    pattern=: []                        ; empty rule; no pattern syntax is defined here yet
    action=: [set =action block!]       ; an action is any block!; it is stored in =action
    program=: [
        some [
            (=pattern: =action: none)
            pattern= action= | pattern= | action=
        ]
    ]

    find-new-line: func [
        {Returns BLOCK at the first position (from the current index) that
        carries a new-line marker. The tail is returned if only the tail
        carries one. Returns none if no marker is found.}
        block [block!]
    ][
        forall block [if new-line? block [return block]]
        either new-line? tail block [tail block] [none]
    ]

    parse-new-lines: func [
        "Parse a block into sub-blocks where new-lines occur"
        block /local result res last-pos next-pos
    ][
        ; we need to do this, or the original gets modified, because,
        ; as we copy out sub-blocks, we remove line markers.
        block: copy block
        result: copy []
        last-pos: block
        next-pos: find-new-line block
        ; A marker on the tail (closing bracket on its own line) does not
        ; start a new sub-block, and the tail can't be advanced past with
        ; NEXT, so scanning must stop there. Without the tail? check the
        ; loop would find the tail marker again on every pass and never end.
        while [all [next-pos  not tail? next-pos]] [
            ; Each sub-block runs from the previous marker up to, but not
            ; including, the next one. Its leading marker is cleared first.
            if not empty? res: copy/part new-line back last-pos off next-pos [
                append/only result res
            ]
            last-pos: next next-pos
            next-pos: find-new-line last-pos
        ]
        ; Whatever follows the last marker is the final sub-block.
        append/only result copy new-line back last-pos off
        result
    ]
]
rawk-ctx: context [
    ; Summary of the words a RAWK program can use. Programs are bound to
    ; this context by RAWK, so they can refer to these words directly.
    usage: {
    line         "current unparsed line/record"
    __           "alias for 'line"
    fields       "current parsed line/record"
    _            "alias for 'fields"
    _nr          "total number of lines/records read so far"
    _fnr         "record number in current file"
    _nf          "number of fields in current record"
    _filename    "name of file currently being processed"
    _fs          "field separator"
    _rs          "record separator"
    _exec-times  "exec call run times"
}

    ;-- Simple Words
    every-line: true        ; replaces the 'all rule.
    line: __: none          ; current unparsed line/record
    fields: _: none         ; current parsed line/record
    _nr: 0                  ; total number of lines/records read so far
    _fnr: 0                 ; record number in current file
    ; num-fields in current record; 0 while no record has been parsed yet
    ; (e.g. inside a BEGIN action), instead of failing on LENGTH? NONE.
    _nf: does [either series? _ [length? _] [0]]
    _filename: none         ; name of file/url, or type of a non-file source, being processed
    _fs: none               ; field separator
    _rs: "^/"               ; record separator
    ; _ofs: " "             ; output field separator
    ; _ors: newline         ; output record separator
    _exec-times: copy []    ; exec call run times

    ;!! Don't mess with these directly. Use the next-line, next-file,
    ;   and exit functions.
    _next-line: false       ; flag for breaking to next line read
    _next-file: false       ; flag for breaking to next file
    _exit: false            ; flag for stopping further data reads

    ;-- Words that do something
    field: func [
        "Returns field N of the current record, or none if there is no current record."
        n [integer!]
    ][
        if series? _ [pick _ n]
    ]

    all-blocks?: func [
        "True if SERIES is a block that contains only any-block! values."
        series [block! any-string!]
    ][
        all [block? series parse series [some [any-block!]]]
    ]

    file-list?: func [
        "True if SERIES is a block that contains only file! and url! values."
        series [block! any-string!]
    ][
        all [block? series parse series [some [file! | url!]]]
    ]

    next-line: func [
        "Stop rule evaluations for current line, and read the next line."
    ][
        _next-line: true
    ]

    next-file: func [
        "Stop rule evaluations for current file, and read the next file."
    ][
        _next-file: true
    ]

    exit: func [
        {Stop reading data. END and END_FILE actions will still fire.
        If you don't want them to fire, use system/words/exit.}
    ][
        _exit: true
    ]

    _extract-std-action: func [
        {FOR INTERNAL USE ONLY! Extracts the action for the specified rule
        from the program and returns it.}
        prog [block!]
        rule [word!]
        /local pos action
    ][
        ; One search is enough: the action is the value that follows the
        ; rule word. Both are removed from PROG. Returns none when the
        ; rule is not in the program.
        if pos: find prog rule [
            action: pick pos 2
            remove/part pos 2
            action
        ]
    ]

    set 'rawk func [
        {Execute the RAWK 'program' contained in the program block for each
        source in the sources block. Returns the run time of this call.
        This is what you call to use RAWK.}
        sources [block! any-string!]
        program [block! file! url!]
        /deep "Recurse through sub-directories when looking for files"
        /local prog std-actions start-time lines err
    ][
        ; Start every run from a clean state, including all control flags.
        __: _: _filename: line: fields: none
        _fnr: _nr: 0
        _next-line: _next-file: _exit: false

        ; Load the file list if it isn't a pre-populated list of
        ; files they're giving us.
        if all [not file-list? sources  not all-blocks? sources] [
            sources: any [
                either deep [file-list/deep sources] [file-list sources]
                ; Fallback; if file-list failed, and it wasn't a block
                ; of blocks, we'll assume that items are either file
                ; names (to be loaded), strings (to be parsed), or blocks
                ; to be acted on directly. We'll use series? to check,
                ; since that covers all the string and block types.
                remove-each item copy sources [not series? item]
            ]
        ]

        if any [file? program  url? program] [program: load program]

        ; If we use bind here, they can just use _/1 in their
        ; rules, rather than rawk-ctx/_/1.
        ; We're going to mod the prog block here, so we let
        ; bind/copy do a deep copy for us.
        prog: bind/copy program 'self

        ; Extract the special rules from the program for later use.
        ; All that's left in our program block are the rules that
        ; we need to evaluate for each line we process.
        std-actions: reduce [
            'begin      _extract-std-action prog 'begin
            'end        _extract-std-action prog 'end
            'begin-file _extract-std-action prog 'begin-file
            'end-file   _extract-std-action prog 'end-file
        ]

        ; Do BEGIN action
        do std-actions/begin

        ; Keep the full date, not just the time of day, so a run that
        ; crosses midnight still yields a positive elapsed time.
        start-time: now/precise

        foreach source sources [
            ; Clear file break flag
            _next-file: false
            ; Reset per-file line counter
            _fnr: 0
            ; Set the current filename being processed. Files and urls are
            ; both read by name below, so both are reported by name; any
            ; other source is reported by its datatype.
            _filename: either any [file? source  url? source] [source] [type? source]
            ; Do BEGIN-FILE action
            do std-actions/begin-file

            ;?? Should I be using /direct here for large files?
            ;   It slows us down a little bit, but might be worth it.
            ; Note that the TRY covers rule evaluation as well as reading, so
            ; an error raised by a rule or action is reported here too.
            if error? set/any 'err try [
                ; 10-Apr-2002 Now you can pass in files, strings, or blocks.
                lines: either any [file? source  url? source] [
                    read/lines/with source _rs
                ][
                    either any-string? source [parse/all source _rs] [source]
                ]
                foreach line* lines [
                    ; Clear line break flag
                    _next-line: false
                    ; Increment line/record counters
                    _fnr: add _fnr 1
                    _nr: add _nr 1
                    ; Set references to the current line in both parsed
                    ; and unparsed forms.
                    __: line: line*
                    ;!! might want to allow them to set _fs to " " as well.
                    _: fields: either _fs [parse/all line* _fs] [parse line* _fs]

                    ; *** THE CORE ***
                    ; Evaluate every rule in the program and 'do its associated
                    ; action if necessary. The 'every-line rule matches every
                    ; line in a file.
                    foreach [rule action] prog [
                        if do rule [
                            do action
                            if any [_next-line _next-file _exit] [break]
                        ]
                    ]
                    ; If they use next-file, or exit, in their action, it will
                    ; make us break out of our read loop on the current file.
                    if any [_next-file _exit] [break]
                ]
            ][
                print ["RAWK - error reading" _filename "at line" _fnr newline mold __ mold disarm err]
            ]

            do std-actions/end-file     ; Do END-FILE action
            if _exit [break]
        ]

        do std-actions/end              ; Do END action

        ; 22-mar-2002 Data fields no longer cleared.
        ; Clear out data that's not useful out of context.
        ; Don't clear _nr, because that could be useful.
        ;__: _: _fnr: _filename: none

        ; Log the time we spent working, and return it.
        append _exec-times difference now/precise start-time
        last _exec-times
    ]
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment