Last active
December 17, 2015 09:09
-
-
Save deepcube/5585737 to your computer and use it in GitHub Desktop.
Given a file, print each word followed by the byte offsets at which it is found. offsets.c and offsets.bash should be interchangeable. Usage: ./offsets < file | awk -f offsets.awk | sort | column -t
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# offsets.awk | |
# | |
# given a list of word offset pairs, accumulate offsets and print | |
# word [offset,...] pairs | |
# use with offsets.c | |
# no good way to shebang awk, so just do awk -f manually | |
# For every input record, append the offset (field 2) to the comma-separated
# list kept for the word (field 1).
{
    word = $1
    # A separator is only needed once something has already been collected.
    sep = (length(seen[word]) > 0) ? "," : ""
    seen[word] = seen[word] sep $2
}

# After all input has been read, emit one "word [offset,...]" line per word.
# Iteration order of the array is unspecified, hence the external sort.
END {
    for (word in seen)
        printf("%s [%s]\n", word, seen[word])
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# | |
# offsets.bash | |
# | |
# for each word (grouping of alphanumeric characters) print the word and the | |
# byte offset into the file | |
# combine with offsets.awk to get a listing of word [offset,...] pairs | |
# | |
# Force the C locale so that `read -n 1` consumes exactly one byte and
# [[:alnum:]] matches ASCII letters and digits only; in a multibyte locale
# `read -n` counts characters, which would make the printed offsets wrong.
LC_ALL=C

word=   # alphanumeric bytes collected for the word in progress
off=0   # zero-based byte offset of the byte currently being examined

# -d '' makes NUL the delimiter, so newlines are returned in c like any other
# byte; a NUL byte itself shows up as an empty c and simply ends a word.
while IFS= read -r -d '' -n 1 c; do
    if [[ $c == [[:alnum:]] ]]; then
        word+=$c
    elif [[ $word ]]; then
        # off points at the first byte after the word
        printf "%s %zu\n" "$word" "$((off - ${#word}))"
        word=
    fi
    ((off++))
done

# Input that ends in the middle of a word never hits the branch above, so
# flush the pending word here; off now equals the total number of bytes read.
if [[ $word ]]; then
    printf "%s %zu\n" "$word" "$((off - ${#word}))"
fi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <ctype.h> | |
#include <stdio.h> | |
/* | |
* offsets.c | |
* | |
* for each word (grouping of alphanumeric characters) print the word and the | |
* byte offset into the file | |
* combine with offsets.awk to get a listing of word [offset,...] pairs | |
*/ | |
/*
 * Read stdin one byte at a time and, for every maximal run of alphanumeric
 * bytes (a "word"), print "<word> <offset>\n" to stdout, where <offset> is
 * the zero-based byte position of the word's first byte.
 *
 * A word longer than the buffer is printed truncated to sizeof buf - 1
 * bytes; its offset still refers to the start of the whole word.
 *
 * Returns 0 on success, 1 if reading stdin failed.
 */
int main(void)
{
    char buf[256];
    size_t stored = 0;   /* bytes of the current word kept in buf */
    size_t wordlen = 0;  /* full length of the current word */
    size_t off = 0;      /* offset of the byte currently examined */
    int c;               /* int, not char: must hold every byte value and EOF */

    for (;;) {
        c = fgetc(stdin);

        /* fgetc yields an unsigned char value or EOF, so c is always a
         * valid argument for isalnum once EOF has been ruled out */
        if (c != EOF && isalnum(c)) {
            /* keep one slot free for the terminating NUL */
            if (stored < sizeof buf - 1) {
                buf[stored++] = (char)c;
            }
            wordlen++;
        } else if (wordlen != 0) {
            /* a separator or end of input terminates the pending word;
             * off is the position just past its last byte */
            buf[stored] = '\0';
            printf("%s %zu\n", buf, off - wordlen);
            stored = 0;
            wordlen = 0;
        }

        if (c == EOF) {
            break;
        }
        off++;
    }

    return ferror(stdin) ? 1 : 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment