Skip to content

Instantly share code, notes, and snippets.

@deepcube
Last active December 17, 2015 09:09
Show Gist options
  • Save deepcube/5585737 to your computer and use it in GitHub Desktop.
Save deepcube/5585737 to your computer and use it in GitHub Desktop.
Given a file, print each word followed by byte offsets at which it is found. offsets.c and offsets.bash should be interchangeable. ./offsets < file | awk -f offsets.awk | sort | column -t
# offsets.awk
#
# given a list of word offset pairs, accumulate offsets and print
# word [offset,...] pairs
# use with offsets.c or offsets.bash (both emit one "word offset" pair per line)
# no good way to shebang awk, so just do awk -f manually
# Per-record rule: field 1 is the word, field 2 is one offset for that word.
# Append the offset to the word's running list, inserting a comma only when
# the list already holds at least one entry.
{
    if (a[$1] != "")
        sep = ","
    else
        sep = ""
    a[$1] = a[$1] sep $2
}

# Once all input has been consumed, print every word with its collected
# offsets. Iteration order of "for (key in array)" is unspecified, so pipe
# the output through sort if a stable order is required.
END {
    for (word in a) {
        printf("%s [%s]\n", word, a[word])
    }
}
#!/usr/bin/env bash
#
# offsets.bash
#
# for each word (grouping of alphanumeric characters) print the word and the
# byte offset into the file
# combine with offsets.awk to get a listing of word [offset,...] pairs
#
# Force the C locale so that `read -n 1` consumes exactly one byte and
# [[:alnum:]] matches only ASCII letters and digits. Without this, a UTF-8
# locale makes the script count characters instead of bytes and disagree
# with offsets.c.
LC_ALL=C

word=   # alphanumeric run currently being accumulated (empty = not in a word)
off=0   # byte offset of the byte held in $c

# -d '' makes NUL the delimiter so newlines are read as ordinary bytes;
# a NUL byte itself yields an empty $c, which is treated as a separator.
while IFS= read -r -d '' -n 1 c; do
    if [[ $c == [[:alnum:]] ]]; then
        word+=$c
    elif [[ $word ]]; then
        # $off is the offset of the separator just read, so the word started
        # ${#word} bytes earlier.
        printf "%s %zu\n" "$word" "$((off - ${#word}))"
        word=
    fi
    ((off++))
done

# Input ended in the middle of a word (no trailing separator): emit it too.
# Here $off equals the total number of bytes read.
if [[ $word ]]; then
    printf "%s %zu\n" "$word" "$((off - ${#word}))"
fi
#include <ctype.h>
#include <stdio.h>
/*
* offsets.c
*
* for each word (grouping of alphanumeric characters) print the word and the
* byte offset into the file
* combine with offsets.awk to get a listing of word [offset,...] pairs
*/
/*
 * Read stdin byte by byte and, for every maximal run of alphanumeric
 * characters, write "<word> <offset>\n" to stdout, where <offset> is the
 * zero-based byte offset of the word's first character.
 *
 * Words are streamed straight to stdout instead of being collected in a
 * fixed-size buffer, so there is no limit on word length.
 *
 * Returns 0 on success, 1 if reading stdin or writing stdout failed.
 */
int main(void)
{
    size_t off = 0;     /* offset of the byte currently held in c */
    size_t start = 0;   /* offset at which the current word began */
    int in_word = 0;    /* non-zero while inside an alphanumeric run */
    int c;              /* int, not char: must be able to hold EOF */

    while ((c = fgetc(stdin)) != EOF) {
        /* fgetc yields an unsigned char value, so c is safe for isalnum */
        if (isalnum(c)) {
            if (!in_word) {
                start = off;
                in_word = 1;
            }
            putchar(c);
        } else if (in_word) {
            printf(" %zu\n", start);
            in_word = 0;
        }
        off++;
    }

    /* input ended in the middle of a word: finish its output line */
    if (in_word)
        printf(" %zu\n", start);

    if (ferror(stdin) || fflush(stdout) == EOF) {
        perror("offsets");
        return 1;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment