Created
February 16, 2011 05:28
-
-
Save rzezeski/828918 to your computer and use it in GitHub Desktop.
Pull a random sample of lines from the files in a directory (http://www.progski.net/blog/2011/random_lines_in_bash.html)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Pull a random sample of size 1000 from a directory of files and
# write to standard out.
#
# Usage:
#   ./sample-lines /path/to/dir > mysample.log

# A directory argument is required; print usage and stop rather than
# carrying on with an empty path.
if [ $# -lt 1 ]; then
  echo "Usage: $0 PATH" >&2
  exit 1
fi

# Scratch file that will hold the generated sed script; removed on any exit.
TMP=$(mktemp -t sample_lines.XXXX) || exit 1
# Single quotes defer expansion to exit time, so the path is quoted correctly
# even if it contains whitespace.
trap 'rm -f -- "$TMP"' EXIT

DIR=$1

# Total number of lines across every file in DIR. Counting the concatenated
# stream avoids depending on wc's "total" summary line, which is only printed
# when more than one file is given and can collide with filenames that
# contain the word "total". tr strips the padding some wc builds emit.
LC=$(cat -- "$DIR"/* | wc -l | tr -d '[:space:]')

# Nothing to sample from (empty directory, unreadable files, empty files).
if [ "${LC:-0}" -le 0 ]; then
  echo "$0: no lines found in $DIR" >&2
  exit 1
fi

echo "pulling 1000 samples from $LC lines in $DIR" >&2
# Print one random line number in the range 1..LIMIT (inclusive).
#
# Globals:   LC (read) - used as the upper bound when no argument is given
# Arguments: $1 - optional upper bound; defaults to $LC
# Outputs:   the chosen line number on stdout, followed by a newline
#
# The result is 1-based because it is used as a sed line address, and sed
# addresses start at 1 (an address of 0 is rejected by GNU sed).
rand()
{
  local limit=${1:-$LC}

  # Read 4 random bytes as one unsigned 32-bit integer, scale it into
  # [0, limit) by dividing by 2^32, truncate, then shift into [1, limit].
  od -N4 -An -tu4 /dev/urandom \
    | awk -v lim="$limit" \
        'NR == 1 { printf "%d\n", 1 + int(lim * ($1 / 4294967296)) }'
}
# Build a sed script with one "<line-number>p" command per draw. The same
# number may be drawn more than once, in which case sed prints that line once
# per occurrence. The redirection wraps the whole loop so the scratch file is
# opened a single time instead of once per iteration.
for i in {1..1000}; do
  printf '%sp\n' "$(rand)"
done >> "$TMP"

# -n suppresses sed's default output so only the selected lines are printed.
# sed numbers lines cumulatively across all the input files, which matches
# how the total line count was computed.
sed -n -f "$TMP" -- "$DIR"/*
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment