Skip to content

Instantly share code, notes, and snippets.

@ericwastaken
Created April 23, 2020 19:19
Show Gist options
  • Save ericwastaken/083a039e022b4196fedad52e2dfa3416 to your computer and use it in GitHub Desktop.
Save ericwastaken/083a039e022b4196fedad52e2dfa3416 to your computer and use it in GitHub Desktop.
Random Sample
#!/bin/bash
############################################################################
# Random Sample - rsample.sh
#
# A utility to output a sample number of lines from
# a text file based on a percentage passed in as an
# argument.
#
# Dependencies:
# - shuf (http://man7.org/linux/man-pages/man1/shuf.1.html)
# - On macOS, shuf is part of GNU coreutils. Install with
# `brew install coreutils`.
#
# Tested on macOS with zsh.
#
# Copyright 2020 Eric A. Soto, [email protected]
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# - The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
############################################################################
# Helpers
# Prints the one-line program banner: the tool title followed by the
# path this script was invoked as ($0). Other messages are printed under it.
showBanner() {
  printf 'Random Sample - %s\n' "$0"
}
# Prints the banner followed by the usage text for this command.
# Outputs: the banner, four usage lines and a trailing blank line on stdout.
showSyntax() {
  showBanner
  # One printf call; each argument becomes its own output line.
  printf '%s\n' \
    "Syntax: $0 [percent] [/path/to/file]" \
    " percent - an integer, percent of the total file to output randomly." \
    " file - The full path to the file to sample." \
    "Output is to stdout. Redirect to save to file." \
    ""
}
# Reports that the input file is not present.
# Arguments: $1 - the path that could not be found
# Outputs:   the banner and a "Missing file" line on stdout
missingFile() {
  showBanner
  printf 'Missing file: %s\n' "$1"
}
# Let's do this!
# Main flow: validate the two arguments, count the lines of the input file,
# and print a random sample of PERCENT percent of those lines to stdout.
#
# Diagnostics are sent to stderr so they never end up inside a sample that
# the caller redirected to a file.
# Exit status: 255 on bad usage (missing or non-integer argument),
#              254 when the input file does not exist,
#              otherwise the status of the last command run.

# Sanity - we expect two args
if [ $# -lt 2 ]; then
  showSyntax >&2
  exit 255
fi

# Assign incoming arguments
PERCENT=$1
INFILEPATH=$2

# Sanity - percent must be a non-negative decimal integer. Rejecting anything
# else here keeps arbitrary text out of the arithmetic expansion below.
case "$PERCENT" in
  '' | *[!0-9]*)
    showSyntax >&2
    exit 255
    ;;
esac

# Sanity - is file present?
if [ ! -e "$INFILEPATH" ]; then
  missingFile "$INFILEPATH" >&2
  exit 254
fi

# ASSERT: We have two valid arguments, and the file exists.

# Figure out how many lines the file has. Stop if it cannot be read.
# (wc -l counts newline characters, so an unterminated last line is not
# included in the count.)
lineCount=$(wc -l < "$INFILEPATH") || exit

# Calc percent of lines (integer math, the result is rounded down).
# The 10# prefix forces base 10 so a value such as "08" is not parsed as octal.
sampleCount=$(( lineCount * 10#$PERCENT / 100 ))

# Output sampleCount random lines. The file is fed through stdin so a path
# beginning with "-" cannot be mistaken for a shuf option.
shuf -n "$sampleCount" < "$INFILEPATH"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment