Skip to content

Instantly share code, notes, and snippets.

@furkantektas
Created December 9, 2015 19:48
Show Gist options
  • Save furkantektas/96f877495d0665115123 to your computer and use it in GitHub Desktop.
Save furkantektas/96f877495d0665115123 to your computer and use it in GitHub Desktop.
Test and training data generator
#!/usr/bin/env bash
# Test and training data generator.
#
# Creates separate training and test datasets from every data*.log file found
# below the current directory. In each of the $rounds rounds, two random,
# non-overlapping windows of $wsize consecutive lines are picked. The lower
# window of every data file is written to
#   training-<round>-<first>-<last>.log
# and the upper window to
#   test-<round>-<first>-<last>.log
# where <first>/<last> are 1-based, inclusive line numbers. An existing file
# with the same name is overwritten. Every generated file name is printed to
# stdout; diagnostics go to stderr.
#
# Requires bash: $RANDOM, arrays and C-style for loops are not POSIX sh.
#
# Status: 0 on success,
#         1 if no data*.log file is found or one cannot be read,
#         2 if the shortest data file is too short for the two windows.

wsize=100    # window size: how many lines are gathered from each data*.log file
rounds=20    # number of training/test file pairs to create
skip=2       # leading lines of every data file that are never sampled
maxlen=9999  # upper bound on the number of lines considered per data file

#######################################
# Prints the smallest line count among the given files, capped at $maxlen.
# Globals:   maxlen (read)
# Arguments: $@ - paths of the files to measure
# Outputs:   the line count on stdout
# Returns:   0 on success, 1 if a file cannot be read
#######################################
shortest_length() {
  local shortest=$maxlen
  local path count
  for path in "$@"; do
    count=$(wc -l < "$path") || return 1
    if (( count < shortest )); then
      shortest=$count
    fi
  done
  printf '%s\n' "$shortest"
}

#######################################
# Writes lines first..last (inclusive) of every given file, in order, to one
# output file, replacing any previous content of that file.
# Arguments: $1 - output file, $2 - first line, $3 - last line,
#            $4... - input files
# Returns:   0 on success, 1 if sed fails on any input file
#######################################
extract_window() {
  local out=$1 first=$2 last=$3
  local path
  shift 3
  for path in "$@"; do
    # The second expression stops reading once the window has been printed.
    sed -n -e "${first},${last}p" -e "${last}q" "$path" || return 1
  done > "$out"
}

#######################################
# Entry point: locates the data files, validates their length and generates
# the training/test file pairs.
# Globals:   wsize, rounds, skip (read)
# Outputs:   generated file names on stdout, diagnostics on stderr
# Returns:   see "Status" in the file header
#######################################
main() {
  local -a files=()
  local path
  # Collect the data files once, NUL-delimited, so that odd file names survive
  # and the training and test files are built from the same list and order.
  while IFS= read -r -d '' path; do
    if [[ -f "$path" ]]; then
      files+=("$path")
    fi
  done < <(find . -name 'data*.log' -print0)

  if (( ${#files[@]} == 0 )); then
    echo "No data*.log files found" >&2
    return 1
  fi

  local filelen
  filelen=$(shortest_length "${files[@]}") || return 1

  if (( filelen < wsize )); then
    echo "File line length is smaller than window size" >&2
    return 2
  fi

  # Lines left over after the skipped header and both windows are placed.
  local slack=$(( filelen - skip - 2 * wsize ))
  if (( slack < 0 )); then
    echo "File line length is too small for two non-overlapping windows" >&2
    return 2
  fi

  local round lo hi tmp out
  local train_first train_last test_first test_last
  for (( round = 1; round <= rounds; round++ )); do
    # Draw two offsets in [0, slack] and order them. Shifting the larger one
    # by a full window guarantees the windows never overlap and never run past
    # the shortest file, so no retry is needed.
    lo=$(( RANDOM % (slack + 1) ))
    hi=$(( RANDOM % (slack + 1) ))
    if (( lo > hi )); then
      tmp=$lo
      lo=$hi
      hi=$tmp
    fi

    train_first=$(( skip + 1 + lo ))
    train_last=$(( train_first + wsize - 1 ))
    test_first=$(( skip + 1 + wsize + hi ))
    test_last=$(( test_first + wsize - 1 ))

    out="training-$round-$train_first-$train_last.log"
    extract_window "$out" "$train_first" "$train_last" "${files[@]}" || return 1
    printf '%s\n' "$out"

    out="test-$round-$test_first-$test_last.log"
    extract_window "$out" "$test_first" "$test_last" "${files[@]}" || return 1
    printf '%s\n' "$out"
  done
}

# Must stay the last command so that the script's exit status is main's status.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment