Created
December 9, 2015 19:48
-
-
Save furkantektas/96f877495d0665115123 to your computer and use it in GitHub Desktop.
Test and training data generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Test and training data generator.
#
# Builds separate training and test datasets from every data*.log file found
# below the current directory. For each pair, two non-overlapping windows of
# consecutive lines are chosen at random; the same two line ranges are then
# extracted from every data file and concatenated into one training file
# (earlier window) and one test file (later window).
#
# Usage: <script> [window_size [pair_count]]
#   window_size  number of lines taken from each data file per window
#                (default: 100)
#   pair_count   number of training/test file pairs to create (default: 20)
#
# Output: training-<n>-<first>-<last>.log and test-<n>-<first>-<last>.log in
# the current directory, where <first>/<last> are the inclusive line numbers
# of the window. Each file name is printed on stdout once it is written.
#
# Exit status:
#   0  all pairs were written
#   1  bad arguments, no data*.log files, or a read/write failure
#   2  the shortest data file is too short for two windows
#
# Requires bash (uses $RANDOM, arrays and process substitution).
# `set -e` is deliberately not used: every failure is reported through main's
# return status, which becomes the exit status because the call to main is
# the last command of the script.

# Number of leading lines in each data file that are never sampled.
readonly HEADER_LINES=2

# Prints its arguments as one diagnostic line on stderr.
err() {
  printf '%s\n' "$*" >&2
}

# Stores a random integer from 0 to $2 - 1 in the variable named by $1.
# Two $RANDOM draws are combined because a single draw only covers 0..32767,
# which is not enough for long data files. $2 must be between 1 and 2^30.
# The result is written with printf -v so that callers need no subshell.
rand_below() {
  printf -v "$1" '%d' "$(( ((RANDOM << 15) | RANDOM) % $2 ))"
}

# Writes lines $1 through $2 (inclusive) of every following file argument to
# stdout, in argument order. Returns 1 as soon as one file cannot be read.
extract_window() {
  local first=$1 last=$2 file
  shift 2
  for file in "$@"; do
    sed -n "${first},${last}p" "$file" || return 1
  done
}

# Entry point; see the header comment for arguments and return status.
main() {
  local wsize=${1:-100} pairs=${2:-20}
  local -a files=()
  local file count shortest=-1

  # Reject zero, negative, non-numeric and zero-padded (octal-looking) values.
  if ! [[ $wsize =~ ^[1-9][0-9]*$ && $pairs =~ ^[1-9][0-9]*$ ]]; then
    err "usage: ${0##*/} [window_size [pair_count]] (positive integers)"
    return 1
  fi

  # Collect the inputs once, NUL-delimited, so that paths containing
  # whitespace survive and both outputs of a pair read the files in the
  # same order.
  while IFS= read -r -d '' file; do
    files+=("$file")
  done < <(find . -name 'data*.log' -print0)

  if (( ${#files[@]} == 0 )); then
    err "No data*.log files found"
    return 1
  fi

  # Windows must exist in every file, so the shortest one sets the limit.
  for file in "${files[@]}"; do
    count=$(wc -l < "$file") || return 1
    count=$(( count )) # strips the padding some wc implementations print
    if (( shortest < 0 || count < shortest )); then
      shortest=$count
    fi
  done

  # Lines left over once the header and both windows are accounted for.
  local slack=$(( shortest - HEADER_LINES - 2 * wsize ))
  if (( slack < 0 )); then
    err "Shortest data file has $shortest lines; need at least" \
      "$(( HEADER_LINES + 2 * wsize )) for two windows of $wsize lines"
    return 2
  fi

  local iter x y swap train_first train_last test_first test_last out
  for (( iter = 1; iter <= pairs; iter++ )); do
    # Draw two offsets into the slack and order them. Shifting the later
    # window by a full window length guarantees the two ranges never
    # overlap and never run past the end of the shortest file, so no
    # retry is needed.
    rand_below x "$(( slack + 1 ))"
    rand_below y "$(( slack + 1 ))"
    if (( x > y )); then
      swap=$x
      x=$y
      y=$swap
    fi
    train_first=$(( HEADER_LINES + 1 + x ))
    train_last=$(( train_first + wsize - 1 ))
    test_first=$(( HEADER_LINES + 1 + y + wsize ))
    test_last=$(( test_first + wsize - 1 ))

    # A single truncating redirect per output file, so a same-named file
    # left over from an earlier run is replaced rather than appended to.
    out="training-$iter-$train_first-$train_last.log"
    extract_window "$train_first" "$train_last" "${files[@]}" > "$out" \
      || return 1
    printf '%s\n' "$out"

    out="test-$iter-$test_first-$test_last.log"
    extract_window "$test_first" "$test_last" "${files[@]}" > "$out" \
      || return 1
    printf '%s\n' "$out"
  done
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment