Created
June 28, 2012 14:40
-
-
Save kmizumar/3011731 to your computer and use it in GitHub Desktop.
text file splitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env gosh | |
(use gauche.parseopt) | |
(use math.mt-random) | |
(use util.match) | |
;; Print every argument on its own line to the current output port.
(define (p . items)
  (for-each (^[item] (print item)) items))
;; Print the command-line help text (synopsis plus a worked example)
;; to the current output port, then terminate the process with exit
;; status 0.  This procedure never returns.
(define (usage)
  (let ([synopsis (format #f "Usage: ~a srcfile ratio trainfile testfile" *program-name*)]
        [sample   (format #f " % ~a data.csv 0.7 train.csv test.csv" *program-name*)])
    (for-each print
              (list synopsis
                    "Example"
                    sample
                    " will split data.csv into train.csv and test.csv"
                    " 70% of lines will be kept in train.csv and the rest 30% will be kept in test.csv"
                    " lines are selected randomly"))
    (exit 0)))
;; Distribute the lines read from input port IN between two output ports.
;;
;;   in    - input port, consumed line by line until EOF
;;   ratio - number compared against a fresh random real for each line;
;;           the line goes to OUT0 when the random value is below RATIO,
;;           otherwise to OUT1
;;   out0  - output port receiving the "selected" lines
;;   out1  - output port receiving the remaining lines
;;
;; Each line is written followed by a newline.  The random generator is
;; a Mersenne Twister seeded from the current system time.  Both output
;; ports are closed once the input is exhausted.
(define (split-file in ratio out0 out1)
  (let ([rng (make <mersenne-twister> :seed (sys-time))])
    (let loop ([line (read-line in)])
      (unless (eof-object? line)
        (let1 dest (if (< (mt-random-real rng) ratio) out0 out1)
          (format dest "~a~%" line))
        (loop (read-line in)))))
  (close-output-port out0)
  (close-output-port out1))
;; Entry point | |
;; Script entry point.
;;
;; ARGS is the full command line (program name first).  After option
;; parsing, exactly four positional arguments are expected:
;;   srcfile ratio trainfile testfile
;; Any other argument count, `-h`/`--help`, or an unknown option prints
;; the usage text (which exits the process).
;;
;; RATIO must parse as a real number between 0 and 1 inclusive;
;; otherwise a diagnostic is written to the error port and the process
;; exits with status 1 before any file is opened.
;;
;; Returns 0 on success so that gosh reports a successful exit status.
(define (main args)
  (let-args (cdr args)
      ([#f "h|help" => usage]
       [else (opt . _) (print "Unknown option : " opt) (usage)]
       . args)
    (match args
      [(src ratio train test)
       ;; string->number yields #f for unparsable input, unlike
       ;; x->number which would silently turn it into 0.
       (let1 r (string->number ratio)
         (unless (and r (real? r) (<= 0 r 1))
           (format (current-error-port)
                   "Invalid ratio: ~a (expected a number between 0 and 1)~%"
                   ratio)
           (exit 1))
         (call-with-input-file src
           (^[port]
             ;; Output files are opened only after the source file has
             ;; been opened successfully; split-file closes them.
             (split-file port r
                         (open-output-file train :if-exists :supersede)
                         (open-output-file test :if-exists :supersede)))
           :if-does-not-exist :error)
         ;; Explicit integer result: a non-integer return value from
         ;; main would be reported as a failure exit status.
         0)]
      [else (usage)])))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment