Created
July 21, 2009 04:51
-
-
Save mreid/151116 to your computer and use it in GitHub Desktop.
Bash script for doing cross validation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# -*-perl-*- | |
# $Id: crossval,v 1.1.1.1 2003/04/02 02:06:05 mreid Exp $ | |
# | |
# NAME | |
# crossval - Run Aleph on a collection of cross validation sets. | |
# | |
# SYNOPSIS | |
# crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING] | |
# EXP_BASE BACKGROUND DATA_PATTERN | |
# [SETTINGS_FILE] [COMMAND_FILE] | |
# SETUP | |
# These are the requirements (or at least what I have used to get this | |
# script to work): | |
# Perl V5.6.0 or later and the Getopt::Long and File::Basename modules | |
# These modules are pretty standard so you probably don't have to worry | |
# about them if you have Perl installed. | |
# | |
# Aleph Version 3 with Stasinos Konstantopoulos's [[email protected]] | |
# cross-validation patch and my write_rules patch. | |
# | |
# Yap 4.3.19 or higher (requires the yap -L option) | |
# | |
# USAGE | |
# Runs a series of Aleph sessions, with input and output stored in files
# beginning with EXP_BASE. The background file for the task is given by | |
# BACKGROUND (without the '.b' extension) and the examples are contained in | |
# any file matching the regular expression in DATA_PATTERN and ending with | |
# '.f' or '.n'. For example, | |
# crossval experiments/exp1/test ../data/mutagenesis ../data/folds/s.* | |
# Will run Aleph using '../data/mutagenesis.b' as the background file, | |
# and the cross-validation set in '../data/folds/' that matches the pattern | |
# 's.*\.[fn]'. | |
# | |
# That is, if the '../data/folds/' directory contains these files: | |
# split1.f, split1.n, split2.f, split2.n, ... , split10.f, split10.n | |
# Then there will be 10 Aleph sessions run, each with a different splitN.f | |
# and splitN.n file held out. The results of each run will be found in the | |
# 'experiments/exp1/' directory named | |
# 'test1.out', ..., 'test10.out'. | |
# Any of these experiments can be rerun later by executing the files | |
# 'test1.in', ..., 'test10.in'. | |
# | |
# The Aleph sessions are run by building a Yap script that looks like this: | |
# #!/wherever/you/put/yap -L | |
# ... some initialization commands ... | |
# | |
# :- consult('the aleph.pl file'). | |
# :- read_all('background', ['example base 1', ..., 'example base k']). | |
# | |
# % SETTINGS | |
# :- set(rulefile, 'exp. base.rules').
# ... settings from SETTING_FILE if given ... | |
# ... settings from --set options ... | |
# ... settings from --param options ...
# | |
# % COMMANDS | |
# ... commands from the COMMANDS_FILE or the commands below ... | |
# :- induce. | |
# :- write_rules. | |
# | |
# OPTIONS | |
# -a, --aleph ALEPH | |
# ALEPH is the location of the file containing Aleph. | |
# This overwrites the value stored in the ALEPH environment variable. | |
# -y , --yap YAP | |
# YAP is the location of the Yap executable. | |
# This overwrites the value stored in the YAP environment variable. | |
# --set ALEPH_SETTING | |
# ALEPH_SETTING is of the form setting=value. This is turned into | |
# a "set(setting,value)" command which is run after Aleph is loaded. | |
# Several settings can be made by repeating this option, eg: | |
# --set nodes=500 --set clauselength=3 | |
# These will appear after (and hence overwrite) any settings in loaded | |
# background files or the SETTINGS_FILE. | |
# --param ALEPH_SETTING | |
# Exactly the same as --set except that the parameter is added to the
# run's filename so it can be processed later with makeplot. | |
# These overwrite any settings made by --set. | |
# | |
# ENVIRONMENT VARIABLES | |
# ALEPH | |
# If set, this variable's value is used as the location of the | |
# 'aleph.pl' file. | |
# YAP | |
# If set, this variable's value is used as the location of the | |
# Yap executable. | |
# | |
# CYGWIN_ROOT | |
# If you are using Cygwin, you must set this variable to the base | |
# of your Cygwin installation (eg, mine is C:/cygwin). This is | |
# required as Yap for Cygwin does not use the Cygwin .dll to resolve | |
# filenames. | |
# | |
# If you are running on Unix or Linux make sure this is not set. | |
# | |
# Note: The location of the 'aleph.pl' file and the Yap executable must either | |
# be set through these variables or given through the command line options above. | |
# | |
# AUTHOR | |
# Mark Reid | |
# [email protected] | |
# | |
# CHANGELOG | |
# 29th Sept 2002 | |
# Fixed empty :- lines added when using command and settings files. | |
# Single --param was not placed into filename. | |
# 2nd Oct 2002 | |
# Added --debug option. | |
use strict;
use Cwd qw(getcwd);
use File::Basename;
use Getopt::Long;
# Usage string, printed whenever the command line cannot be understood.
my $SYNOPSIS
    = "crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]"
    ."\n EXP_BASE BACKGROUND DATA"
    ."\n [SETTINGS_FILE] [COMMAND_FILE]";

# Yap under Cygwin does not understand absolute Cygwin paths, so a prefix
# is grabbed from the environment to say how to convert /some/path to
# C:/cygwin/some/path.
# The header documentation names this variable CYGWIN_ROOT, while the script
# has always read CYGROOT; accept both, preferring the documented name, so
# that neither existing setups nor readers of the documentation are surprised.
my $CYGWIN_ROOT = "";
foreach my $name ('CYGWIN_ROOT', 'CYGROOT') {
    if (defined $ENV{$name} && $ENV{$name} ne "") {
        $CYGWIN_ROOT = $ENV{$name};
        last;
    }
}

# Aleph settings given with --set (setting => value).
my %settings;
# Aleph settings given with --param (setting => value); these are also
# recorded in the experiment's file names.
my %params;
# When true, each run is handed to the 'enqueue' command instead of being
# executed directly.
my $PARALLEL = 0;
# Locations of aleph.pl and the Yap executable; the environment provides the
# defaults and the command line options below override them.
my $ALEPH = (defined $ENV{'ALEPH'} ? $ENV{'ALEPH'} : "");
my $YAP = (defined $ENV{'YAP'} ? $ENV{'YAP'} : "");
# When true, extra DEBUG lines are printed.
my $DEBUG = 0;

# GetOptions returns false when it meets an unknown or malformed option;
# stop with the usage message rather than running with a half-parsed
# command line.
GetOptions
    (
     'param:s'  => \%params,
     'set:s'    => \%settings,
     'parallel' => \$PARALLEL,
     'aleph:s'  => \$ALEPH,
     'yap:s'    => \$YAP,
     'debug'    => \$DEBUG
    )
    || die "USAGE: $SYNOPSIS\n";
# We must know where Aleph and Yap are
($ALEPH && $YAP) || die "Cannot run crossval without ALEPH and YAP!\n";

# Positional arguments; SETTINGS_FILE and COMMAND_FILE are optional.
my ($experiment, $background, $data, $settings, $commands) = @ARGV;

# EXP_BASE, BACKGROUND and DATA_PATTERN are required
if(! $experiment || ! $background || ! $data) {
    die "USAGE: $SYNOPSIS\n";
}

# Print header
print "--------------------------------------\n";
print "Experiment started at: ".localtime()."\n";
print "Yap: $YAP\nAleph: $ALEPH\n";

# Build a list of data basenames from the pattern plus .f or .n extensions
$DEBUG && print "DEBUG -- File Base: '$data'.\n";
my @datafiles = @{getFiles($data.'\.[fn]')};
$DEBUG && print "DEBUG -- Files:\n".join("\n\t",@datafiles)."\n";

# Each data set is identified by the name of its '.f' file with the extension
# removed. The match is anchored at both ends so that names such as
# 'split1.f.bak' or 'split1.foo' are not mistaken for a '.f' file, and each
# base is recorded only once so that no data set is used twice.
my @databases;
my %seen_base;
foreach my $file (@datafiles) {
    my ($base) = $file =~ /\A(.*)\.f\z/s;
    next unless defined($base);
    # If the file base has an absolute position, prepend it with the cygwin root.
    if($base =~ /^\//) { $base = $CYGWIN_ROOT.$base; }
    next if $seen_base{$base}++;
    push @databases, $base;
}

# Without any data sets there is nothing to cross-validate; say so instead of
# silently reporting a completed experiment with zero runs.
@databases || die "No example files matching '".$data.'\.[fn]'."' were found.\n";
my @exp_params;   # Parameters are settings that are recorded in the filename
my @exp_settings; # Settings are not recorded in the filename.
my @exp_commands; # These are the commands to be run after the settings and params.

# Add settings from settings file. These will override any settings in the
# $background file.
if($settings) {
    open(my $settings_fh, '<', $settings)
        || die "Could not read settings from '$settings': $!";
    print "Settings File: '$settings'.\n";
    push @exp_settings, "% Settings: $settings";
    while(my $line = <$settings_fh>) {
        # Strip the line ending (including a DOS carriage return) and skip
        # blank lines so that no empty ':- ' directives are generated.
        $line =~ s/\r?\n\z//;
        next unless $line =~ /\S/;
        push @exp_settings, $line;
    }
    close($settings_fh);
}

# This is the same as below but the settings are not saved in the filename.
# Keys are visited in sorted order so the generated scripts are reproducible.
foreach my $param (sort keys %settings) {
    my $paramval = $settings{$param};
    print "Using setting '$param' with value '$paramval'\n";
    push @exp_settings, "set($param,$paramval).";
}

# Add parameters given on the command line. These override settings given in the
# $background file and in the $settings file. Parameters are just settings that
# are stored in the file name for processing by makeplot.
# Keys are visited in sorted order: the order decides the file name, and Perl's
# hash order is not stable from one invocation to the next.
my @baseexts = ();
foreach my $param (sort keys %params) {
    my $paramval = $params{$param};
    print "Using parameter '$param' with value '$paramval'\n";
    push @exp_settings, "set($param,$paramval).";
    push @baseexts, $param."_".$paramval;
}

# Add any parameters to the experiments base name.
my $expbase = (@baseexts ? $experiment."--".join("-", @baseexts)."--" : $experiment);

# If a commands file was specified, add it to the list of commands, otherwise
# just add "induce.". Always show the settings at the start of the run once all the
# settings have been made.
push @exp_commands, "show(settings).";
if($commands) {
    open(my $commands_fh, '<', $commands)
        || die "Could not open commands file: '$commands': $!";
    print "Commands File: '$commands'.\n";
    push @exp_commands, "% From commands file: $commands";
    while(my $line = <$commands_fh>) {
        # Same clean-up as for the settings file.
        $line =~ s/\r?\n\z//;
        next unless $line =~ /\S/;
        push @exp_commands, $line;
    }
    close($commands_fh);
} else {
    push @exp_commands, "induce.";
    push @exp_commands, "write_rules.";
}
# Run all of the cross validations
print basename($expbase).":\n";
my %runs = %{buildruns(\@databases)};

# Directory handed to Yap so it can find the background and example files.
# $PWD is used when the environment provides it (as the script always did);
# otherwise fall back to asking the system for the working directory.
my $cwd = (defined $ENV{'PWD'} && $ENV{'PWD'} ne '') ? $ENV{'PWD'} : getcwd();
my $pwd = $CYGWIN_ROOT.$cwd;

my $runnum = 0;
# Visit the held-out sets in sorted order so that run N always refers to the
# same hold-out, whatever order the hash happens to return its keys in.
foreach my $holdout (sort keys %runs) {
    $runnum++;
    my $runname   = $expbase.$runnum;
    my $outfile   = $runname.".out";
    my $rulesfile = $runname.".rules";
    my @run       = @{$runs{$holdout}};

    # These need to be run to ensure the enviroment is okay and all the files
    # are loaded in.
    my @run_settings =
        (
         "path(_P).",             # This is here to get around weird path bug
         "add_to_path('$pwd').",  # So we can consult $background
         "consult('$ALEPH').",
         "cd('$pwd').",           # So we can read in the example files
         "read_all('$background', [".join(',', map("'$_'", @run))."]).",
         "set(record, true).",
         "set(recordfile, '$outfile').",
         "set(test_pos, '$holdout.f').",
         "set(test_neg, '$holdout.n').",
         "set(experiment, '$runname').",
         "set(rulefile, '$rulesfile')."
        );

    # Write the commands to an .in file in this order:
    #   run_settings, exp_settings, exp_commands
    # Lines starting with '%' are comments and are copied as they are; every
    # other line becomes a ':- ' directive.
    my $infilename = $runname.".in";
    my @lines = map { /^%/ ? $_ : ':- '.$_ }
                (@run_settings, @exp_settings, @exp_commands);
    open(my $in_fh, '>', $infilename)
        || die "Could not open '$infilename' for writing!\n";
    print {$in_fh} "#!$YAP -L \n";
    print {$in_fh} "# BUILT: ".localtime()."\n";
    print {$in_fh} join("\n", @lines)."\n";
    # Buffered write errors only show up when the file is closed.
    close($in_fh) || die "Could not finish writing '$infilename': $!\n";

    # Delete any old .out files that may be lying around so record doesn't append to it.
    if (-f $outfile) {
        unlink($outfile) || warn "Could not remove old '$outfile': $!\n";
    }

    # Make the script executable: add the execute bits allowed by the umask,
    # which is what 'chmod +x' does, without starting another process.
    my $mode = (stat($infilename))[2];
    if (!defined $mode
        || !chmod(($mode & 07777) | (0111 & ~umask()), $infilename)) {
        warn "Could not make '$infilename' executable: $!\n";
    }

    # Run the experiment
    my $time = time();
    print "\tRun ".$runnum."...";
    # A name without a directory part would be looked up on PATH by the shell,
    # so point explicitly at the current directory. The name is single-quoted
    # so that spaces or shell metacharacters in it are taken literally.
    my $script = ($infilename =~ m{/}) ? $infilename : "./".$infilename;
    (my $quoted = $script) =~ s/'/'\\''/g;
    my $command = ($PARALLEL ? "enqueue " : "")."'".$quoted."'".' > /dev/null 2>&1';
    my $status = system($command);
    print(($PARALLEL ? "enqueued." : "completed!"));
    $time = time() - $time;
    print " (Time: $time s)\n";

    # The progress line above is kept as it always was; a problem starting or
    # running the command is reported separately on STDERR.
    if ($status == -1) {
        warn "Run $runnum: could not start '$script': $!\n";
    } elsif ($status != 0) {
        warn "Run $runnum: '$script' returned status ".($status >> 8)
            ." (raw wait status $status)\n";
    }
}
print "Experiment completed at: ".localtime()."\n";
# getFiles
# Lists the plain files in a directory whose names match a pattern.
#
# Argument: a path whose directory part names the directory to read and whose
#   last component is a regular expression for the file names.
# Returns: a reference to a sorted list of "directory/name" strings.
# Dies if the directory cannot be opened.
#
# The whole file name has to match the expression, so a name that merely
# starts with a match (for example 'split1.f.bak' for 's.*\.[fn]') is left out.
sub getFiles {
    my ($filepattern) = @_;
    my $dir = dirname($filepattern);
    my $file = basename($filepattern);
    # The group keeps the anchors applying to the whole expression even when
    # it contains a top-level alternation.
    my $pattern = qr/\A(?:$file)\z/;

    opendir(my $dh, $dir) || die "Could not open directory '$dir': $!\n";
    my @names = grep { $_ =~ $pattern && -f "$dir/$_" } readdir($dh);
    closedir($dh);

    # readdir returns names in no particular order; sort so that callers see
    # the same list on every run.
    my @files = map { $dir."/".$_ } sort @names;
    return \@files;
}
# buildruns
# Builds the cross validation runs for a collection of data sets.
#
# Argument: a reference to a list of data set basenames.
# Returns: a reference to a hash with one entry per basename. The key is the
#   data set that is held out and the value is a reference to a list of all
#   the other data sets, in their original order, to be used for training.
sub buildruns {
    my ($dbref) = @_;
    my @dbs = @{$dbref};
    my %runs;
    foreach my $holdout (@dbs) {
        # Training files are all those files that are not the holdout file.
        # Names are compared as plain strings: they are file paths, and
        # characters such as '.', '+' or '(' in them must not be read as
        # regular expression syntax.
        my @currdbs = grep { $_ ne $holdout } @dbs;
        $runs{$holdout} = \@currdbs;
    }
    return \%runs;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment