Skip to content

Instantly share code, notes, and snippets.

@mreid
Created July 21, 2009 04:51
Show Gist options
  • Save mreid/151116 to your computer and use it in GitHub Desktop.
Save mreid/151116 to your computer and use it in GitHub Desktop.
Perl script for doing cross validation
#!/usr/bin/perl -w
# -*-perl-*-
# $Id: crossval,v 1.1.1.1 2003/04/02 02:06:05 mreid Exp $
#
# NAME
# crossval - Run Aleph on a collection of cross validation sets.
#
# SYNOPSIS
# crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]
# EXP_BASE BACKGROUND DATA_PATTERN
# [SETTINGS_FILE] [COMMAND_FILE]
# SETUP
# These are the requirements (or at least what I have used to get this
# script to work):
# Perl V5.6.0 or later and the Getopt::Long and File::Basename modules
# These modules are pretty standard so you probably don't have to worry
# about them if you have Perl installed.
#
# Aleph Version 3 with Stasinos Konstantopoulos's [[email protected]]
# cross-validation patch and my write_rules patch.
#
# Yap 4.3.19 or higher (requires the yap -L option)
#
# USAGE
# Runs a series of Aleph sessions, with input and output stored in files
# beginning with EXP_BASE. The background file for the task is given by
# BACKGROUND (without the '.b' extension) and the examples are contained in
# any file matching the regular expression in DATA_PATTERN and ending with
# '.f' or '.n'. For example,
# crossval experiments/exp1/test ../data/mutagenesis ../data/folds/s.*
# Will run Aleph using '../data/mutagenesis.b' as the background file,
# and the cross-validation set in '../data/folds/' that matches the pattern
# 's.*\.[fn]'.
#
# That is, if the '../data/folds/' directory contains these files:
# split1.f, split1.n, split2.f, split2.n, ... , split10.f, split10.n
# Then there will be 10 Aleph sessions run, each with a different splitN.f
# and splitN.n file held out. The results of each run will be found in the
# 'experiments/exp1/' directory named
# 'test1.out', ..., 'test10.out'.
# Any of these experiments can be rerun later by executing the files
# 'test1.in', ..., 'test10.in'.
#
# The Aleph sessions are run by building a Yap script that looks like this:
# #!/wherever/you/put/yap -L
# ... some initialization commands ...
#
# :- consult('the aleph.pl file').
# :- read_all('background', ['example base 1', ..., 'example base k']).
#
# % SETTINGS
# :- set(rulefile, 'exp. base.rules').
# ... settings from SETTINGS_FILE if given ...
# ... settings from --set options ...
# ... settings from --param options ...
#
# % COMMANDS
# ... commands from the COMMAND_FILE or the commands below ...
# :- induce.
# :- write_rules.
#
# OPTIONS
# -a, --aleph ALEPH
# ALEPH is the location of the file containing Aleph.
# This overwrites the value stored in the ALEPH environment variable.
# -y, --yap YAP
# YAP is the location of the Yap executable.
# This overwrites the value stored in the YAP environment variable.
# --set ALEPH_SETTING
# ALEPH_SETTING is of the form setting=value. This is turned into
# a "set(setting,value)" command which is run after Aleph is loaded.
# Several settings can be made by repeating this option, eg:
# --set nodes=500 --set clauselength=3
# These will appear after (and hence overwrite) any settings in loaded
# background files or the SETTINGS_FILE.
# --param ALEPH_SETTING
# Exactly the same as --set except that the parameter is added to the
# run's filename so it can be processed later with makeplot.
# These overwrite any settings made by --set.
#
# ENVIRONMENT VARIABLES
# ALEPH
# If set, this variable's value is used as the location of the
# 'aleph.pl' file.
# YAP
# If set, this variable's value is used as the location of the
# Yap executable.
#
# CYGWIN_ROOT
# If you are using Cygwin, you must set this variable to the base
# of your Cygwin installation (eg, mine is C:/cygwin). This is
# required as Yap for Cygwin does not use the Cygwin .dll to resolve
# filenames.
#
# If you are running on Unix or Linux make sure this is not set.
#
# Note: The location of the 'aleph.pl' file and the Yap executable must either
# be set through these variables or given through the command line options above.
#
# AUTHOR
# Mark Reid
# [email protected]
#
# CHANGELOG
# 29th Sept 2002
# Fixed empty :- lines added when using command and settings files.
# Single --param was not placed into filename.
# 2nd Oct 2002
# Added --debug option.
use strict;
use Getopt::Long;
use File::Basename;
# Usage string, shown when the command line cannot be parsed or when one of
# the required positional arguments is missing.
my $SYNOPSIS
    = "crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]"
    ."\n EXP_BASE BACKGROUND DATA"
    ."\n [SETTINGS_FILE] [COMMAND_FILE]";

# Yap under Cygwin does not understand absolute Cygwin paths, so a prefix
# is grabbed from the environment to say how to convert /some/path to
# C:/cygwin/some/path.
# The header of this file documents the variable as CYGWIN_ROOT, whereas the
# script historically read CYGROOT. Both names are honoured (CYGWIN_ROOT
# takes precedence) so that either setup keeps working.
my $CYGWIN_ROOT = defined $ENV{'CYGWIN_ROOT'} ? $ENV{'CYGWIN_ROOT'}
                : defined $ENV{'CYGROOT'}     ? $ENV{'CYGROOT'}
                :                               "";

# Aleph settings given with --set (%settings) and --param (%params), each
# keyed by setting name.
my %settings;
my %params;

# Defaults for the remaining options; ALEPH and YAP fall back to the
# environment variables of the same name.
my $PARALLEL = 0;
my $ALEPH    = (defined $ENV{'ALEPH'} ? $ENV{'ALEPH'} : "");
my $YAP      = (defined $ENV{'YAP'}   ? $ENV{'YAP'}   : "");
my $DEBUG    = 0;

# GetOptions returns false when it meets an unknown or malformed option.
# Stop with the usage message in that case rather than silently running an
# experiment with settings the user did not intend.
GetOptions
    (
     'param:s'  => \%params,
     'set:s'    => \%settings,
     'parallel' => \$PARALLEL,
     'aleph:s'  => \$ALEPH,
     'yap:s'    => \$YAP,
     'debug'    => \$DEBUG
    ) || die "USAGE: $SYNOPSIS\n";

# We must know where Aleph and Yap are
($ALEPH && $YAP) || die "Cannot run crossval without ALEPH and YAP!\n";

# Whatever is left on the command line after option parsing are the
# positional arguments; the last two are optional.
my ($experiment, $background, $data, $settings, $commands) = @ARGV;

# EXP_BASE, BACKGROUND and DATA_PATTERN are required
if (! $experiment || ! $background || ! $data) {
    die "USAGE: $SYNOPSIS\n";
}
# Print header
print "--------------------------------------\n";
print "Experiment started at: ".localtime()."\n";
print "Yap: $YAP\nAleph: $ALEPH\n";

# Build a list of data basenames from the pattern plus .f or .n extensions
$DEBUG && print "DEBUG -- File Base: '$data'.\n";
my @datafiles = @{getFiles($data.'\.[fn]')};
$DEBUG && print "DEBUG -- Files:\n".join("\n\t",@datafiles)."\n";

# Each fold is identified by the path of its '.f' file with the extension
# removed. The match is anchored at both ends so that only names that really
# end in '.f' contribute a base: an unanchored match would also fire on a
# '.n' file living under a directory such as 'my.folds/', or on a stray
# 'split1.f.bak', producing bogus or duplicate folds.
my @databases;
foreach my $file (@datafiles) {
    my ($base) = $file =~ /\A(.*)\.f\z/s;
    next unless defined($base);
    # If the file base has an absolute position, prepend it with the cygwin root.
    if ($base =~ m{^/}) { $base = $CYGWIN_ROOT.$base; }
    push @databases, $base;
}

# With no folds the loop over runs has nothing to do; say so instead of
# reporting a seemingly successful (but empty) experiment.
@databases
    || warn "Warning: no '.f' example files matched the pattern '$data'; no runs will be made.\n";
my @exp_params;   # Parameters are settings that are recorded in the filename
                  # (currently unused: parameters are pushed onto @exp_settings).
my @exp_settings; # Settings are not recorded in the filename.
my @exp_commands; # These are the commands to be run after the settings and params.

# Add settings from settings file. These will override any settings in the
# $background file.
if ($settings) {
    open(my $settings_fh, '<', $settings)
        || die "Could not read settings from '$settings': $!";
    print "Settings File: '$settings'.\n";
    push @exp_settings, "% Settings: $settings";
    while (my $line = <$settings_fh>) {
        # Strip the line ending (LF or CRLF) and skip blank lines, which
        # would otherwise turn into empty ':- ' directives in the script.
        $line =~ s/\r?\n\z//;
        push @exp_settings, $line if $line =~ /\S/;
    }
    close($settings_fh);
}

# This is the same as below but the settings are not saved in the filename.
# Keys are visited in sorted order so the generated script is reproducible.
foreach my $param (sort keys %settings) {
    my $paramval = $settings{$param};
    print "Using setting '$param' with value '$paramval'\n";
    push @exp_settings, "set($param,$paramval).";
}

# Add parameters given on the command line. These override settings given in the
# $background file and in the $settings file. Parameters are just settings that
# are stored in the file name for processing by makeplot.
# The keys are sorted because hash order is unspecified: without sorting, the
# same set of --param options could yield a different file name on each run.
my @baseexts = ();
foreach my $param (sort keys %params) {
    my $paramval = $params{$param};
    print "Using parameter '$param' with value '$paramval'\n";
    push @exp_settings, "set($param,$paramval).";
    push @baseexts, $param."_".$paramval;
}

# Add any parameters to the experiments base name.
my $expbase = (@baseexts ? $experiment."--".join("-", @baseexts)."--" : $experiment);

# If a commands file was specified, add it to the list of commands, otherwise
# just add "induce." and "write_rules.". Always show the settings at the start
# of the run once all the settings have been made.
push @exp_commands, "show(settings).";
if ($commands) {
    open(my $commands_fh, '<', $commands)
        || die "Could not open commands file: '$commands': $!";
    print "Commands File: '$commands'.\n";
    push @exp_commands, "% From commands file: $commands";
    while (my $line = <$commands_fh>) {
        # Same line handling as for the settings file above.
        $line =~ s/\r?\n\z//;
        push @exp_commands, $line if $line =~ /\S/;
    }
    close($commands_fh);
} else {
    push @exp_commands, "induce.";
    push @exp_commands, "write_rules.";
}
# Run all of the cross validations
print basename($expbase).":\n";
my %runs = %{buildruns(\@databases)};
my $runnum = 0;

# Hold-out sets are visited in sorted order so that a given run number always
# refers to the same held-out fold when the experiment is repeated.
foreach my $holdout (sort keys %runs) {
    $runnum++;
    my $runname   = $expbase.$runnum;
    my $outfile   = $runname.".out";
    my $rulesfile = $runname.".rules";
    my @run       = @{$runs{$holdout}};

    # These need to be run to ensure the enviroment is okay and all the files
    # are loaded in.
    my $pwd = $CYGWIN_ROOT.$ENV{'PWD'};
    my @run_settings =
        (
         "path(_P).",            # This is here to get around weird path bug
         "add_to_path('$pwd').", # So we can consult $background
         "consult('$ALEPH').",
         "cd('$pwd').",          # So we can read in the example files
         "read_all('$background', [".join(',', map("'$_'", @run))."]).",
         "set(record, true).",
         "set(recordfile, '$outfile').",
         "set(test_pos, '$holdout.f').",
         "set(test_neg, '$holdout.n').",
         "set(experiment, '$runname').",
         "set(rulefile, '$rulesfile')."
        );

    # Write the commands to an .in file in this order:
    # run_settings, exp_settings, exp_commands
    my $infilename = $runname.".in";
    open(my $in_fh, '>', $infilename)
        || die "Could not open '$infilename' for writing!\n";
    print {$in_fh} "#!$YAP -L \n";
    print {$in_fh} "# BUILT: ".localtime()."\n";

    # Every entry becomes a ':- ' directive, except lines starting with '%',
    # which are copied through unchanged.
    my @commands = map { /^%/ ? $_ : ':- '.$_ }
                   (@run_settings, @exp_settings, @exp_commands);
    print {$in_fh} join("\n", @commands)."\n";
    # Buffered write errors only surface at close, so check it.
    close($in_fh) || die "Could not finish writing '$infilename': $!\n";

    # Delete any old .out files that may be lying around so record doesn't append to it.
    if (-f $outfile) {
        unlink($outfile) || warn "Could not remove old '$outfile': $!\n";
    }

    # Add the execute bits to the script's current mode (like 'chmod +x')
    # without passing the file name through a shell.
    my $mode = (stat($infilename))[2];
    (defined($mode) && chmod(($mode & 07777) | 0111, $infilename))
        || warn "Could not make '$infilename' executable: $!\n";

    # Run the experiment
    my $time = time();
    print "\tRun ".$runnum."...";

    # A name without a directory part would be searched for on PATH by the
    # shell, so anchor it to the current directory. It is single-quoted so
    # that spaces or shell metacharacters in the path are taken literally.
    my $script = ($infilename =~ m{/}) ? $infilename : "./".$infilename;
    (my $quoted = $script) =~ s/'/'\\''/g;
    my $command = ($PARALLEL ? "enqueue " : "")."'".$quoted."'".' > /dev/null 2>&1';
    my $status  = system($command);

    print $PARALLEL ? "enqueued." : "completed!";
    $time = time() - $time;
    print " (Time: $time s)\n";

    # All output of the run goes to /dev/null, so the wait status is the only
    # sign that something went wrong; report it on STDERR.
    if ($status != 0) {
        warn "Warning: run $runnum ('$infilename') did not exit cleanly (wait status $status).\n";
    }
}
print "Experiment completed at: ".localtime()."\n";
# getFiles
# Read in all the files matching the given pattern.
#
# The argument is a path whose last component is a regular expression; the
# leading components name the directory to search. Every plain file in that
# directory whose name matches the expression (anchored at the start of the
# name only) is returned with the directory prepended.
#
# Returns a reference to the list of matching paths, sorted by name so the
# result does not depend on the order in which the file system lists entries.
# Dies if the directory cannot be opened.
sub getFiles {
    my ($filepattern) = @_;
    my $dir     = dirname($filepattern);
    my $name_re = basename($filepattern);

    # A lexical handle keeps this sub from clobbering a global DIR handle.
    opendir(my $dh, $dir) || die "Could not open directory '$dir': $!\n";
    my @entries = readdir($dh);
    closedir($dh);

    my $pattern = qr/$name_re/;
    my @matches = grep { /^$pattern/ && -f "$dir/$_" } @entries;
    my @files   = map { $dir."/".$_ } sort @matches;
    return \@files;
}
# buildruns
# Creates a hash of basenames to lists where the lists are a collection of datasets
# to be used in a cross validation run. The basename for each list is for the file
# that was held out.
#
# Takes a reference to the list of dataset basenames and returns a reference
# to a hash that maps each basename to a reference to the list of all the
# other basenames, in their original order.
sub buildruns {
    my ($dbref) = @_;
    my @dbs = @{$dbref};
    my %runs;
    foreach my $holdout (@dbs) {
        # Training files are all those files that are not the holdout file.
        # Compare as plain strings: the basenames are file paths, and treating
        # one as a regular expression makes '.' match any character and breaks
        # on characters such as '+' or '('.
        my @currdbs = grep { $_ ne $holdout } @dbs;
        $runs{$holdout} = \@currdbs;
    }
    return \%runs;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment