mreid · July 21, 2009 04:51
diff --git a/crossval.sh b/crossval.sh
 #!/usr/bin/perl -w
 # -*-perl-*-
 # $Id: crossval,v 1.1.1.1 2003/04/02 02:06:05 mreid Exp $
 #
 # NAME
 # 		crossval - Run Aleph on a collection of cross validation sets.
 #
 # SYNOPSIS
 #		crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]
 #				 EXP_BASE BACKGROUND DATA_PATTERN
 #				 [SETTINGS_FILE] [COMMAND_FILE]
 # SETUP
 #		These are the requirements (or at least what I have used to get this
 #		script to work):
 #			Perl V5.6.0 or later and the Getopt::Long and File::Basename modules
 #			These modules are pretty standard so you probably don't have to worry
 #			about them if you have Perl installed.
 #
 #			Aleph Version 3 with Stasinos Konstantopoulos's [[email protected]]
 #			cross-validation patch and my write_rules patch.
 #
 #			Yap 4.3.19 or higher (requires the yap -L option)
 #
 # USAGE
 #		Runs a series of Aleph sessions, with input and output stored in files
 #		beginning with EXP_BASE. The background file for the task is given by
 #		BACKGROUND (without the '.b' extension) and the examples are contained in
 #		any file matching the regular expression in DATA_PATTERN and ending with
 #		'.f' or '.n'. For example,
 #			crossval experiments/exp1/test ../data/mutagenesis ../data/folds/s.*
 #		Will run Aleph using '../data/mutagenesis.b' as the background file,
 #		and the cross-validation set in '../data/folds/' that matches the pattern 
 #		's.*\.[fn]'.
 #
 #		That is, if the '../data/folds/' directory contains these files:
 #			split1.f, split1.n, split2.f, split2.n, ... , split10.f, split10.n
 #		Then there will be 10 Aleph sessions run, each with a different splitN.f
 #		and splitN.n file held out. The results of each run will be found in the 
 #		'experiments/exp1/' directory named
 #			'test1.out', ..., 'test10.out'.
 #		Any of these experiments can be rerun later by executing the files 
 #			'test1.in', ..., 'test10.in'.
 #			
 #		The Aleph sessions are run by building a Yap script that looks like this:
 #			#!/wherever/you/put/yap -L
 #			... some initialization commands ...
 #
 #			:- consult('the aleph.pl file').
 #			:- read_all('background', ['example base 1', ..., 'example base k']).
 #			
 #			% SETTINGS
 #			:- set(rulefile, 'exp. base.rules').
 #			... settings from SETTING_FILE if given ...
 #			... settings from --set options ...
 #			... settings from --param options ...
 #
 #			% COMMANDS
 #			... commands from the COMMANDS_FILE or the commands below ...
 #			:- induce.
 #			:- write_rules.
 #
 # OPTIONS
 #		-a, --aleph ALEPH
 #			ALEPH is the location of the file containing Aleph.
 #			This overwrites the value stored in the ALEPH environment variable.
 #		-y , --yap YAP
 #			YAP is the location of the Yap executable.
 #			This overwrites the value stored in the YAP environment variable.
 #		--set ALEPH_SETTING
 #			ALEPH_SETTING is of the form setting=value. This is turned into
 #			a "set(setting,value)" command which is run after Aleph is loaded.
 #			Several settings can be made by repeating this option, eg:
 #				--set nodes=500 --set clauselength=3
 #			These will appear after (and hence overwrite) any settings in loaded
 #			background files or the SETTINGS_FILE.
 #		--param ALEPH_SETTING
 #			Exactly the same as --set except that the parameter is added to the
 #			run's filename so it can be processed later with makeplot.
 #			These overwrite any settings made by --set.
 #
 # ENVIRONMENT VARIABLES
 #		ALEPH
 #			If set, this variable's value is used as the location of the 
 #			'aleph.pl' file.
 #		YAP
 #			If set, this variable's value is used as the location of the 
 #			Yap executable.
 #			
 #		CYGWIN_ROOT
 #			If you are using Cygwin, you must set this variable to the base
 #			of your Cygwin installation (eg, mine is C:/cygwin). This is
 #			required as Yap for Cygwin does not use the Cygwin .dll to resolve
 #			filenames.
 #
 #			If you are running on Unix or Linux make sure this is not set.
 #
 #	Note: The location of the 'aleph.pl' file and the Yap executable must either
 #	be set through these variables or given through the command line options above.
 #			
 # AUTHOR
 # 	Mark Reid
 #	[email protected]
 #
 # CHANGELOG
 #	29th Sept 2002
 #		Fixed empty :- lines added when using command and settings files.
 #		Single --param was not placed into filename.
 #   2nd Oct 2002
 #		Added --debug option.
 use strict;
 use Getopt::Long;
 use File::Basename;

 # Usage string, printed when the options or the required arguments are wrong.
 my $SYNOPSIS
   = "crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]"
    ."\n         EXP_BASE BACKGROUND DATA"
    ."\n         [SETTINGS_FILE] [COMMAND_FILE]";

 # Yap under Cygwin does not understand absolute Cygwin paths, so a prefix
 # is grabbed from the environment to say how to convert /some/path to
 # C:/cygwin/some/path.
 # The header above documents the variable as CYGWIN_ROOT while this script
 # used to read CYGROOT only; the documented name is now preferred and the old
 # one is still honoured so existing setups keep working.
 my $CYGWIN_ROOT = defined $ENV{'CYGWIN_ROOT'} ? $ENV{'CYGWIN_ROOT'}
                 : defined $ENV{'CYGROOT'}     ? $ENV{'CYGROOT'}
                 :                               "";

 # This stores all the Aleph settings
 my %settings;       # --set key=value pairs
 my %params;         # --param key=value pairs (also recorded in the filename)
 my $PARALLEL = 0;   # --parallel: hand each run to 'enqueue' instead of running it
 my $ALEPH = (defined $ENV{'ALEPH'} ? $ENV{'ALEPH'} : "");
 my $YAP = (defined $ENV{'YAP'} ? $ENV{'YAP'} : "");
 my $DEBUG = 0;      # --debug: print the file pattern and the files it matched

 # GetOptions returns false when it met an unknown or malformed option (it has
 # already printed a warning by then); stop instead of running the experiment
 # with options the user did not intend.
 GetOptions
   (
    'param:s'   => \%params,
    'set:s'     => \%settings,
    'parallel'  => \$PARALLEL,
    'aleph:s'   => \$ALEPH,
    'yap:s'     => \$YAP,
    'debug'     => \$DEBUG
   ) || die "USAGE: $SYNOPSIS\n";

 # We must know where Aleph and Yap are
 ($ALEPH && $YAP) || die "Cannot run crossval without ALEPH and YAP!\n";

 # Whatever is left on the command line after option parsing
 my ($experiment, $background, $data, $settings, $commands) = @ARGV;

 # EXP_BASE, BACKGROUND and DATA_PATTERN are required
 if (! $experiment || ! $background || ! $data) {
     die "USAGE: $SYNOPSIS\n";
 }

 # Print header
 print "--------------------------------------\n";
 print "Experiment started at: ".localtime()."\n";
 print "Yap: $YAP\nAleph: $ALEPH\n";

 # Build a list of data basenames from the pattern plus .f or .n extensions.
 # The pattern is anchored at the end so that only names that really end in
 # '.f' or '.n' are picked up (and not, say, 'split1.f.bak').
 $DEBUG && print "DEBUG -- File Base: '$data'.\n";
 my @datafiles = @{getFiles($data.'\.[fn]$')};
 $DEBUG && print "DEBUG -- Files:\n".join("\n\t",@datafiles)."\n";

 # Every fold is identified by the name of its '.f' file without the extension.
 # The match is anchored at both ends: an unanchored /(.*)\.[f]/ would also fire
 # on a '.n' file that merely has '.f' somewhere in its path and produce a base
 # name that does not exist.
 my @databases;
 foreach my $file (@datafiles) {
     my ($base) = $file =~ /^(.*)\.f$/;
     next unless defined($base);

     # If the file base has an absolute position, prepend it with the cygwin root.
     if ($base =~ m{^/}) { $base = $CYGWIN_ROOT.$base; }
     push @databases, $base;
 }

 # Nothing to cross-validate: say so rather than finishing silently.
 @databases || warn "No '.f' example files match the pattern '$data'.\n";

 my @exp_params;     # Parameters are settings that are recorded in the filename
 my @exp_settings;   # Settings are not recorded in the filename.
 my @exp_commands;   # These are the commands to be run after the settings and params.

 # Add settings from settings file. These will override any settings in the
 # $background file. Blank (or whitespace-only) lines are skipped so that no
 # empty ':-' directives end up in the generated script.
 if ($settings) {
     open(my $settings_fh, '<', $settings)
         || die "Could not read settings from '$settings': $!";
     print "Settings File: '$settings'.\n";
     push @exp_settings, "% Settings: $settings";
     while (my $line = <$settings_fh>) {
         chomp $line;
         push @exp_settings, $line if $line =~ /\S/;
     }
     close($settings_fh);
 }

 # This is the same as below but the settings are not saved in the filename.
 # Keys are sorted so the generated script is the same from one invocation to
 # the next (hash order is not stable).
 foreach my $param (sort keys %settings) {
     my $paramval = $settings{$param};

     print "Using setting '$param' with value '$paramval'\n";
     push @exp_settings, "set($param,$paramval).";
 }

 # Add parameters given on the command line. These override settings given in the
 # $background file and in the $settings file. Parameters are just settings that
 # are stored in the file name for processing by makeplot.
 # Sorting matters here: the keys end up in the experiment's file name, which
 # must not change between two invocations with the same parameters.
 my @baseexts = ();
 foreach my $param (sort keys %params) {
     my $paramval = $params{$param};

     print "Using parameter '$param' with value '$paramval'\n";
     push @exp_settings, "set($param,$paramval).";
     push @baseexts, $param."_".$paramval;
 }

 # Add any parameters to the experiments base name.
 my $expbase = (@baseexts ? $experiment."--".join("-", @baseexts)."--" : $experiment);

 # If a commands file was specified, add it to the list of commands, otherwise
 # just add "induce." and "write_rules.". Always show the settings at the start
 # of the run once all the settings have been made.
 push @exp_commands, "show(settings).";
 if ($commands) {
     open(my $commands_fh, '<', $commands)
         || die "Could not open commands file: '$commands': $!";
     print "Commands File: '$commands'.\n";
     push @exp_commands, "% From commands file: $commands";
     while (my $line = <$commands_fh>) {
         chomp $line;
         push @exp_commands, $line if $line =~ /\S/;
     }
     close($commands_fh);
 } else {
     push @exp_commands, "induce.";
     push @exp_commands, "write_rules.";
 }


 # Run all of the cross validations
 print basename($expbase).":\n";
 my %runs = %{buildruns(\@databases)};
 my $runnum = 0;

 # The working directory (with the Cygwin prefix, if any) is the same for
 # every run.
 my $pwd = $CYGWIN_ROOT.$ENV{'PWD'};

 # Held-out folds are visited in sorted order so that a given run number always
 # refers to the same fold.
 foreach my $holdout (sort keys %runs) {
     $runnum++;
     my $runname = $expbase.$runnum;
     my $outfile = $runname.".out";
     my $rulesfile = $runname.".rules";
     my @run = @{$runs{$holdout}};

     # These need to be run to ensure the environment is okay and all the files
     # are loaded in.
     my @run_settings =
       (
        "path(_P).",                # This is here to get around weird path bug
        "add_to_path('$pwd').",     # So we can consult $background
        "consult('$ALEPH').",
        "cd('$pwd').",              # So we can read in the example files
        "read_all('$background', [".join(',', map("'$_'", @run))."]).",
        "set(record, true).",
        "set(recordfile, '$outfile').",
        "set(test_pos, '$holdout.f').",
        "set(test_neg, '$holdout.n').",
        "set(experiment, '$runname').",
        "set(rulefile, '$rulesfile')."
       );

     # Write the commands to an .in file in this order:
     #   run_settings, exp_settings, exp_commands
     # Lines starting with '%' are Prolog comments and are copied as they are;
     # everything else is turned into a ':- ' directive.
     my $infilename = $runname.".in";
     open(my $in_fh, '>', $infilename)
         || die "Could not open '$infilename' for writing: $!\n";
     my @commands = map { /^%/ ? $_ : ':- '.$_ }
                        (@run_settings, @exp_settings, @exp_commands);
     print {$in_fh} "#!$YAP -L \n";
     print {$in_fh} "# BUILT: ".localtime()."\n";
     print {$in_fh} join("\n", @commands)."\n";
     close($in_fh) || die "Could not finish writing '$infilename': $!\n";

     # Delete any old .out files that may be lying around so record doesn't append to it.
     if (-f $outfile) {
         unlink($outfile) || warn "Could not remove old '$outfile': $!\n";
     }

     # Run the experiment
     my $time = time();
     print "\tRun ".$runnum."...";

     # A name without a directory part would be looked up in PATH by the shell,
     # so make it explicitly relative; quote it so unusual characters in the
     # experiment name or parameter values cannot be interpreted by the shell.
     my $script = ($infilename =~ m{/}) ? $infilename : "./$infilename";
     (my $quoted = $script) =~ s/'/'\\''/g;
     my $command = ($PARALLEL ? "enqueue " : "")."'$quoted' > /dev/null 2>&1";

     # List form: no shell is involved in making the script executable.
     system('chmod', '+x', $infilename);
     my $status = system($command);

     print $PARALLEL ? "enqueued." : "completed!";
     $time = time() - $time;
     print " (Time: $time s)\n";

     # The run's own output goes to /dev/null, so a failure would otherwise go
     # unnoticed. Reported on STDERR to leave the normal output unchanged.
     if ($status == -1) {
         warn "Run $runnum: could not start '$script': $!\n";
     } elsif ($status != 0) {
         warn "Run $runnum: '$script' exited with status ".($status >> 8)."\n";
     }
 }

 print "Experiment completed at: ".localtime()."\n";

 # getFiles
 # Read in all the files matching the given pattern.
 #
 # The argument is a path whose last component is a regular expression; the
 # directory part is taken literally. Returns a reference to a list of
 # "directory/name" strings for every plain file in that directory whose name
 # matches the expression at its start. The list is sorted by name, because the
 # order in which readdir hands back entries is not defined and would otherwise
 # make the result differ between machines and runs.
 # Dies if the directory cannot be opened.
 sub getFiles {
     my ($filepattern) = @_;
     my $dir = dirname($filepattern);
     my $file = basename($filepattern);

     opendir(my $dh, $dir) || die "Could not open directory '$dir': $!\n";

     # Group the expression so the anchor applies to all of its alternatives.
     my $pattern = qr/^(?:$file)/;
     my @names = grep { $_ =~ $pattern && -f "$dir/$_" } readdir($dh);
     closedir($dh);

     my @files = map { $dir."/".$_ } sort @names;
     return \@files;
 }

 # buildruns
 # Creates a hash of basenames to lists where the lists are a collection of datasets
 # to be used in a cross validation run. The basename for each list is that of the
 # file that was held out.
 #
 # Takes a reference to the list of dataset basenames and returns a reference to
 # the hash. An empty list gives an empty hash.
 sub buildruns {
     my ($dbref) = @_;
     my @dbs = @{$dbref};

     my %runs;

     foreach my $holdout (@dbs) {
         # Training files are all those files that are not the holdout file.
         # The names are compared as plain strings: they are file paths, so
         # using them as a regular expression would let '.' match any character
         # and would misbehave on characters such as '+' or '('.
         my @currdbs = grep { $_ ne $holdout } @dbs;
         $runs{$holdout} = \@currdbs;
     }

     return \%runs;
 }
	#!/usr/bin/perl -w
	# --perl--
	# $Id: crossval,v 1.1.1.1 2003/04/02 02:06:05 mreid Exp $
	#
	# NAME
	# crossval - Run Aleph on a collection of cross validation sets.
	#
	# SYNOPSIS
	# crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]
	# EXP_BASE BACKGROUND DATA_PATTERN
	# [SETTINGS_FILE] [COMMAND_FILE]
	# SETUP
	# These are the requirements (or at least what I have used to get this
	# script to work):
	# Perl V5.6.0 or later and the Getopt::Long and File::Basename modules
	# These modules are pretty standard so you probably don't have to worry
	# about them if you have Perl installed.
	#
	# Aleph Version 3 with Stasinos Konstantopoulos's [[email protected]]
	# cross-validation patch and my write_rules patch.
	#
	# Yap 4.3.19 or higher (requires the yap -L option)
	#
	# USAGE
	# Runs a series of Aleph sessions, with input and output stored in files
	# beginning with EXP_BASE. The background file for the task is given by
	# BACKGROUND (without the '.b' extension) and the examples are contained in
	# any file matching the regular expression in DATA_PATTERN and ending with
	# '.f' or '.n'. For example,
	# crossval experiments/exp1/test ../data/mutagenesis ../data/folds/s.*
	# Will run Aleph using '../data/mutagenesis.b' as the background file,
	# and the cross-validation set in '../data/folds/' that matches the pattern
	# 's.*\.[fn]'.
	#
	# That is, if the '../data/folds/' directory contains these files:
	# split1.f, split1.n, split2.f, split2.n, ... , split10.f, split10.n
	# Then there will be 10 Aleph sessions run, each with a different splitN.f
	# and splitN.n file held out. The results of each run will be found in the
	# 'experiments/exp1/' directory named
	# 'test1.out', ..., 'test10.out'.
	# Any of these experiments can be rerun later by executing the files
	# 'test1.in', ..., 'test10.in'.
	#
	# The Aleph sessions are run by building a Yap script that looks like this:
	# #!/wherever/you/put/yap -L
	# ... some initialization commands ...
	#
	# :- consult('the aleph.pl file').
	# :- read_all('background', ['example base 1', ..., 'example base k']).
	#
	# % SETTINGS
	# :- set(rulefile, 'exp. base.rules').
	# ... settings from SETTING_FILE if given ...
	# ... settings from --set options ...
	# ... settings from --param options ...
	#
	# % COMMANDS
	# ... commands from the COMMANDS_FILE or the commands below ...
	# :- induce.
	# :- write_rules.
	#
	# OPTIONS
	# -a, --aleph ALEPH
	# ALEPH is the location of the file containing Aleph.
	# This overwrites the value stored in the ALEPH environment variable.
	# -y , --yap YAP
	# YAP is the location of the Yap executable.
	# This overwrites the value stored in the YAP environment variable.
	# --set ALEPH_SETTING
	# ALEPH_SETTING is of the form setting=value. This is turned into
	# a "set(setting,value)" command which is run after Aleph is loaded.
	# Several settings can be made by repeating this option, eg:
	# --set nodes=500 --set clauselength=3
	# These will appear after (and hence overwrite) any settings in loaded
	# background files or the SETTINGS_FILE.
	# --param ALEPH_SETTING
	# Exactly the same as --set except that the parameter is added to the
	# run's filename so it can be processed later with makeplot.
	# These overwrite any settings made by --set.
	#
	# ENVIRONMENT VARIABLES
	# ALEPH
	# If set, this variable's value is used as the location of the
	# 'aleph.pl' file.
	# YAP
	# If set, this variable's value is used as the location of the
	# Yap executable.
	#
	# CYGWIN_ROOT
	# If you are using Cygwin, you must set this variable to the base
	# of your Cygwin installation (eg, mine is C:/cygwin). This is
	# required as Yap for Cygwin does not use the Cygwin .dll to resolve
	# filenames.
	#
	# If you are running on Unix or Linux make sure this is not set.
	#
	# Note: The location of the 'aleph.pl' file and the Yap executable must either
	# be set through these variables or given through the command line options above.
	#
	# AUTHOR
	# Mark Reid
	# [email protected]
	#
	# CHANGELOG
	# 29th Sept 2002
	# Fixed empty :- lines added when using command and settings files.
	# Single --param was not placed into filename.
	# 2nd Oct 2002
	# Added --debug option.
	use strict;
	use Getopt::Long;
	use File::Basename;

	# Usage string, printed when the options or the required arguments are wrong.
	my $SYNOPSIS
	  = "crossval [--aleph ALEPH] [--yap YAP] [--param ALEPH_SETTING] [--set ALEPH_SETTING]"
	   ."\n EXP_BASE BACKGROUND DATA"
	   ."\n [SETTINGS_FILE] [COMMAND_FILE]";

	# Yap under Cygwin does not understand absolute Cygwin paths, so a prefix
	# is grabbed from the environment to say how to convert /some/path to
	# C:/cygwin/some/path.
	# The header above documents the variable as CYGWIN_ROOT while this script
	# used to read CYGROOT only; the documented name is now preferred and the old
	# one is still honoured so existing setups keep working.
	my $CYGWIN_ROOT = defined $ENV{'CYGWIN_ROOT'} ? $ENV{'CYGWIN_ROOT'}
	                : defined $ENV{'CYGROOT'}     ? $ENV{'CYGROOT'}
	                :                               "";

	# This stores all the Aleph settings
	my %settings;       # --set key=value pairs
	my %params;         # --param key=value pairs (also recorded in the filename)
	my $PARALLEL = 0;   # --parallel: hand each run to 'enqueue' instead of running it
	my $ALEPH = (defined $ENV{'ALEPH'} ? $ENV{'ALEPH'} : "");
	my $YAP = (defined $ENV{'YAP'} ? $ENV{'YAP'} : "");
	my $DEBUG = 0;      # --debug: print the file pattern and the files it matched

	# GetOptions returns false when it met an unknown or malformed option (it has
	# already printed a warning by then); stop instead of running the experiment
	# with options the user did not intend.
	GetOptions
	  (
	   'param:s'   => \%params,
	   'set:s'     => \%settings,
	   'parallel'  => \$PARALLEL,
	   'aleph:s'   => \$ALEPH,
	   'yap:s'     => \$YAP,
	   'debug'     => \$DEBUG
	  ) || die "USAGE: $SYNOPSIS\n";

	# We must know where Aleph and Yap are
	($ALEPH && $YAP) || die "Cannot run crossval without ALEPH and YAP!\n";

	# Whatever is left on the command line after option parsing
	my ($experiment, $background, $data, $settings, $commands) = @ARGV;

	# EXP_BASE, BACKGROUND and DATA_PATTERN are required
	if (! $experiment || ! $background || ! $data) {
	    die "USAGE: $SYNOPSIS\n";
	}

	# Print header
	print "--------------------------------------\n";
	print "Experiment started at: ".localtime()."\n";
	print "Yap: $YAP\nAleph: $ALEPH\n";

	# Build a list of data basenames from the pattern plus .f or .n extensions.
	# The pattern is anchored at the end so that only names that really end in
	# '.f' or '.n' are picked up (and not, say, 'split1.f.bak').
	$DEBUG && print "DEBUG -- File Base: '$data'.\n";
	my @datafiles = @{getFiles($data.'\.[fn]$')};
	$DEBUG && print "DEBUG -- Files:\n".join("\n\t",@datafiles)."\n";

	# Every fold is identified by the name of its '.f' file without the extension.
	# The match is anchored at both ends: an unanchored /(.*)\.[f]/ would also fire
	# on a '.n' file that merely has '.f' somewhere in its path and produce a base
	# name that does not exist.
	my @databases;
	foreach my $file (@datafiles) {
	    my ($base) = $file =~ /^(.*)\.f$/;
	    next unless defined($base);

	    # If the file base has an absolute position, prepend it with the cygwin root.
	    if ($base =~ m{^/}) { $base = $CYGWIN_ROOT.$base; }
	    push @databases, $base;
	}

	# Nothing to cross-validate: say so rather than finishing silently.
	@databases || warn "No '.f' example files match the pattern '$data'.\n";

	my @exp_params;     # Parameters are settings that are recorded in the filename
	my @exp_settings;   # Settings are not recorded in the filename.
	my @exp_commands;   # These are the commands to be run after the settings and params.

	# Add settings from settings file. These will override any settings in the
	# $background file. Blank (or whitespace-only) lines are skipped so that no
	# empty ':-' directives end up in the generated script.
	if ($settings) {
	    open(my $settings_fh, '<', $settings)
	        || die "Could not read settings from '$settings': $!";
	    print "Settings File: '$settings'.\n";
	    push @exp_settings, "% Settings: $settings";
	    while (my $line = <$settings_fh>) {
	        chomp $line;
	        push @exp_settings, $line if $line =~ /\S/;
	    }
	    close($settings_fh);
	}

	# This is the same as below but the settings are not saved in the filename.
	# Keys are sorted so the generated script is the same from one invocation to
	# the next (hash order is not stable).
	foreach my $param (sort keys %settings) {
	    my $paramval = $settings{$param};

	    print "Using setting '$param' with value '$paramval'\n";
	    push @exp_settings, "set($param,$paramval).";
	}

	# Add parameters given on the command line. These override settings given in the
	# $background file and in the $settings file. Parameters are just settings that
	# are stored in the file name for processing by makeplot.
	# Sorting matters here: the keys end up in the experiment's file name, which
	# must not change between two invocations with the same parameters.
	my @baseexts = ();
	foreach my $param (sort keys %params) {
	    my $paramval = $params{$param};

	    print "Using parameter '$param' with value '$paramval'\n";
	    push @exp_settings, "set($param,$paramval).";
	    push @baseexts, $param."_".$paramval;
	}

	# Add any parameters to the experiments base name.
	my $expbase = (@baseexts ? $experiment."--".join("-", @baseexts)."--" : $experiment);

	# If a commands file was specified, add it to the list of commands, otherwise
	# just add "induce." and "write_rules.". Always show the settings at the start
	# of the run once all the settings have been made.
	push @exp_commands, "show(settings).";
	if ($commands) {
	    open(my $commands_fh, '<', $commands)
	        || die "Could not open commands file: '$commands': $!";
	    print "Commands File: '$commands'.\n";
	    push @exp_commands, "% From commands file: $commands";
	    while (my $line = <$commands_fh>) {
	        chomp $line;
	        push @exp_commands, $line if $line =~ /\S/;
	    }
	    close($commands_fh);
	} else {
	    push @exp_commands, "induce.";
	    push @exp_commands, "write_rules.";
	}


	# Run all of the cross validations
	print basename($expbase).":\n";
	my %runs = %{buildruns(\@databases)};
	my $runnum = 0;

	# The working directory (with the Cygwin prefix, if any) is the same for
	# every run.
	my $pwd = $CYGWIN_ROOT.$ENV{'PWD'};

	# Held-out folds are visited in sorted order so that a given run number always
	# refers to the same fold.
	foreach my $holdout (sort keys %runs) {
	    $runnum++;
	    my $runname = $expbase.$runnum;
	    my $outfile = $runname.".out";
	    my $rulesfile = $runname.".rules";
	    my @run = @{$runs{$holdout}};

	    # These need to be run to ensure the environment is okay and all the files
	    # are loaded in.
	    my @run_settings =
	      (
	       "path(_P).",                # This is here to get around weird path bug
	       "add_to_path('$pwd').",     # So we can consult $background
	       "consult('$ALEPH').",
	       "cd('$pwd').",              # So we can read in the example files
	       "read_all('$background', [".join(',', map("'$_'", @run))."]).",
	       "set(record, true).",
	       "set(recordfile, '$outfile').",
	       "set(test_pos, '$holdout.f').",
	       "set(test_neg, '$holdout.n').",
	       "set(experiment, '$runname').",
	       "set(rulefile, '$rulesfile')."
	      );

	    # Write the commands to an .in file in this order:
	    #   run_settings, exp_settings, exp_commands
	    # Lines starting with '%' are Prolog comments and are copied as they are;
	    # everything else is turned into a ':- ' directive.
	    my $infilename = $runname.".in";
	    open(my $in_fh, '>', $infilename)
	        || die "Could not open '$infilename' for writing: $!\n";
	    my @commands = map { /^%/ ? $_ : ':- '.$_ }
	                       (@run_settings, @exp_settings, @exp_commands);
	    print {$in_fh} "#!$YAP -L \n";
	    print {$in_fh} "# BUILT: ".localtime()."\n";
	    print {$in_fh} join("\n", @commands)."\n";
	    close($in_fh) || die "Could not finish writing '$infilename': $!\n";

	    # Delete any old .out files that may be lying around so record doesn't append to it.
	    if (-f $outfile) {
	        unlink($outfile) || warn "Could not remove old '$outfile': $!\n";
	    }

	    # Run the experiment
	    my $time = time();
	    print "\tRun ".$runnum."...";

	    # A name without a directory part would be looked up in PATH by the shell,
	    # so make it explicitly relative; quote it so unusual characters in the
	    # experiment name or parameter values cannot be interpreted by the shell.
	    my $script = ($infilename =~ m{/}) ? $infilename : "./$infilename";
	    (my $quoted = $script) =~ s/'/'\\''/g;
	    my $command = ($PARALLEL ? "enqueue " : "")."'$quoted' > /dev/null 2>&1";

	    # List form: no shell is involved in making the script executable.
	    system('chmod', '+x', $infilename);
	    my $status = system($command);

	    print $PARALLEL ? "enqueued." : "completed!";
	    $time = time() - $time;
	    print " (Time: $time s)\n";

	    # The run's own output goes to /dev/null, so a failure would otherwise go
	    # unnoticed. Reported on STDERR to leave the normal output unchanged.
	    if ($status == -1) {
	        warn "Run $runnum: could not start '$script': $!\n";
	    } elsif ($status != 0) {
	        warn "Run $runnum: '$script' exited with status ".($status >> 8)."\n";
	    }
	}

	print "Experiment completed at: ".localtime()."\n";

	# getFiles
	# Read in all the files matching the given pattern.
	#
	# The argument is a path whose last component is a regular expression; the
	# directory part is taken literally. Returns a reference to a list of
	# "directory/name" strings for every plain file in that directory whose name
	# matches the expression at its start. The list is sorted by name, because the
	# order in which readdir hands back entries is not defined and would otherwise
	# make the result differ between machines and runs.
	# Dies if the directory cannot be opened.
	sub getFiles {
	    my ($filepattern) = @_;
	    my $dir = dirname($filepattern);
	    my $file = basename($filepattern);

	    opendir(my $dh, $dir) || die "Could not open directory '$dir': $!\n";

	    # Group the expression so the anchor applies to all of its alternatives.
	    my $pattern = qr/^(?:$file)/;
	    my @names = grep { $_ =~ $pattern && -f "$dir/$_" } readdir($dh);
	    closedir($dh);

	    my @files = map { $dir."/".$_ } sort @names;
	    return \@files;
	}

	# buildruns
	# Creates a hash of basenames to lists where the lists are a collection of datasets
	# to be used in a cross validation run. The basename for each list is that of the
	# file that was held out.
	#
	# Takes a reference to the list of dataset basenames and returns a reference to
	# the hash. An empty list gives an empty hash.
	sub buildruns {
	    my ($dbref) = @_;
	    my @dbs = @{$dbref};

	    my %runs;

	    foreach my $holdout (@dbs) {
	        # Training files are all those files that are not the holdout file.
	        # The names are compared as plain strings: they are file paths, so
	        # using them as a regular expression would let '.' match any character
	        # and would misbehave on characters such as '+' or '('.
	        my @currdbs = grep { $_ ne $holdout } @dbs;
	        $runs{$holdout} = \@currdbs;
	    }

	    return \%runs;
	}