lucarin91 · June 26, 2016 18:15
diff --git a/preprocessing.pl b/preprocessing.pl
 #!/usr/bin/perl
 # Text clean-up pass over every language directory found under $dp.
 # For each file in <language>/noHeaders/ the script removes tabs, punctuation
 # and that language's stop words, squeezes runs of spaces, skips blank lines
 # and writes the lower-cased result to <language>/ready/<same file name>.
 use strict;
 use warnings;
 use File::Basename;

 # STOP WORDS FOLDER
 my $stp = "/home/andrea/Downloads/DataMining/code/data/";
 # STOP WORDS FILE
 my $stf = "stopwords.txt";

 # DATA MAIN PATH
 my $dp  = "/home/andrea/Downloads/DataMining/code/dwld/Gutenberg/dwdl/stripped/";
 # SOURCE SUB-DIR
 my $sf  = "noHeaders/";
 # DEST SUB-DIR
 my $ef  = "ready/";

 # NOTE(review): files are read and written as raw bytes, so /i and lc only
 # fold ASCII letters -- confirm whether an :encoding layer is wanted.

 foreach my $lang_dir (glob(qq{"$dp*"})) {
     my $lang = basename($lang_dir) . '/';

     # OPEN STOP WORDS FILE 4 EACH LANGUAGE
     # A missing list is not fatal: the language is still processed, only
     # without stop-word removal.
     my @stopWords;
     if (open my $stop, '<', "$stp$lang$stf") {
         @stopWords = <$stop>;
         close $stop;
     }
     else {
         warn "Can't read tmp file $stp$lang$stf: $!\n";
     }

     # CLEAN STOPWORDS IF NEEDED
     # Trim surrounding whitespace (newline, CR of CRLF files), drop empty
     # entries and escape regex metacharacters so every word matches literally.
     s/^\s+|\s+$//g for @stopWords;
     my $stopRegex = join '|',
         map { quotemeta($_) } grep { length($_) } @stopWords;
     print $stopRegex, "\n";

     # Compile the filter once per language instead of once per input line.
     # With no stop words the third alternative is left out entirely.
     my $strip = length($stopRegex)
         ? qr/\t|[[:punct:]]|\b(?:$stopRegex)\b/i
         : qr/\t|[[:punct:]]/;

     foreach my $source (glob(qq{"$lang_dir/$sf*"})) {
         # <language>/noHeaders/<file>  ->  <language>/ready/<file>
         my $dest = dirname(dirname($source)) . "/$ef" . basename($source);
         print $source, "\n";
         print $dest, "\n";

         # Three-argument open: the file name can never be read as a mode.
         open my $in, '<', $source or die " could not open file $source: $! \n";
         open my $out, '>', $dest or die " could not open file $dest: $! \n";

         #FILE PROCESSING
         while (my $line = <$in>) {
             # REMOVES PUNCTUATION AND STOPWORDS
             $line =~ s/$strip//g;
             # REDUCES REDUNDANT SPACES
             $line =~ tr/ //s;
             # IGNORE BLANK LINES AND WRITE IN LOWERCASE TO FILE
             next unless $line =~ /\S/;
             print {$out} lc $line;
         }
         close $in;
         # Buffered write errors only surface when the handle is closed.
         close $out or die " could not close file $dest: $! \n";
     }
 }
	#!/usr/bin/perl
	# Text clean-up pass over every language directory found under $dp.
	# For each file in <language>/noHeaders/ the script removes tabs, punctuation
	# and that language's stop words, squeezes runs of spaces, skips blank lines
	# and writes the lower-cased result to <language>/ready/<same file name>.
	use strict;
	use warnings;
	use File::Basename;

	# STOP WORDS FOLDER
	my $stp = "/home/andrea/Downloads/DataMining/code/data/";
	# STOP WORDS FILE
	my $stf = "stopwords.txt";

	# DATA MAIN PATH
	my $dp = "/home/andrea/Downloads/DataMining/code/dwld/Gutenberg/dwdl/stripped/";
	# SOURCE SUB-DIR
	my $sf = "noHeaders/";
	# DEST SUB-DIR
	my $ef = "ready/";

	# NOTE(review): files are read and written as raw bytes, so /i and lc only
	# fold ASCII letters -- confirm whether an :encoding layer is wanted.

	foreach my $lang_dir (glob(qq{"$dp*"})) {
	    my $lang = basename($lang_dir) . '/';

	    # OPEN STOP WORDS FILE 4 EACH LANGUAGE
	    # A missing list is not fatal: the language is still processed, only
	    # without stop-word removal.
	    my @stopWords;
	    if (open my $stop, '<', "$stp$lang$stf") {
	        @stopWords = <$stop>;
	        close $stop;
	    }
	    else {
	        warn "Can't read tmp file $stp$lang$stf: $!\n";
	    }

	    # CLEAN STOPWORDS IF NEEDED
	    # Trim surrounding whitespace (newline, CR of CRLF files), drop empty
	    # entries and escape regex metacharacters so every word matches literally.
	    # The separator must be a plain '|': in a Perl regex an escaped '\|'
	    # matches a literal pipe character instead of acting as alternation.
	    s/^\s+|\s+$//g for @stopWords;
	    my $stopRegex = join '|',
	        map { quotemeta($_) } grep { length($_) } @stopWords;
	    print $stopRegex, "\n";

	    # Compile the filter once per language instead of once per input line.
	    # With no stop words the third alternative is left out entirely.
	    my $strip = length($stopRegex)
	        ? qr/\t|[[:punct:]]|\b(?:$stopRegex)\b/i
	        : qr/\t|[[:punct:]]/;

	    foreach my $source (glob(qq{"$lang_dir/$sf*"})) {
	        # <language>/noHeaders/<file>  ->  <language>/ready/<file>
	        my $dest = dirname(dirname($source)) . "/$ef" . basename($source);
	        print $source, "\n";
	        print $dest, "\n";

	        # Three-argument open: the file name can never be read as a mode.
	        open my $in, '<', $source or die " could not open file $source: $! \n";
	        open my $out, '>', $dest or die " could not open file $dest: $! \n";

	        #FILE PROCESSING
	        while (my $line = <$in>) {
	            # REMOVES PUNCTUATION AND STOPWORDS
	            $line =~ s/$strip//g;
	            # REDUCES REDUNDANT SPACES
	            $line =~ tr/ //s;
	            # IGNORE BLANK LINES AND WRITE IN LOWERCASE TO FILE
	            next unless $line =~ /\S/;
	            print {$out} lc $line;
	        }
	        close $in;
	        # Buffered write errors only surface when the handle is closed.
	        close $out or die " could not close file $dest: $! \n";
	    }
	}
No results found