May 3, 2016 02:06
diff --git a/compare_authors.cgi b/compare_authors.cgi
 #!usr/bin/perl
 BEGIN { use CGI::Carp qw(carpout);  open(LOG, ">>_error2.txt") or  die("Unable to open mycgi-log: $!\n");  carpout(LOG);  }
 BEGIN { $SIG{"__DIE__"}  = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } }

 # This program ranks how similar a text basefile author is to target file authors. Meant for English words.
 # Can work on 7KB files (1,000 words) if target files are 50KB. Both at 50KB is darn good. 
 # Accuracy, approx: 50% in 1st place given 20 authors in same genre with 50k files.
 # Use SVMLink open software ranking capability for professional work.
 # Author: Scott Roberts, 2016.

 ########   TELL PROGRAM WHAT IT NEEDS TO KNOW #########
 $words_to_ignore=''; # for example bitcoin|blockchain|node
 $basefile='satoshi_all.txt'; # unknown author. Stays in directory with this program
 $basesize=-s $basefile; # get size of file in bytes
 $basesize=2000000; # in case you need to make it smaller than above for small target files.
 $oversize=1; # useful if all target files are a lot bigger than unknown author by some factor >1.
 $buffer=1.25; # this pulls in more than needed to make sure enough words are obtained
 $targetfilesdir='books'; # all files > 30% bigger than base file to make sure enough words are retireved.
 ########          PRINT HTML HEADER         #######
 print "Content-type: text/html\n\n<html><H3>Author Comparison</H3>
 Base text: $basefile $basesize bytes. Target texts directory: $targetfilesdir<BR>
 words to ignore: $words_to_ignore<BR>
 using only first $basesize x $oversize bytes of target files<BR><BR>";
 #######       RUN PROGRAM        ######
 open(F,"<$basefile") or die $!; read(F,$c,$basesize); close F; 
 %base_count=get_words($c); # stores count (value) of each word (key).
 chdir "c:\\_all\\programs\\indigo-perl-new\\apache-2.2.11\\cgi-bin\\$targetfilesdir"; 
 @files=glob('*.txt');
 foreach $file (@files) { 
  open(F,"<$file") or die $!; read(F,$c,$basesize*$buffer*$oversize); close F; # 1.3= a buffer 
  %target_count=get_words($c);    get_score();      undef %target_count;    
 }
 ######      FINISHED  ----- PRINT RESULTS     ##########
 print "First $total_words words from base text above and target texts below were compared.<BR><BR>";
 @ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
 foreach $file (@ranked) { $rank++;  print "$rank = " . int($scores{$file}*10/$total_words) . " $file <br>"; }
 exit;
 ########       BEGIN SUBROUTINES        #########
 sub get_words {  $c=$_[0]; 
  if ($words_to_ignore ne '') {$c=~s/$words_to_ignore / /gsi;} 
  $c=~s/\r/\n/gs; $c=~s/[^a-zA-Z ]//gs; # get rid of windows returns and remove all non-alpha and spaces
  $c=~s/\n//gs; $c=~s/ +/ /gs; # get rid of newlines and excess spaces.
  @c=split(" ", $c);    if ($firsttime eq '')  { $total_words=$#c; $firsttime='nope';}
  else { $#c=$total_words*$oversize; }
  undef %count;  foreach $c (@c) { $count{$c}++;}  return %count;  
 }
 sub get_score {
  foreach $word (keys %base_count) { 
    $b=$base_count{$word};
    if ($target_count{$word} < 1 ){ $t=0.5/$total_words/$oversize; }  
    else { $t =$target_count{$word}/$oversize; }
    if ($t > $b)  { $scores{$file}+=($t/$b)**0.6;  } 
    else { $scores{$file}+=($b/$t)**0.6; }
 }}
	#!usr/bin/perl
	BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); }
	BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } }

	# This program ranks how similar a text basefile author is to target file authors. Meant for English words.
	# Can work on 7KB files (1,000 words) if target files are 50KB. Both at 50KB is darn good.
	# Accuracy, approx: 50% in 1st place given 20 authors in same genre with 50k files.
	# Use SVMLink open software ranking capability for professional work.
	# Author: Scott Roberts, 2016.

	######## TELL PROGRAM WHAT IT NEEDS TO KNOW #########
	$words_to_ignore=''; # for example bitcoin\|blockchain\|node
	$basefile='satoshi_all.txt'; # unknown author. Stays in directory with this program
	$basesize=-s $basefile; # get size of file in bytes
	$basesize=2000000; # in case you need to make it smaller than above for small target files.
	$oversize=1; # useful if all target files are a lot bigger than unknown author by some factor >1.
	$buffer=1.25; # this pulls in more than needed to make sure enough words are obtained
	$targetfilesdir='books'; # all files > 30% bigger than base file to make sure enough words are retireved.
	######## PRINT HTML HEADER #######
	print "Content-type: text/html\n\n<html><H3>Author Comparison</H3>
	Base text: $basefile $basesize bytes. Target texts directory: $targetfilesdir<BR>
	words to ignore: $words_to_ignore<BR>
	using only first $basesize x $oversize bytes of target files<BR><BR>";
	####### RUN PROGRAM ######
	open(F,"<$basefile") or die $!; read(F,$c,$basesize); close F;
	%base_count=get_words($c); # stores count (value) of each word (key).
	chdir "c:\\_all\\programs\\indigo-perl-new\\apache-2.2.11\\cgi-bin\\$targetfilesdir";
	@files=glob('*.txt');
	foreach $file (@files) {
	open(F,"<$file") or die $!; read(F,$c,$basesize$buffer$oversize); close F; # 1.3= a buffer
	%target_count=get_words($c); get_score(); undef %target_count;
	}
	###### FINISHED ----- PRINT RESULTS ##########
	print "First $total_words words from base text above and target texts below were compared.<BR><BR>";
	@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores;
	foreach $file (@ranked) { $rank++; print "$rank = " . int($scores{$file}*10/$total_words) . " $file <br>"; }
	exit;
	######## BEGIN SUBROUTINES #########
	sub get_words { $c=$_[0];
	if ($words_to_ignore ne '') {$c=~s/$words_to_ignore / /gsi;}
	$c=~s/\r/\n/gs; $c=~s/[^a-zA-Z ]//gs; # get rid of windows returns and remove all non-alpha and spaces
	$c=~s/\n//gs; $c=~s/ +/ /gs; # get rid of newlines and excess spaces.
	@c=split(" ", $c); if ($firsttime eq '') { $total_words=$#c; $firsttime='nope';}
	else { $#c=$total_words*$oversize; }
	undef %count; foreach $c (@c) { $count{$c}++;} return %count;
	}
	sub get_score {
	foreach $word (keys %base_count) {
	$b=$base_count{$word};
	if ($target_count{$word} < 1 ){ $t=0.5/$total_words/$oversize; }
	else { $t =$target_count{$word}/$oversize; }
	if ($t > $b) { $scores{$file}+=($t/$b)**0.6; }
	else { $scores{$file}+=($b/$t)**0.6; }
	}}