|
#!usr/bin/perl |
|
BEGIN { use CGI::Carp qw(carpout); open(LOG, ">>_error2.txt") or die("Unable to open mycgi-log: $!\n"); carpout(LOG); } |
|
BEGIN { $SIG{"__DIE__"} = $SIG{"__WARN__"} = sub { my $error = shift; chomp $error; $error =~ s/[<&>]/"&#".ord($&).";"/ge; print "Content-type: text/html\n\n$error\n"; exit 0; } } |
|
|
|
# This program ranks how similar a text basefile author is to target file authors. Meant for English words. |
|
# Can work on 7KB files (1,000 words) if target files are 50KB. Both at 50KB is darn good. |
|
# Accuracy, approx: 50% in 1st place given 20 authors in same genre with 50k files. |
|
# Use SVMLink open software ranking capability for professional work. |
|
# Author: Scott Roberts, 2016. |
|
|
|
######## TELL PROGRAM WHAT IT NEEDS TO KNOW ######### |
|
$words_to_ignore=''; # for example bitcoin|blockchain|node |
|
$basefile='satoshi_all.txt'; # unknown author. Stays in directory with this program |
|
$basesize=-s $basefile; # get size of file in bytes |
|
$basesize=2000000; # in case you need to make it smaller than above for small target files. |
|
$oversize=1; # useful if all target files are a lot bigger than unknown author by some factor >1. |
|
$buffer=1.25; # this pulls in more than needed to make sure enough words are obtained |
|
$targetfilesdir='books'; # all files > 30% bigger than base file to make sure enough words are retireved. |
|
######## PRINT HTML HEADER ####### |
|
print "Content-type: text/html\n\n<html><H3>Author Comparison</H3> |
|
Base text: $basefile $basesize bytes. Target texts directory: $targetfilesdir<BR> |
|
words to ignore: $words_to_ignore<BR> |
|
using only first $basesize x $oversize bytes of target files<BR><BR>"; |
|
####### RUN PROGRAM ###### |
|
open(F,"<$basefile") or die $!; read(F,$c,$basesize); close F; |
|
%base_count=get_words($c); # stores count (value) of each word (key). |
|
chdir "c:\\_all\\programs\\indigo-perl-new\\apache-2.2.11\\cgi-bin\\$targetfilesdir"; |
|
@files=glob('*.txt'); |
|
foreach $file (@files) { |
|
open(F,"<$file") or die $!; read(F,$c,$basesize*$buffer*$oversize); close F; # 1.3= a buffer |
|
%target_count=get_words($c); get_score(); undef %target_count; |
|
} |
|
###### FINISHED ----- PRINT RESULTS ########## |
|
print "First $total_words words from base text above and target texts below were compared.<BR><BR>"; |
|
@ranked = sort {$scores{$a} <=> $scores{$b} } keys %scores; |
|
foreach $file (@ranked) { $rank++; print "$rank = " . int($scores{$file}*10/$total_words) . " $file <br>"; } |
|
exit; |
|
######## BEGIN SUBROUTINES ######### |
|
sub get_words { $c=$_[0]; |
|
if ($words_to_ignore ne '') {$c=~s/$words_to_ignore / /gsi;} |
|
$c=~s/\r/\n/gs; $c=~s/[^a-zA-Z ]//gs; # get rid of windows returns and remove all non-alpha and spaces |
|
$c=~s/\n//gs; $c=~s/ +/ /gs; # get rid of newlines and excess spaces. |
|
@c=split(" ", $c); if ($firsttime eq '') { $total_words=$#c; $firsttime='nope';} |
|
else { $#c=$total_words*$oversize; } |
|
undef %count; foreach $c (@c) { $count{$c}++;} return %count; |
|
} |
|
sub get_score { |
|
foreach $word (keys %base_count) { |
|
$b=$base_count{$word}; |
|
if ($target_count{$word} < 1 ){ $t=0.5/$total_words/$oversize; } |
|
else { $t =$target_count{$word}/$oversize; } |
|
if ($t > $b) { $scores{$file}+=($t/$b)**0.6; } |
|
else { $scores{$file}+=($b/$t)**0.6; } |
|
}} |