marians · March 19, 2013 12:41
diff --git a/lang_detection_server.pl b/lang_detection_server.pl
 # This is a sloppy HTTP server version of TextCat, an n-gram based natural language guesser
 # written by Gertjan van Noord in 1997.
 # More info: http://odur.let.rug.nl/~vannoord/TextCat/
 #
 # TextCat was distributed under the GNU Lesser General Public License.
 #
 # You need the language model files (LM folder from Gertjan's distribution) in a directory.
 # Set the variable $opt_d to point to that directory.


 package MyServer;
 use base qw(HTTP::Server::Simple::CGI);
 use strict;
 use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a);
 my $non_word_characters='0-9\s';

 $opt_a = 10;
 $opt_d = '/path/to/textcat/LM';
 $opt_f = 0;
 $opt_t = 400;
 $opt_u = 1.05;


 # read language models once
 opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
 our @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
 closedir DIR;
 @languages or die "sorry, can't read any language models from $opt_d\n" .
    "language models must reside in files with .lm ending\n";



 sub create_lm {
 	my $ngram;
 	($_,$ngram) = @_;
 	my $word;
 	foreach $word (split("[$non_word_characters]+")) {
 		$word = "_" . $word . "_";
 		my $len = length($word);
 		my $flen=$len;
 		my $i;
 		for ($i=0;$i<$flen;$i++) {
 			$$ngram{substr($word,$i,5)}++ if $len > 4;
 			$$ngram{substr($word,$i,4)}++ if $len > 3;
 			$$ngram{substr($word,$i,3)}++ if $len > 2;
 			$$ngram{substr($word,$i,2)}++ if $len > 1;
 			$$ngram{substr($word,$i,1)}++;
 			$len--;
 		}
 	}
 	map { my $key=$_; if ($$ngram{$key} <= $opt_f) 
 		{ delete $$ngram{$key}; }; } keys %$ngram;
 	my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
 	splice(@sorted,$opt_t) if (@sorted > $opt_t); 
 	return @sorted;
 }


 sub classify {
 	my ($input) = @_;
 	my %results = ();
 	my $maxp = $opt_t;

 	my @unknown = create_lm($input);
 	my $language;
 	foreach $language (@languages) {
 		# loads the language model into hash %$language.
 		my %ngram=();
 		my $rang=1;
 		open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
 		while (<LM>) {
 			chomp;
 			if (/^[^$non_word_characters]+/o) {
 				$ngram{$&} = $rang++;
 			}
 		}
 		close(LM);
 		# compares the language model with input ngrams list
 		my ($i,$p)=(0,0);
 		while ($i < @unknown) {
 			if ($ngram{$unknown[$i]}) {
 				$p=$p+abs($ngram{$unknown[$i]}-$i);
 			} else { 
 				$p = $p + $maxp; 
 			}
 			++$i;
 		}
 		$results{$language} = $p;
 	}
 	my @results = sort { $results{$a} <=> $results{$b} } keys %results;
 	#print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;
 	my $a = $results{$results[0]};
 	my @answers=(shift(@results));
 	while (@results && $results{$results[0]} < ($opt_u *$a)) {
 		@answers = (@answers, shift(@results));
 	}
 	if (@answers > $opt_a) {
 		return ('unknown');
 	} else {
 		return @answers;
 	}
 }

 sub handle_request {
 	my ($self, $cgi) = @_;
 	print join(',', classify($cgi->param('text')));
 }



 MyServer->new->run();
	# This is a sloppy HTTP server version of TextCat, an n-gram based natural language guesser
	# written by Gertjan van Noord in 1997.
	# More info: http://odur.let.rug.nl/~vannoord/TextCat/
	#
	# TextCat was distributed under the GNU Lesser General Public License.
	#
	# You need the language model files (LM folder from Gertjan's distribution) in a directory.
	# Set the variable $opt_d to point to that directory.


	package MyServer;
	use base qw(HTTP::Server::Simple::CGI);
	use strict;
	use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a);
	my $non_word_characters='0-9\s';

	$opt_a = 10;
	$opt_d = '/path/to/textcat/LM';
	$opt_f = 0;
	$opt_t = 400;
	$opt_u = 1.05;


	# read language models once
	opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
	our @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
	closedir DIR;
	@languages or die "sorry, can't read any language models from $opt_d\n" .
	"language models must reside in files with .lm ending\n";



	sub create_lm {
	my $ngram;
	($_,$ngram) = @_;
	my $word;
	foreach $word (split("[$non_word_characters]+")) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen=$len;
	my $i;
	for ($i=0;$i<$flen;$i++) {
	$$ngram{substr($word,$i,5)}++ if $len > 4;
	$$ngram{substr($word,$i,4)}++ if $len > 3;
	$$ngram{substr($word,$i,3)}++ if $len > 2;
	$$ngram{substr($word,$i,2)}++ if $len > 1;
	$$ngram{substr($word,$i,1)}++;
	$len--;
	}
	}
	map { my $key=$_; if ($$ngram{$key} <= $opt_f)
	{ delete $$ngram{$key}; }; } keys %$ngram;
	my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
	splice(@sorted,$opt_t) if (@sorted > $opt_t);
	return @sorted;
	}


	sub classify {
	my ($input) = @_;
	my %results = ();
	my $maxp = $opt_t;

	my @unknown = create_lm($input);
	my $language;
	foreach $language (@languages) {
	# loads the language model into hash %$language.
	my %ngram=();
	my $rang=1;
	open(LM,"$opt_d/$language.lm") \|\| die "cannot open $language.lm: $!\n";
	while (<LM>) {
	chomp;
	if (/^[^$non_word_characters]+/o) {
	$ngram{$&} = $rang++;
	}
	}
	close(LM);
	# compares the language model with input ngrams list
	my ($i,$p)=(0,0);
	while ($i < @unknown) {
	if ($ngram{$unknown[$i]}) {
	$p=$p+abs($ngram{$unknown[$i]}-$i);
	} else {
	$p = $p + $maxp;
	}
	++$i;
	}
	$results{$language} = $p;
	}
	my @results = sort { $results{$a} <=> $results{$b} } keys %results;
	#print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;
	my $a = $results{$results[0]};
	my @answers=(shift(@results));
	while (@results && $results{$results[0]} < ($opt_u *$a)) {
	@answers = (@answers, shift(@results));
	}
	if (@answers > $opt_a) {
	return ('unknown');
	} else {
	return @answers;
	}
	}

	sub handle_request {
	my ($self, $cgi) = @_;
	print join(',', classify($cgi->param('text')));
	}



	MyServer->new->run();
No results found