Skip to content

Instantly share code, notes, and snippets.

@marians
Created March 19, 2013 12:41
Show Gist options
  • Select an option

  • Save marians/5195774 to your computer and use it in GitHub Desktop.

Select an option

Save marians/5195774 to your computer and use it in GitHub Desktop.
This is an HTTP service for natural language guessing of input texts. Run it as "perl lang_detection_server.pl" and open a URL like http://localhost:8080/?text=This+is+just+a+test+string . TextCat source code courtesy of Gertjan van Noord.
# This is a sloppy HTTP server version of TextCat, an n-gram based natural language guesser
# written by Gertjan van Noord in 1997.
# More info: http://odur.let.rug.nl/~vannoord/TextCat/
#
# TextCat was distributed under the GNU Lesser General Public License.
#
# You need the language model files (LM folder from Gertjan's distribution) in a directory.
# Set the variable $opt_d to point to that directory.
package MyServer;
use base qw(HTTP::Server::Simple::CGI);
use strict;
use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a);
my $non_word_characters='0-9\s';
$opt_a = 10;
$opt_d = '/path/to/textcat/LM';
$opt_f = 0;
$opt_t = 400;
$opt_u = 1.05;
# read language models once
opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
our @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
closedir DIR;
@languages or die "sorry, can't read any language models from $opt_d\n" .
"language models must reside in files with .lm ending\n";
sub create_lm {
my $ngram;
($_,$ngram) = @_;
my $word;
foreach $word (split("[$non_word_characters]+")) {
$word = "_" . $word . "_";
my $len = length($word);
my $flen=$len;
my $i;
for ($i=0;$i<$flen;$i++) {
$$ngram{substr($word,$i,5)}++ if $len > 4;
$$ngram{substr($word,$i,4)}++ if $len > 3;
$$ngram{substr($word,$i,3)}++ if $len > 2;
$$ngram{substr($word,$i,2)}++ if $len > 1;
$$ngram{substr($word,$i,1)}++;
$len--;
}
}
map { my $key=$_; if ($$ngram{$key} <= $opt_f)
{ delete $$ngram{$key}; }; } keys %$ngram;
my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
splice(@sorted,$opt_t) if (@sorted > $opt_t);
return @sorted;
}
sub classify {
my ($input) = @_;
my %results = ();
my $maxp = $opt_t;
my @unknown = create_lm($input);
my $language;
foreach $language (@languages) {
# loads the language model into hash %$language.
my %ngram=();
my $rang=1;
open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
while (<LM>) {
chomp;
if (/^[^$non_word_characters]+/o) {
$ngram{$&} = $rang++;
}
}
close(LM);
# compares the language model with input ngrams list
my ($i,$p)=(0,0);
while ($i < @unknown) {
if ($ngram{$unknown[$i]}) {
$p=$p+abs($ngram{$unknown[$i]}-$i);
} else {
$p = $p + $maxp;
}
++$i;
}
$results{$language} = $p;
}
my @results = sort { $results{$a} <=> $results{$b} } keys %results;
#print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;
my $a = $results{$results[0]};
my @answers=(shift(@results));
while (@results && $results{$results[0]} < ($opt_u *$a)) {
@answers = (@answers, shift(@results));
}
if (@answers > $opt_a) {
return ('unknown');
} else {
return @answers;
}
}
sub handle_request {
my ($self, $cgi) = @_;
print join(',', classify($cgi->param('text')));
}
MyServer->new->run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment