Created
March 19, 2013 12:41
-
-
Save marians/5195774 to your computer and use it in GitHub Desktop.
This is an HTTP service for natural language guessing of input texts. Run it as "perl lang_detection_server.pl" and open a URL like http://localhost:8080/?text=This+is+just+a+test+string . TextCat source code courtesy of Gertjan van Noord.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This is a sloppy HTTP server version of TextCat, an n-gram based natural language guesser | |
| # written by Gertjan van Noord in 1997. | |
| # More info: http://odur.let.rug.nl/~vannoord/TextCat/ | |
| # | |
| # TextCat was distributed under the GNU Lesser General Public License. | |
| # | |
| # You need the language model files (LM folder from Gertjan's distribution) in a directory. | |
| # Set the variable $opt_d to point to that directory. | |
| package MyServer; | |
| use base qw(HTTP::Server::Simple::CGI); | |
| use strict; | |
| use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a); | |
| my $non_word_characters='0-9\s'; | |
| $opt_a = 10; | |
| $opt_d = '/path/to/textcat/LM'; | |
| $opt_f = 0; | |
| $opt_t = 400; | |
| $opt_u = 1.05; | |
| # read language models once | |
| opendir DIR, "$opt_d" or die "directory $opt_d: $!\n"; | |
| our @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR)); | |
| closedir DIR; | |
| @languages or die "sorry, can't read any language models from $opt_d\n" . | |
| "language models must reside in files with .lm ending\n"; | |
| sub create_lm { | |
| my $ngram; | |
| ($_,$ngram) = @_; | |
| my $word; | |
| foreach $word (split("[$non_word_characters]+")) { | |
| $word = "_" . $word . "_"; | |
| my $len = length($word); | |
| my $flen=$len; | |
| my $i; | |
| for ($i=0;$i<$flen;$i++) { | |
| $$ngram{substr($word,$i,5)}++ if $len > 4; | |
| $$ngram{substr($word,$i,4)}++ if $len > 3; | |
| $$ngram{substr($word,$i,3)}++ if $len > 2; | |
| $$ngram{substr($word,$i,2)}++ if $len > 1; | |
| $$ngram{substr($word,$i,1)}++; | |
| $len--; | |
| } | |
| } | |
| map { my $key=$_; if ($$ngram{$key} <= $opt_f) | |
| { delete $$ngram{$key}; }; } keys %$ngram; | |
| my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram; | |
| splice(@sorted,$opt_t) if (@sorted > $opt_t); | |
| return @sorted; | |
| } | |
| sub classify { | |
| my ($input) = @_; | |
| my %results = (); | |
| my $maxp = $opt_t; | |
| my @unknown = create_lm($input); | |
| my $language; | |
| foreach $language (@languages) { | |
| # loads the language model into hash %$language. | |
| my %ngram=(); | |
| my $rang=1; | |
| open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n"; | |
| while (<LM>) { | |
| chomp; | |
| if (/^[^$non_word_characters]+/o) { | |
| $ngram{$&} = $rang++; | |
| } | |
| } | |
| close(LM); | |
| # compares the language model with input ngrams list | |
| my ($i,$p)=(0,0); | |
| while ($i < @unknown) { | |
| if ($ngram{$unknown[$i]}) { | |
| $p=$p+abs($ngram{$unknown[$i]}-$i); | |
| } else { | |
| $p = $p + $maxp; | |
| } | |
| ++$i; | |
| } | |
| $results{$language} = $p; | |
| } | |
| my @results = sort { $results{$a} <=> $results{$b} } keys %results; | |
| #print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v; | |
| my $a = $results{$results[0]}; | |
| my @answers=(shift(@results)); | |
| while (@results && $results{$results[0]} < ($opt_u *$a)) { | |
| @answers = (@answers, shift(@results)); | |
| } | |
| if (@answers > $opt_a) { | |
| return ('unknown'); | |
| } else { | |
| return @answers; | |
| } | |
| } | |
| sub handle_request { | |
| my ($self, $cgi) = @_; | |
| print join(',', classify($cgi->param('text'))); | |
| } | |
| MyServer->new->run(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment