Created
December 8, 2011 12:38
-
-
Save ksurent/1446877 to your computer and use it in GitHub Desktop.
(Buggy) tokenizer in Qore
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env qore | |
%new-style | |
%require-types | |
%enable-all-warnings | |
namespace OpenCorpora; | |
# Statistical tokenizer modelled after Lingua::RU::OpenCorpora::Tokenizer.
# For every character position a binary feature vector is built and looked up
# in a table of pre-computed boundary probabilities; positions whose
# probability exceeds a threshold are treated as token boundaries.
class OpenCorpora::Tokenizer {
    private {
        list bounds;                  # (position, probability) pairs from the last run
        list tokens;                  # tokens extracted by the last run
        OpenCorpora::Vectors vectors; # feature vector -> boundary probability
        OpenCorpora::List prefixes;   # known word prefixes
        OpenCorpora::List exceptions; # sequences that must never be split
        OpenCorpora::List hyphens;    # known hyphenated words
    }
    public {
        const VERSION = '0.01';
    }

    # args: hash with keys 'vectors', 'exceptions', 'prefixes', 'hyphens' —
    # paths to the gzipped data files shipped with the OpenCorpora dist
    constructor(hash args = hash()) {
        vectors    = new OpenCorpora::Vectors(args.vectors);
        exceptions = new OpenCorpora::List(args.exceptions);
        prefixes   = new OpenCorpora::List(args.prefixes);
        hyphens    = new OpenCorpora::List(args.hyphens);
    }

    # Returns a list of (position, probability) pairs for every position
    # with a non-zero boundary probability.
    list tokens_bounds(string text) {
        do_tokenize(text);
        return bounds;
    }

    # Returns the list of tokens in text. options.threshold may override the
    # default boundary probability cut-off.
    list tokens(string text, hash options = hash()) {
        options.want_tokens = True;
        # FIX: default the threshold instead of clobbering a caller-supplied one
        if (!exists options.threshold)
            options.threshold = 0.878; # yes, i know
        do_tokenize(text, options);
        return tokens;
    }

    # Walks text character by character, computing a boundary probability at
    # each position; fills either 'tokens' or 'bounds' depending on opts.
    private do_tokenize(string text, hash opts = hash()) {
        # FIX: results used to accumulate across calls because the member
        # lists were never reset
        tokens = ();
        bounds = ();
        string token = '';
        for (int i = 0; i < length(text); i++) {
            hash ctx = (
                'pos'  : i,
                'prev' : i > 0 ? text[i-1] : '', # avoid a negative string index at pos 0
                'next' : text[i+1],
                'nnext': text[i+2],
                'char' : text[i],
                'text' : text,
            );
            # past-the-end lookups yield NOTHING; normalize to ''
            if (ctx.next == NOTHING)  { ctx.next = ''; }
            if (ctx.nnext == NOTHING) { ctx.nnext = ''; }
            get_sequences(\ctx);
            vectorize(\ctx);
            token += ctx.char;
            *float p = vectors.probability(ctx.vector);
            # unseen vector: maximum uncertainty
            if (p == NOTHING) p = 0.5;
            if (opts.want_tokens) {
                # cut a token at a confident boundary or at end of text
                if (p >= opts.threshold || ctx.pos == length(ctx.text) - 1) {
                    push tokens, trim(token);
                    token = '';
                }
            }
            else {
                if (p) push bounds, (ctx.pos, p);
            }
        }
    }

    # Collects into ctx the maximal character sequence around the cursor that
    # is glued together by a "spacer" (punctuation) character, plus its left
    # and right halves.
    private get_sequences(reference ctx) {
        string seq = '';
        string seq_left = '';
        string seq_right = '';
        string spacer = '';
        # the spacer is the first punctuation character at or after the cursor
        foreach string candidate in (list(ctx.next, ctx.char)) {
            *list found = regex_extract(candidate, '([-./?=:&"!+()])');
            if (elements found) {
                spacer = found[0];
                break;
            }
        }
        if (length(spacer)) {
            # scan left from the cursor
            for (int i = ctx.pos; i >= 0; i--) {
                string ch = ctx.text[i];
                bool case1 = is_hyphen(spacer) && (is_cyr(ch) || is_hyphen(ch) || is_single_quote(ch));
                bool case2 = !is_hyphen(spacer) && !is_space(ch);
                if (case1 || case2) {
                    seq_left = ch + seq_left;
                }
                else {
                    break;
                }
                # strip a trailing spacer so it never ends up inside the sequence
                if (substr(seq_left, -1) === spacer) {
                    seq_left = substr(seq_left, 0, -1);
                }
            }
            # scan right from the character after the cursor
            for (int i = ctx.pos + 1; i < length(ctx.text); i++) {
                string ch = ctx.text[i];
                bool case1 = is_hyphen(spacer) && (is_cyr(ch) || is_hyphen(ch) || is_single_quote(ch));
                bool case2 = !is_hyphen(spacer) && !is_space(ch);
                if (case1 || case2) seq_right += ch;
                else break;
                # FIX: was substr(seq_right, 0, 1), which truncated the whole
                # right-hand sequence to its first character; mirror the
                # left-hand logic and strip only the trailing spacer
                if (substr(seq_right, -1) === spacer)
                    seq_right = substr(seq_right, 0, -1);
            }
            seq = join('', seq_left, seq, seq_right);
        }
        ctx.spacer = spacer;
        ctx.seq = seq;
        ctx.seq_right = seq_right;
        ctx.seq_left = seq_left;
    }

    # Stores the binary feature vector for the current context in ctx.vector.
    private vectorize(reference ctx) {
        ctx.vector = do_vectorize(ctx);
    }

    # Encodes the context as an integer: each feature contributes one or more
    # bits, concatenated as a binary string and parsed base 2.
    # FIX: takes a plain hash (read-only) — the old 'reference' parameter was
    # called without the required \ reference operator.
    private int do_vectorize(hash ctx) {
        bool spacer = boolean(length(ctx.spacer));
        bool spacer_is_hyphen = boolean(is_hyphen(ctx.spacer));
        list bits = (
            char_class(ctx.char),
            char_class(ctx.next),
            is_digit(ctx.prev),
            is_digit(ctx.nnext),
            spacer_is_hyphen
                ? is_dict_seq(ctx.seq)
                : 0,
            spacer_is_hyphen
                ? is_suffix(ctx.seq_right)
                : 0,
            is_same_pm(ctx.char, ctx.next),
            (spacer && !spacer_is_hyphen)
                ? looks_like_url(ctx.seq, ctx.seq_right)
                : 0,
            (spacer && !spacer_is_hyphen)
                ? is_exception_seq(ctx.seq)
                : 0,
            spacer_is_hyphen
                ? is_prefix(ctx.seq_left)
                : 0,
            (is_colon(ctx.spacer) && length(ctx.seq_right))
                ? looks_like_time(ctx.seq_left, ctx.seq_right)
                : 0,
        );
        return strtoint(join('', bits), 2);
    }

    # --- single-character / sequence classifiers (1 = match, 0 = no match) ---
    private int is_pmark(string ch)        { return ch =~ /^[,?!";«»]$/ ? 1 : 0; }
    private int is_latin(string ch)        { return ch =~ /^[a-zA-Z]$/ ? 1 : 0; }
    private int is_cyr(string ch)          { return ch =~ /^[а-яёА-ЯЁ]$/ ? 1 : 0; }
    private int is_digit(string ch)        { return ch =~ /^[0-9]$/ ? 1 : 0; }
    private int is_bracket1(string ch)     { return ch =~ /^[\[({<]$/ ? 1 : 0; }
    private int is_bracket2(string ch)     { return ch =~ /^[\])}>]$/ ? 1 : 0; }
    # Russian particles commonly attached with a hyphen (то, таки, с, ка, де)
    private int is_suffix(string seq)      { return seq =~ /^(?:то|таки|с|ка|де)$/ ? 1 : 0; }
    private int is_space(string ch)        { return ch === ' ' ? 1 : 0; }
    private int is_hyphen(string ch)       { return ch === '-' ? 1 : 0; }
    private int is_dot(string ch)          { return ch === '.' ? 1 : 0; }
    private int is_single_quote(string ch) { return ch === "'" ? 1 : 0; }
    private int is_slash(string ch)        { return ch === '/' ? 1 : 0; }
    private int is_colon(string ch)        { return ch === ':' ? 1 : 0; }
    private int is_same_pm(string ch1, string ch2) { return int(ch1 === ch2); }
    private int is_prefix(string seq) { return prefixes.in_list(tolower(seq)) ? 1 : 0; }

    # 1 if seq is a known hyphenated dictionary word
    private int is_dict_seq(string seq) {
        if (!length(seq) || seq[0] === '-') return 0;
        return hyphens.in_list(seq) ? 1 : 0;
    }

    # 1 if seq (possibly after stripping leading non-word characters) is in
    # the exceptions list
    private int is_exception_seq(string seq) {
        if (exceptions.in_list(seq)) return 1;
        # only retry when the sequence starts or ends with a non-word char
        if (seq !~ /^\W|\W$/) return 0;
        string pattern = '^[^A-Za-zА-ЯЁа-яё0-9]+';
        seq = regex_subst(seq, pattern, '');
        if (exceptions.in_list(seq)) return 1;
        while (regex(seq, pattern)) {
            seq = regex_subst(seq, pattern, '');
            if (exceptions.in_list(seq)) return 1;
        }
        return 0;
    }

    # Heuristic URL detector: scheme/www prefix or a TLD-looking tail
    private int looks_like_url(string seq, string seq_right) {
        if (!length(seq_right)) return 0;
        if (length(seq) < 5) return 0;
        if (seq[0] === '.') return 0;
        if (
            seq =~ /^\W*https?:\/\//
            || seq =~ /^\W*www\./
            || seq =~ /.\.(?:[a-z]{2,3}|р[уф])\W*$/i
        )
            return 1;
        return 0;
    }

    # 1 when left:right looks like a HH:MM time-of-day
    private int looks_like_time(string seq_left, string seq_right) {
        if (seq_left !~ /^[0-9]{1,2}$/ || seq_right !~ /^[0-9]{2}$/)
            return 0;
        return (int(seq_left) < 24 && int(seq_right) < 60) ? 1 : 0;
    }

    # 4-bit character class used as the leading bits of the feature vector
    private string char_class(string ch) {
        return is_cyr(ch)          ? '0001' :
               is_space(ch)        ? '0010' :
               is_dot(ch)          ? '0011' :
               is_pmark(ch)        ? '0100' :
               is_hyphen(ch)       ? '0101' :
               is_digit(ch)        ? '0110' :
               is_latin(ch)        ? '0111' :
               is_bracket1(ch)     ? '1000' :
               is_bracket2(ch)     ? '1001' :
               is_single_quote(ch) ? '1010' :
               is_slash(ch)        ? '1011' :
               is_colon(ch)        ? '1100' : '0000';
    }
}
# A membership set loaded from a gzipped word list (one entry per line).
class OpenCorpora::List {
    private {
        hash entries; # entry string => True; hash keys give O(1) membership tests
    }

    # fn: path to a gzip-compressed text file
    constructor(string fn) {
        load(fn);
    }

    # Reads and decompresses fn, storing every line as a hash key.
    # Exits the program on open failure (data files are mandatory).
    private load(string fn) {
        File fh = new File();
        try {
            fh.open2(fn);
        }
        catch(e) {
            print(e.desc);
            exit(1);
        }
        # stat()[7] is the file size in bytes — read the whole file at once
        string raw = gunzip_to_string(fh.readBinary(fh.stat()[7]));
        fh.close();
        foreach string line in (split("\n", raw))
            entries{line} = True;
    }

    # True if str is an entry of the list.
    # FIX: was an O(n) 'select' scan over a list for every lookup; the
    # hash lookup is O(1) with identical results.
    bool in_list(string str) {
        return boolean(exists entries{str});
    }
}
# Probability table loaded from a gzipped "vector probability" file,
# one space-separated pair per line.
class OpenCorpora::Vectors {
    private {
        hash vectors; # feature vector (as hash key) => boundary probability
    }

    # fn: path to a gzip-compressed table file
    constructor(string fn) {
        load(fn);
    }

    # Reads, decompresses and parses fn into the vectors hash.
    # Exits the program on open failure (data files are mandatory).
    private load(string fn) {
        File fh = new File();
        try {
            fh.open2(fn);
        }
        catch(e) {
            print(e.desc);
            exit(1);
        }
        # stat()[7] is the file size in bytes — read the whole file at once
        string raw = gunzip_to_string(fh.readBinary(fh.stat()[7]));
        fh.close();
        foreach string row in (split("\n", raw)) {
            # FIX: skip blank rows (a trailing newline yields an empty final
            # element that used to create a junk '' => 0.0 entry)
            if (!length(row))
                continue;
            list vp = split(' ', row);
            vectors{vp[0]} = float(vp[1]);
        }
    }

    # Returns the probability for vector, or NOTHING when it was never seen
    # in the training data (caller decides the fallback).
    *float probability(int vector) {
        return vectors{vector};
    }
}
# Evaluation driver: re-tokenizes every sentence from the OpenCorpora MySQL
# database and reports the share of sentences tokenized exactly like the
# gold-standard segmentation.
string path = '/home/ksurent/Lingua--RU--OpenCorpora--Tokenizer/blib/lib/auto/share/dist/Lingua-RU-OpenCorpora-Tokenizer';
hash files = (
    'vectors'   : path + '/vectors.gz',
    'prefixes'  : path + '/prefixes.gz',
    'hyphens'   : path + '/hyphens.gz',
    'exceptions': path + '/exceptions.gz',
);
Tokenizer tok = new Tokenizer(files);
#printf("%N\n", tok.tokens("Он хотел было уйти, но не тут-то было: дверь за его спиной уже закрылась."));

# separator must be a character that never occurs inside a token
string separator = 'º';
Datasource dbh = new Datasource(SQL::DSMySQL, 'corpora', 'corpora', 'corpora', 'utf8', '127.0.0.1', 3306);
# one row per sentence: the raw source text plus its gold tokens joined
# with the separator, in corpus order
list sentences = dbh.selectRows(sprintf("
    select
        source,
        group_concat(tf_text order by text_forms.pos separator '%s') as separated
    from
        sentences
    join
        text_forms
    using
        (sent_id)
    group by
        source
", separator));

int correct = 0;
int total = 0;
foreach hash sentence in (sentences) {
    list tokens = tok.tokens(sentence.source);
    total++;
    # exact match against the gold segmentation counts as correct
    if (join(separator, tokens) === sentence.separated)
        correct++;
}
# FIX: correct / total was integer division (always 0 or 100); force float
# arithmetic and guard against an empty result set
float pct = total ? float(correct) / float(total) * 100 : 0.0;
printf("%d/%d %.2f%%\n", correct, total, pct);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment