n-grams/tf-idf indexer written in Ruby
helpers.rb:
#adds helper methods to standard ruby classes
class String
  #make a string stemmable
  include Stemmable

  #get array of n-grams of string
  def ngrams(len = 1)
    ngrams = []
    len = size if len > size
    (0..size - len).each do |n|
      ngrams.push(self[n...(n + len)])
    end
    ngrams
  end

  #get last character of string
  def last
    self[size - 1, 1]
  end
end

class Array
  #return hash with frequencies of items in the array
  def freqs
    inject(Hash.new(0)) { |hash, x| hash[x] += 1; hash }
  end

  #return hash with array elems as keys, all values set to 1
  def exists
    inject(Hash.new(0)) { |hash, x| hash[x] = 1; hash }
  end

  #sum up array items
  def sum
    inject(nil) { |sum, x| sum ? sum + x : x }
  end
end

class Hash
  #sort hash by values, descending
  def sort_num_value
    to_a.sort_by { |key, value| -value }
  end

  #convert all hash values to one-element arrays
  def mk_ary_val
    each_pair { |k, v| self[k] = [v] }
    self
  end
end

class Float
  #round float to n decimal places
  alias_method :round_orig, :round
  def round(n = 0)
    (self * (10.0 ** n)).round_orig * (10.0 ** (-n))
  end
end

class Dir
  #recurse through a directory, return array of files and dirs
  def self.recurse(path = '.', ext = '*', &block)
    list = []
    stoplist = ['.', '..']
    Dir.foreach(path) do |f|
      next if stoplist.include?(f)
      filename = (path == '.' ? f : path + '/' + f)
      next if f.match(/^\./)   #skip hidden files and dirs
      list << filename
      block.call(filename) if block
      if FileTest.directory?(filename)
        list.concat(Dir.recurse(filename, &block))
      end
    end
    list
  end
end
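A quick sanity check of the helpers in irb, assuming the gist files are on the load path (stemmer.rb has to be loaded before helpers.rb, since String mixes in Stemmable); the sample string is only illustrative:

require 'stemmer.rb'
require 'helpers.rb'

"banana".ngrams(2)                 # => ["ba", "an", "na", "an", "na"]
"banana".ngrams(2).freqs           # => {"ba"=>1, "an"=>2, "na"=>2}
"banana".ngrams(2).exists          # => {"ba"=>1, "an"=>1, "na"=>1}
[1, 2, 3].sum                      # => 6
{"an" => 2, "ba" => 1}.sort_num_value  # => [["an", 2], ["ba", 1]]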
indexer.rb:
# encoding: UTF-8
require 'fileutils'
require 'iconv'
require 'stemmer.rb'
require 'helpers.rb'

class Indexer
  #constant for stop words
  STOP_WORDS = []

  def initialize
    #data structures
    @ngrams_doc = Hash.new  #corpus ngrams + their document frequency
    @docs = 0               #nr of docs processed
    #options
    @ngrams = 3             #n in n-grams
    @stemming = true        #do porter stemming on words
    @stopwording = true     #remove stop words
    @upperbound = 1         #ngram may appear in at most this fraction of docs (1 = no upper cut-off)
    @lowerbound = 0         #ngram must appear in at least this fraction of docs (0 = no lower cut-off)
    @round = 6              #number of decimal places for tf-idf
    @name = 'indexer'       #give the index a name
    #directories & files
    @wdir = Dir.getwd                        #working directory
    @stw_file = 'stop_words_en.txt'          #english stop words file, default
    @tmp_dir = Dir.getwd + '/tmp'            #dir for tmp data
    @out_file = Dir.getwd + '/output.arff'   #output arff file
    @ngrams_file = @tmp_dir + '/ngrams.dtf'  #data store for corpus ngrams + df
  end

  attr_accessor :ngrams, :stemming, :stopwording, :upperbound, :lowerbound,
                :name, :stw_file, :out_file, :ngrams_file, :wdir

  #import stop words from file
  def get_stop_words
    File.open(@stw_file) do |f|
      f.each_line { |l| STOP_WORDS << l.strip }
    end
  end

  #remove stop words with a regexp from string, downcased automatically
  def rm_stop_words str
    @stopwording ? str.downcase.gsub(/(#{STOP_WORDS.join('|')})/, '') : str
  end

  #do some porter stemming
  def stem str
    if @stemming
      out = String.new
      str.split(' ').each { |s| out << s.stem << ' ' }
      out[0..out.size - 2]
    else
      str
    end
  end

  #process a file: read it, get ngrams + frequencies
  def process_file filename
    puts "processing: #{filename}"
    #get class assignment
    doc_class = get_class filename
    puts "class assignment: #{doc_class}"
    ngrams_cur = Hash.new  #current doc ngrams + freq
    doc_cur = Hash.new     #doc-freq of ngram, always 1
    #read the whole file into a string
    file_str = IO.read(filename)
    #fix encoding errors
    ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
    file_str = ic.iconv(file_str)
    #trim whitespace (multiple, trailing, leading);
    #replace single quotes with double quotes, escaping can be troublesome for some ngrams
    file_str = file_str.downcase.gsub(/\s/, " ").gsub("'", '"').squeeze(" ").strip
    #get ngrams + frequencies
    ngrams_cur = file_str.ngrams(@ngrams).freqs
    #get ngram existence
    doc_cur = file_str.ngrams(@ngrams).exists
    #merge results into global ngrams & doc-freq
    @ngrams_doc = @ngrams_doc.merge(doc_cur) do |key, val_old, val_new|
      val_old + val_new
    end
    #store ngrams of current doc
    fn = File.basename(filename)
    c_file = @tmp_dir + '/' + doc_class + '_' + fn
    File.open(c_file, 'w+') do |f|
      Marshal.dump(ngrams_cur, f)
    end
    #increase nr of docs processed
    @docs += 1
  end

  #needs to be overwritten
  def get_class f
    return 'braaack'
  end

  def process_dir
    #create tmp directory
    FileUtils.mkdir_p @tmp_dir
    #change working dir & traverse through dir
    Dir.chdir(@wdir)
    Dir.recurse(@wdir) do |f|
      if !File.directory?(f)
        process_file(f)
      end
    end
    #cut ngrams to lower and upper bounds
    a = @ngrams_doc.select { |k, v|
      v.to_f / @docs > @lowerbound && v.to_f / @docs < @upperbound
    }
    @ngrams_doc = Hash[*a.flatten]
    #store all ngrams of the corpus
    File.open(@ngrams_file, 'w') do |f|
      f.write Marshal.dump(@ngrams_doc.to_a)
    end
    puts "file #{@ngrams_file} written"
  end

  #delete tmp directory and files
  def cleanup
    if File.directory? @tmp_dir
      FileUtils.rm_r @tmp_dir
    end
  end

  #build the output arff file
  def build_arff
    ngrams = Array.new
    attributes = [['filename', 'STRING'], ['klass', 'STRING']]
    File.open(@ngrams_file) do |f|
      ngrams = Marshal.load(f)
    end
    ngrams.each do |ngram|
      attributes << [ngram[0], 'NUMERIC']
    end
    File.open(@out_file, 'w') do |out|
      #start output
      out.puts '@RELATION ' + @name
      out.puts ''
      #write attributes, escape '
      attributes.each do |a|
        #o = '@ATTRIBUTE \'' + a[0].to_s.gsub("\\","\\\\'").gsub("\'","\\\'") + '\' ' + a[1].to_s
        o = '@ATTRIBUTE \'' + a[0].to_s + '\' ' + a[1].to_s
        out.puts o
      end
      #start data section
      out.puts ''
      out.puts '@DATA'
      #enter tmp dir and get all tf files
      Dir.chdir @tmp_dir
      Dir.glob("*.*").each do |f|
        #ignore the ngrams file
        if f == @ngrams_file.split('/').last
          next
        end
        #add filename and class name to instance
        instance = ['0 ' + f.split('_')[1], '1 ' + f.split('_')[0]]
        ngrams_cur = Hash.new
        File.open(f) do |c|
          ngrams_cur = Marshal.load(c)
        end
        #sum of all term frequencies in the document
        dtf = ngrams_cur.values.sum
        #for each ngram to appear in the output
        ngrams.each_with_index do |ngram, i|
          if ngrams_cur.has_key? ngram[0]
            #calculate tf-idf
            x = ((ngrams_cur[ngram[0]].to_f / dtf) * (Math.log(@docs / ngram[1].to_f))).round(@round)
            instance << (i + 2).to_s + ' ' + x.to_s
          end
        end
        #sort instance entries by attribute index for sparse output
        instance.sort! { |x, y| x.to_i <=> y.to_i }
        #write data section
        out << '{'
        instance.each_with_index do |e, i|
          if i == (instance.size - 1)
            out.puts e.to_s + '}'
          else
            out << e.to_s + ', '
          end
        end
      end
    end
    puts "file: #{@out_file} written"
  end
end
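For reference, the weight written per n-gram in build_arff is plain tf-idf: the n-gram's frequency within the document, divided by the total number of n-grams in that document, times the natural log of (documents in corpus / documents containing the n-gram). A worked example with made-up counts (4 occurrences among 200 n-grams of one document, appearing in 10 of 50 documents):

tf    = 4 / 200.0            # term frequency within the document
idf   = Math.log(50 / 10.0)  # inverse document frequency, natural log
(tf * idf).round(6)          # => ~0.032189, using the Float#round(n) helper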
Rakefile:
require './lib/indexer.rb'

#rake task to build the index
task :buildindex do
  #mixin to overwrite the get_class method:
  #class assignment is taken implicitly from the folder structure (first hierarchy level below the corpus root)
  Indexer.class_eval do
    def get_class f
      f.split('/')[-2]
    end
  end
  #create Indexer
  i = Indexer.new
  #set options
  i.wdir = Dir.getwd + '/corpora/my_text_corpus'  #working directory, root directory of the corpus
  i.ngrams = 4                                    #n in n-grams
  i.stemming = true                               #use stemming?
  i.stopwording = true                            #use stop words?
  i.upperbound = 0.4                              #upper fraction of docs in which an ngram may appear
  i.lowerbound = 0.01                             #lower fraction of docs in which an ngram has to appear
  i.name = 'output'                               #name of the relation in the output file
  i.stw_file = 'stop_words_en.txt'                #file containing stop words
  i.out_file = Dir.getwd + '/output.arff'         #name of the index file
  i.process_dir                                   #create temporary data files containing n-grams + frequencies
  i.build_arff                                    #build the output arff file
  i.cleanup                                       #delete temporary files
end
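Assuming this task sits in the project's Rakefile next to lib/indexer.rb, and the corpus lives under corpora/my_text_corpus with one sub-folder per class, the index is built with:

rake buildindex

The resulting output.arff is a sparse ARFF file (attribute 0 = filename, attribute 1 = class, the remaining attributes are n-gram tf-idf weights), so it can be fed to tools that read ARFF, such as Weka.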
stemmer.rb:
# Ruby Porter stemmer by Ray Pereda, additions by dingsdax
module Stemmable
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }

  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }

  SUFFIX_1_REGEXP = /(
    ational |
    tional |
    enci |
    anci |
    izer |
    bli |
    alli |
    entli |
    eli |
    ousli |
    ization |
    ation |
    ator |
    alism |
    iveness |
    fulness |
    ousness |
    aliti |
    iviti |
    biliti |
    logi)$/x

  SUFFIX_2_REGEXP = /(
    al |
    ance |
    ence |
    er |
    ic |
    able |
    ible |
    ant |
    ement |
    ment |
    ent |
    ou |
    ism |
    ate |
    iti |
    ous |
    ive |
    ize)$/x

  C = "[^aeiou]"             #consonant
  V = "[aeiouy]"             #vowel
  CC = "#{C}(?>[^aeiouy]*)"  #consonant sequence
  VV = "#{V}(?>[aeiou]*)"    #vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o            #[cc]vvcc...     is m > 0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o   #[cc]vvcc[vv]    is m = 1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o  #[cc]vvccvvcc... is m > 1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o         #vowel in stem

  PCT_MARKS = ['\.', '!', '\?', ':', '\(', '\)', ' - ']  #trailing characters removed to allow stemming

  def stem_porter
    #check for trailing characters
    mark = false
    w = String.new(self)
    char = w[-1, 1]
    if char.match(/(#{PCT_MARKS.join('|')})/)
      w = w.chop
      mark = true
    end
    return w if w.length < 3
    #map initial y to Y so that the patterns never treat it as a vowel
    w[0] = 'Y' if w[0] == ?y
    #Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end
    #Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end
    #Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      #print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end
    #Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end
    #Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end
    #Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
         (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end
    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end
    #turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y
    #put the trailing character back
    mark ? w + char : w
  end

  alias stem stem_porter
end
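A quick check of the stemmer once stemmer.rb and helpers.rb are loaded (helpers.rb mixes Stemmable into String); the words are only examples:

"running".stem     # => "run"
"relational".stem  # => "relat"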
German stop word list:
aber
als
am
an
auch
auf
aus
bei
bin
bis
bist
da
dadurch
daher
darum
das
daß
dass
dein
deine
dem
den
der
des
dessen
deshalb
die
dies
dieser
dieses
doch
dort
du
durch
ein
eine
einem
einen
einer
eines
er
es
euer
eure
für
hatte
hatten
hattest
hattet
hier
hinter
ich
ihr
ihre
im
in
ist
ja
jede
jedem
jeden
jeder
jedes
jener
jenes
jetzt
kann
kannst
können
könnt
machen
mein
meine
mit
muß
mußt
musst
müssen
müßt
nach
nachdem
nein
nicht
nun
oder
seid
sein
seine
sich
sie
sind
soll
sollen
sollst
sollt
sonst
soweit
sowie
und
unser
unsere
unter
vom
von
vor
wann
warum
was
weiter
weitere
wenn
wer
werde
werden
werdet
weshalb
wie
wieder
wieso
wir
wird
wirst
wo
woher
wohin
zu
zum
zur
über
stop_words_en.txt (English stop words, the default @stw_file):
a
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
computer
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fify
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
Japanese stop word list:
これ
それ
あれ
この
その
あの
ここ
そこ
あそこ
こちら
どこ
だれ
なに
なん
何
私
貴方
貴方方
我々
私達
あの人
あのかた
彼女
彼
です
あります
おります
います
は
が
の
に
を
で
え
から
まで
より
も
どの
と
し
それで
しかし
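These lists are what get_stop_words reads (one word per line, from the file set via @stw_file) and what rm_stop_words strips using a single alternation regexp. A minimal sketch with a hypothetical three-word list; note that the pattern has no word boundaries, so stop words are also removed inside longer words:

STOP = ['der', 'die', 'das']
"Die Katze und der Hund".downcase.gsub(/(#{STOP.join('|')})/, '')
# => " katze und  hund"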