n-grams/tf-idf indexer written in Ruby
helpers.rb:
#adds helper methods to standard ruby classes
class String
  #make a string stemmable
  include Stemmable

  #get array of n-grams of string
  def ngrams(len = 1)
    ngrams = []
    len = size if len > size
    (0..size - len).each do |n|
      ngrams.push(self[n...(n + len)])
    end
    ngrams
  end

  #get last character of string
  def last
    self[size - 1, 1]
  end
end

class Array
  #return hash with frequencies of items in the array
  def freqs
    inject(Hash.new(0)) { |hash, x| hash[x] += 1; hash }
  end

  #return hash with array elems as keys, all values set to 1
  def exists
    inject(Hash.new(0)) { |hash, x| hash[x] = 1; hash }
  end

  #sum up array items
  def sum
    inject(nil) { |sum, x| sum ? sum + x : x }
  end
end

class Hash
  #sort hash by values, descending
  def sort_num_value
    to_a.sort_by { |key, value| -value }
  end

  #convert all hash values to one-element arrays
  def mk_ary_val
    each_pair { |k, v| self[k] = [v] }
    self
  end
end

class Float
  #round float to n decimal places
  alias_method :round_orig, :round
  def round(n = 0)
    (self * (10.0 ** n)).round_orig * (10.0 ** (-n))
  end
end

class Dir
  #recurse through a directory, return array of files and dirs
  def self.recurse(path = '.', ext = '*', &block)
    list = []
    stoplist = ['.', '..']
    Dir.foreach(path) do |f|
      next if stoplist.include?(f)
      filename = (path == '.' ? f : path + '/' + f)
      next if f.match(/^\./)   #skip hidden files and dirs
      list << filename
      block.call(filename) if block
      if FileTest.directory?(filename)
        list.concat(Dir.recurse(filename, &block))
      end
    end
    list
  end
end
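A quick sanity check of the helpers in irb, assuming the gist files are on the load path (stemmer.rb has to be loaded before helpers.rb, since String mixes in Stemmable); the sample string is only illustrative:

require 'stemmer.rb'
require 'helpers.rb'

"banana".ngrams(2)                 # => ["ba", "an", "na", "an", "na"]
"banana".ngrams(2).freqs           # => {"ba"=>1, "an"=>2, "na"=>2}
"banana".ngrams(2).exists          # => {"ba"=>1, "an"=>1, "na"=>1}
[1, 2, 3].sum                      # => 6
{"an" => 2, "ba" => 1}.sort_num_value  # => [["an", 2], ["ba", 1]]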
indexer.rb:
# encoding: UTF-8
require 'fileutils'
require 'iconv'
require 'stemmer.rb'
require 'helpers.rb'

class Indexer
  #constant for stop words
  STOP_WORDS = []

  def initialize
    #data structures
    @ngrams_doc = Hash.new  #corpus ngrams + their document frequency
    @docs = 0               #nr of docs processed
    #options
    @ngrams = 3             #n in n-grams
    @stemming = true        #do porter stemming on words
    @stopwording = true     #remove stop words
    @upperbound = 1         #ngram may appear in at most this fraction of docs (1 = no upper cut-off)
    @lowerbound = 0         #ngram must appear in at least this fraction of docs (0 = no lower cut-off)
    @round = 6              #number of decimal places for tf-idf
    @name = 'indexer'       #give the index a name
    #directories & files
    @wdir = Dir.getwd                        #working directory
    @stw_file = 'stop_words_en.txt'          #english stop words file, default
    @tmp_dir = Dir.getwd + '/tmp'            #dir for tmp data
    @out_file = Dir.getwd + '/output.arff'   #output arff file
    @ngrams_file = @tmp_dir + '/ngrams.dtf'  #data store for corpus ngrams + df
  end

  attr_accessor :ngrams, :stemming, :stopwording, :upperbound, :lowerbound,
                :name, :stw_file, :out_file, :ngrams_file, :wdir

  #import stop words from file
  def get_stop_words
    File.open(@stw_file) do |f|
      f.each_line { |l| STOP_WORDS << l.strip }
    end
  end

  #remove stop words with a regexp from string, downcased automatically
  def rm_stop_words str
    @stopwording ? str.downcase.gsub(/(#{STOP_WORDS.join('|')})/, '') : str
  end

  #do some porter stemming
  def stem str
    if @stemming
      out = String.new
      str.split(' ').each { |s| out << s.stem << ' ' }
      out[0..out.size - 2]
    else
      str
    end
  end

  #process a file: read it, get ngrams + frequencies
  def process_file filename
    puts "processing: #{filename}"
    #get class assignment
    doc_class = get_class filename
    puts "class assignment: #{doc_class}"
    ngrams_cur = Hash.new  #current doc ngrams + freq
    doc_cur = Hash.new     #doc-freq of ngram, always 1
    #read the whole file into a string
    file_str = IO.read(filename)
    #fix encoding errors
    ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
    file_str = ic.iconv(file_str)
    #trim whitespace (multiple, trailing, leading);
    #replace single quotes with double quotes, escaping can be troublesome for some ngrams
    file_str = file_str.downcase.gsub(/\s/, " ").gsub("'", '"').squeeze(" ").strip
    #get ngrams + frequencies
    ngrams_cur = file_str.ngrams(@ngrams).freqs
    #get ngram existence
    doc_cur = file_str.ngrams(@ngrams).exists
    #merge results into global ngrams & doc-freq
    @ngrams_doc = @ngrams_doc.merge(doc_cur) do |key, val_old, val_new|
      val_old + val_new
    end
    #store ngrams of current doc
    fn = File.basename(filename)
    c_file = @tmp_dir + '/' + doc_class + '_' + fn
    File.open(c_file, 'w+') do |f|
      Marshal.dump(ngrams_cur, f)
    end
    #increase nr of docs processed
    @docs += 1
  end

  #needs to be overwritten
  def get_class f
    return 'braaack'
  end

  def process_dir
    #create tmp directory
    FileUtils.mkdir_p @tmp_dir
    #change working dir & traverse through dir
    Dir.chdir(@wdir)
    Dir.recurse(@wdir) do |f|
      if !File.directory?(f)
        process_file(f)
      end
    end
    #cut ngrams to lower and upper bounds
    a = @ngrams_doc.select { |k, v|
      v.to_f / @docs > @lowerbound && v.to_f / @docs < @upperbound
    }
    @ngrams_doc = Hash[*a.flatten]
    #store all ngrams of the corpus
    File.open(@ngrams_file, 'w') do |f|
      f.write Marshal.dump(@ngrams_doc.to_a)
    end
    puts "file #{@ngrams_file} written"
  end

  #delete tmp directory and files
  def cleanup
    if File.directory? @tmp_dir
      FileUtils.rm_r @tmp_dir
    end
  end

  #build the output arff file
  def build_arff
    ngrams = Array.new
    attributes = [['filename', 'STRING'], ['klass', 'STRING']]
    File.open(@ngrams_file) do |f|
      ngrams = Marshal.load(f)
    end
    ngrams.each do |ngram|
      attributes << [ngram[0], 'NUMERIC']
    end
    File.open(@out_file, 'w') do |out|
      #start output
      out.puts '@RELATION ' + @name
      out.puts ''
      #write attributes, escape '
      attributes.each do |a|
        #o = '@ATTRIBUTE \'' + a[0].to_s.gsub("\\","\\\\'").gsub("\'","\\\'") + '\' ' + a[1].to_s
        o = '@ATTRIBUTE \'' + a[0].to_s + '\' ' + a[1].to_s
        out.puts o
      end
      #start data section
      out.puts ''
      out.puts '@DATA'
      #enter tmp dir and get all tf files
      Dir.chdir @tmp_dir
      Dir.glob("*.*").each do |f|
        #ignore the ngrams file
        if f == @ngrams_file.split('/').last
          next
        end
        #add filename and class name to instance
        instance = ['0 ' + f.split('_')[1], '1 ' + f.split('_')[0]]
        ngrams_cur = Hash.new
        File.open(f) do |c|
          ngrams_cur = Marshal.load(c)
        end
        #sum of all term frequencies in the document
        dtf = ngrams_cur.values.sum
        #for each ngram to appear in the output
        ngrams.each_with_index do |ngram, i|
          if ngrams_cur.has_key? ngram[0]
            #calculate tf-idf
            x = ((ngrams_cur[ngram[0]].to_f / dtf) * (Math.log(@docs / ngram[1].to_f))).round(@round)
            instance << (i + 2).to_s + ' ' + x.to_s
          end
        end
        #sort instance entries by attribute index for sparse output
        instance.sort! { |x, y| x.to_i <=> y.to_i }
        #write data section
        out << '{'
        instance.each_with_index do |e, i|
          if i == (instance.size - 1)
            out.puts e.to_s + '}'
          else
            out << e.to_s + ', '
          end
        end
      end
    end
    puts "file: #{@out_file} written"
  end
end
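For reference, the weight written per n-gram in build_arff is plain tf-idf: the n-gram's frequency within the document, divided by the total number of n-grams in that document, times the natural log of (documents in corpus / documents containing the n-gram). A worked example with made-up counts (4 occurrences among 200 n-grams of one document, appearing in 10 of 50 documents):

tf    = 4 / 200.0            # term frequency within the document
idf   = Math.log(50 / 10.0)  # inverse document frequency, natural log
(tf * idf).round(6)          # => ~0.032189, using the Float#round(n) helper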
Rakefile:
require './lib/indexer.rb'

#rake task to build the index
task :buildindex do
  #mixin to overwrite the get_class method:
  #class assignment is taken implicitly from the folder structure (first hierarchy level below the corpus root)
  Indexer.class_eval do
    def get_class f
      f.split('/')[-2]
    end
  end
  #create Indexer
  i = Indexer.new
  #set options
  i.wdir = Dir.getwd + '/corpora/my_text_corpus'  #working directory, root directory of the corpus
  i.ngrams = 4                                    #n in n-grams
  i.stemming = true                               #use stemming?
  i.stopwording = true                            #use stop words?
  i.upperbound = 0.4                              #upper fraction of docs in which an ngram may appear
  i.lowerbound = 0.01                             #lower fraction of docs in which an ngram has to appear
  i.name = 'output'                               #name of the relation in the output file
  i.stw_file = 'stop_words_en.txt'                #file containing stop words
  i.out_file = Dir.getwd + '/output.arff'         #name of the index file
  i.process_dir                                   #create temporary data files containing n-grams + frequencies
  i.build_arff                                    #build the output arff file
  i.cleanup                                       #delete temporary files
end
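Assuming this task sits in the project's Rakefile next to lib/indexer.rb, and the corpus lives under corpora/my_text_corpus with one sub-folder per class, the index is built with:

rake buildindex

The resulting output.arff is a sparse ARFF file (attribute 0 = filename, attribute 1 = class, the remaining attributes are n-gram tf-idf weights), so it can be fed to tools that read ARFF, such as Weka.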
stemmer.rb:
# Ruby Porter stemmer by Ray Pereda, additions by dingsdax
module Stemmable
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }

  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }

  SUFFIX_1_REGEXP = /(
    ational |
    tional |
    enci |
    anci |
    izer |
    bli |
    alli |
    entli |
    eli |
    ousli |
    ization |
    ation |
    ator |
    alism |
    iveness |
    fulness |
    ousness |
    aliti |
    iviti |
    biliti |
    logi)$/x

  SUFFIX_2_REGEXP = /(
    al |
    ance |
    ence |
    er |
    ic |
    able |
    ible |
    ant |
    ement |
    ment |
    ent |
    ou |
    ism |
    ate |
    iti |
    ous |
    ive |
    ize)$/x

  C = "[^aeiou]"             #consonant
  V = "[aeiouy]"             #vowel
  CC = "#{C}(?>[^aeiouy]*)"  #consonant sequence
  VV = "#{V}(?>[aeiou]*)"    #vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o            #[cc]vvcc...     is m > 0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o   #[cc]vvcc[vv]    is m = 1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o  #[cc]vvccvvcc... is m > 1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o         #vowel in stem

  PCT_MARKS = ['\.', '!', '\?', ':', '\(', '\)', ' - ']  #trailing characters removed to allow stemming

  def stem_porter
    #check for trailing characters
    mark = false
    w = String.new(self)
    char = w[-1, 1]
    if char.match(/(#{PCT_MARKS.join('|')})/)
      w = w.chop
      mark = true
    end
    return w if w.length < 3
    #map initial y to Y so that the patterns never treat it as a vowel
    w[0] = 'Y' if w[0] == ?y
    #Step 1a
    if w =~ /(ss|i)es$/
      w = $` + $1
    elsif w =~ /([^s])s$/
      w = $` + $1
    end
    #Step 1b
    if w =~ /eed$/
      w.chop! if $` =~ MGR0
    elsif w =~ /(ed|ing)$/
      stem = $`
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/             then w << "e"
        when /([^aeiouylsz])\1$/       then w.chop!
        when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
        end
      end
    end
    if w =~ /y$/
      stem = $`
      w = stem + "i" if stem =~ VOWEL_IN_STEM
    end
    #Step 2
    if w =~ SUFFIX_1_REGEXP
      stem = $`
      suffix = $1
      #print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
      if stem =~ MGR0
        w = stem + STEP_2_LIST[suffix]
      end
    end
    #Step 3
    if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
      stem = $`
      suffix = $1
      if stem =~ MGR0
        w = stem + STEP_3_LIST[suffix]
      end
    end
    #Step 4
    if w =~ SUFFIX_2_REGEXP
      stem = $`
      if stem =~ MGR1
        w = stem
      end
    elsif w =~ /(s|t)(ion)$/
      stem = $` + $1
      if stem =~ MGR1
        w = stem
      end
    end
    #Step 5
    if w =~ /e$/
      stem = $`
      if (stem =~ MGR1) ||
         (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
        w = stem
      end
    end
    if w =~ /ll$/ && w =~ MGR1
      w.chop!
    end
    #turn initial Y back to y
    w[0] = 'y' if w[0] == ?Y
    #put the trailing character back
    mark ? w + char : w
  end

  alias stem stem_porter
end
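A quick check of the stemmer once stemmer.rb and helpers.rb are loaded (helpers.rb mixes Stemmable into String); the words are only examples:

"running".stem     # => "run"
"relational".stem  # => "relat"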
German stop word list:
aber
als
am
an
auch
auf
aus
bei
bin
bis
bist
da
dadurch
daher
darum
das
daß
dass
dein
deine
dem
den
der
des
dessen
deshalb
die
dies
dieser
dieses
doch
dort
du
durch
ein
eine
einem
einen
einer
eines
er
es
euer
eure
für
hatte
hatten
hattest
hattet
hier
hinter
ich
ihr
ihre
im
in
ist
ja
jede
jedem
jeden
jeder
jedes
jener
jenes
jetzt
kann
kannst
können
könnt
machen
mein
meine
mit
muß
mußt
musst
müssen
müßt
nach
nachdem
nein
nicht
nun
oder
seid
sein
seine
sich
sie
sind
soll
sollen
sollst
sollt
sonst
soweit
sowie
und
unser
unsere
unter
vom
von
vor
wann
warum
was
weiter
weitere
wenn
wer
werde
werden
werdet
weshalb
wie
wieder
wieso
wir
wird
wirst
wo
woher
wohin
zu
zum
zur
über
stop_words_en.txt (English stop words, the default @stw_file):
a
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
computer
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fify
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
system
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
Japanese stop word list:
これ
それ
あれ
この
その
あの
ここ
そこ
あそこ
こちら
どこ
だれ
なに
なん
何
私
貴方
貴方方
我々
私達
あの人
あのかた
彼女
彼
です
あります
おります
います
は
が
の
に
を
で
え
から
まで
より
も
どの
と
し
それで
しかし
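These lists are what get_stop_words reads (one word per line, from the file set via @stw_file) and what rm_stop_words strips using a single alternation regexp. A minimal sketch with a hypothetical three-word list; note that the pattern has no word boundaries, so stop words are also removed inside longer words:

STOP = ['der', 'die', 'das']
"Die Katze und der Hund".downcase.gsub(/(#{STOP.join('|')})/, '')
# => " katze und  hund"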