johnmyleswhite · September 6, 2013 16:07
diff --git a/porter.jl b/porter.jl
 # step1ab!() gets rid of plurals and -ed or -ing. e.g.
 #
 #  caresses  ->  caress
 #  ponies    ->  poni
 #  ties      ->  ti
 #  caress    ->  caress
 #  cats      ->  cat
 #
 #  feed      ->  feed
 #  agreed    ->  agree
 #  disabled  ->  disable
 #
 #  matting   ->  mat
 #  mating    ->  mate
 #  meeting   ->  meet
 #  milling   ->  mill
 #  messing   ->  mess
 #
 #  meetings  ->  meet
 #
 # TODO: Need to return j, k
 # Strip plural endings and -ed / -ing from the word stored in b[k0:k].
 #
 #   b  - word buffer
 #   k0 - index of the first character of the word
 #   k  - index of the last character of the word
 #
 # Returns the new end index of the word. Characters past the returned index
 # are left in the buffer and are meant to be ignored by the caller.
 #
 # NOTE(review): the calls below assume m() and vowelinstem() take
 # (buffer, first index, last index), as the argument lists used in this file
 # suggest -- confirm against the helper definitions.
 function step1ab!(b::ASCIIString,
                   k0::Integer = 1,
                   k::Integer = length(b))
     # Nothing to do for an empty range.
     if k < k0
         return k
     end

     # Step 1a: plurals.
     if b[k] == 's'
         if ends(b, "sses", 4, k0, k)
             # caresses -> caress: drop the trailing "es" by moving the end index
             k -= 2
         elseif ends(b, "ies", 3, k0, k)
             # ponies -> poni
             # NOTE(review): the dropped characters are blanked rather than
             # excluded by moving k; confirm this is what setto!() expects.
             setto!(b, "i  ", 3, k - 2, k)
         elseif k > k0 && b[k - 1] != 's'
             # cats -> cat, but caress -> caress
             k -= 1
         end
     end

     # Step 1b: -eed, -ed, -ing.
     if ends(b, "eed", 3, k0, k)
         # agreed -> agree, feed -> feed: the measure is taken over the stem
         # in front of "eed", not over the whole word.
         if m(b, k0, k - 3) > 0
             k -= 1
         end
     else
         # Length of the matched suffix, or 0 when neither suffix is present.
         suffixlen = ends(b, "ed", 2, k0, k) ? 2 :
                     (ends(b, "ing", 3, k0, k) ? 3 : 0)
         # The vowel test looks only at the stem in front of the suffix.
         if suffixlen > 0 && vowelinstem(b, k0, k - suffixlen)
             # Remove the suffix before looking at what the stem ends with.
             k -= suffixlen
             # NOTE(review): the setto!() calls are kept as written; confirm
             # their index arguments, and whether k must be advanced after
             # a longer replacement is written.
             if ends(b, "at", 2, k0, k)
                 setto!(b, "ate", 3, k - 2, k)
             elseif ends(b, "bl", 2, k0, k)
                 setto!(b, "ble", 3, k - 2, k)
             elseif ends(b, "iz", 2, k0, k)
                 setto!(b, "ize", 3, k - 2, k)
             elseif doubleconsonant(b, k, k0)
                 # matting -> mat, but milling -> mill and messing -> mess
                 k -= 1
                 ch = b[k]
                 if ch == 'l' || ch == 's' || ch == 'z'
                     k += 1
                 end
             elseif m(b, k0, k) == 1 && cvc(k) # TODO: Be careful
                 setto!(b, "e", 1, k, k)
             end
         end
     end
     return k
 end

 # step1c!() turns terminal y to i when there is another vowel in the stem
 # Turn a terminal 'y' into 'i' when the stem in front of it contains a vowel.
 #
 #   b  - word buffer; modified in place through b.data
 #   k0 - index of the first character of the word
 #   k  - index of the last character of the word
 function step1c!(b::ASCIIString,
                  k0::Integer = 1,
                  k::Integer = length(b))
     # The character at k is tested directly because that is the position
     # overwritten below. `k > k0` also keeps a one-letter word untouched.
     # NOTE(review): assumes vowelinstem(buffer, first, last) scans
     # b[first:last]; passing k - 1 keeps the 'y' itself out of the test.
     if k > k0 && b[k] == 'y' && vowelinstem(b, k0, k - 1)
         b.data[k] = 'i'
     end
 end


 # step2!() maps double suffixes to single ones, so -ization
 # ( = -ize plus -ation) maps to -ize, etc.
 # note that the string before the suffix must give m() > 0.

 # Map a double suffix onto a single one, e.g. -ization -> -ize.
 #
 #   b  - word buffer
 #   ko - index of the first character of the word
 #   k  - index of the last character of the word
 #
 # The penultimate character selects a group of candidate suffixes; within a
 # group the first suffix that matches is handed to r() and no other suffix
 # is tried.
 #
 # NOTE(review): ends() and r() are called without ko/k here, unlike the
 # ends() calls in step1ab!(); the r() calls are still marked TODO in this
 # file. Confirm both against the helper definitions.
 function step2!(b::ASCIIString, ko::Integer = 1, k::Integer = length(b))
     # A word needs at least two characters to have a penultimate one.
     if k <= ko
         return
     end

     c = b[k - 1]
     if c == 'a'
         if ends(b, "ational", 7)
             r(3, "ate") # TODO: Fix r calls
         elseif ends(b, "tional", 6)
             r(4, "tion") # TODO: Fix r calls
         end
     elseif c == 'c'
         if ends(b, "enci", 4)
             r(4, "ence") # TODO: Fix r calls
         elseif ends(b, "anci", 4)
             r(4, "ance") # TODO: Fix r calls
         end
     elseif c == 'e'
         if ends(b, "izer", 4)
             r(3, "ize") # TODO: Fix r calls
         end
     elseif c == 'l'
         # DEPARTURE
         # To match the published algorithm, replace the "bli" test with
         # case 'l': if (ends(4, "abli")) { r(4, "able"); break; }
         if ends(b, "bli", 3)
             r(3, "ble") # TODO: Fix r calls
         elseif ends(b, "alli", 4)
             r(2, "al") # TODO: Fix r calls
         elseif ends(b, "entli", 5)
             r(3, "ent") # TODO: Fix r calls
         elseif ends(b, "eli", 3)
             r(1, "e") # TODO: Fix r calls
         elseif ends(b, "ousli", 5)
             r(3, "ous") # TODO: Fix r calls
         end
     elseif c == 'o'
         if ends(b, "ization", 7)
             r(3, "ize") # TODO: Fix r calls
         elseif ends(b, "ation", 5)
             r(3, "ate") # TODO: Fix r calls
         elseif ends(b, "ator", 4)
             r(3, "ate") # TODO: Fix r calls
         end
     elseif c == 's'
         if ends(b, "alism", 5)
             r(2, "al") # TODO: Fix r calls
         elseif ends(b, "iveness", 7)
             r(3, "ive") # TODO: Fix r calls
         elseif ends(b, "fulness", 7)
             r(3, "ful") # TODO: Fix r calls
         elseif ends(b, "ousness", 7)
             r(3, "ous") # TODO: Fix r calls
         end
     elseif c == 't'
         if ends(b, "aliti", 5)
             r(2, "al") # TODO: Fix r calls
         elseif ends(b, "iviti", 5)
             r(3, "ive") # TODO: Fix r calls
         elseif ends(b, "biliti", 6)
             r(3, "ble") # TODO: Fix r calls
         end
     elseif c == 'g'
         # DEPARTURE
         # To match the published algorithm, delete this branch
         if ends(b, "logi", 4)
             r(3, "log") # TODO: Fix r calls
         end
     end
 end

 # step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
 # Handle -icate, -ative, -alize, -iciti, -ical, -ful and -ness.
 #
 # The last character of the word selects a group of candidate suffixes;
 # within a group the first suffix that matches is handed to r() and no
 # other suffix is tried.
 #
 # NOTE(review): like step4(), this function takes no arguments and reads
 # `b` and `k` from the enclosing scope, and it calls ends()/r() in their
 # (length, string) form. Confirm against the helper definitions, and
 # consider giving it the (b, k0, k) signature used by step2!().
 function step3()
     c = b[k]
     if c == 'e'
         if ends(5, "icate")
             r(2, "ic")
         elseif ends(5, "ative")
             r(0, "")
         elseif ends(5, "alize")
             r(2, "al")
         end
     elseif c == 'i'
         if ends(5, "iciti")
             r(2, "ic")
         end
     elseif c == 'l'
         if ends(4, "ical")
             r(2, "ic")
         elseif ends(3, "ful")
             r(0, "")
         end
     elseif c == 's'
         if ends(4, "ness")
             r(0, "")
         end
     end
 end

 # step4() takes off -ant, -ence etc., in context <c>vcvc<v>.

 # Take off -ant, -ence etc. when the remaining stem has measure > 1.
 #
 # The penultimate character selects a group of candidate suffixes. If none
 # of them matches, the word is left alone. If one matches and m() > 1, the
 # end of the word moves back to `j`.
 #
 # Returns the resulting end index: `j` when a suffix is removed, otherwise
 # the current `k`. The index is returned rather than assigned because an
 # assignment to `k` inside the function would make `k` a local variable and
 # leave the `b[k - 1]` read below undefined.
 #
 # NOTE(review): `b`, `k` and `j` are read from the enclosing scope, and `j`
 # is presumably set by ends() to the last index of the stem in front of the
 # matched suffix -- confirm against the helper definitions.
 function step4()
     c = b[k - 1]
     # `||` short-circuits, so the first suffix that matches wins and the
     # later ends() calls of the group are not made.
     if c == 'a'
         matched = ends(2, "al")
     elseif c == 'c'
         matched = ends(4, "ance") || ends(4, "ence")
     elseif c == 'e'
         matched = ends(2, "er")
     elseif c == 'i'
         matched = ends(2, "ic")
     elseif c == 'l'
         matched = ends(4, "able") || ends(4, "ible")
     elseif c == 'n'
         matched = ends(3, "ant") || ends(5, "ement") ||
                   ends(4, "ment") || ends(3, "ent")
     elseif c == 'o'
         # -ion is only removed after 's' or 't'. The guard is `j >= 1`
         # because indices are 1-based; b[0] would be out of bounds.
         # The second test takes care of -ous.
         matched = (ends(3, "ion") && j >= 1 && (b[j] == 's' || b[j] == 't')) ||
                   ends(2, "ou")
     elseif c == 's'
         matched = ends(3, "ism")
     elseif c == 't'
         matched = ends(3, "ate") || ends(3, "iti")
     elseif c == 'u'
         matched = ends(3, "ous")
     elseif c == 'v'
         matched = ends(3, "ive")
     elseif c == 'z'
         matched = ends(3, "ize")
     else
         matched = false
     end

     return (matched && m() > 1) ? j : k
 end

 # step5!() removes a final -e if m() > 1, and changes -ll to -l if m() > 1
 # Remove a final -e if m() > 1, and change -ll to -l if m() > 1.
 #
 #   b  - word buffer
 #   k0 - index of the first character of the word (defaults to 1)
 #   k  - index of the last character of the word (defaults to length(b))
 #
 # Returns the new end index of the word, like step1ab!().
 #
 # NOTE(review): m() and doubleconsonant() are called with the argument
 # lists step1ab!() uses; cvc() is kept in the single-index form used
 # elsewhere in this file. Confirm all three against the helper definitions.
 function step5!(b::ASCIIString,
                 k0::Integer = 1,
                 k::Integer = length(b))
     # Nothing to do for an empty range.
     if k < k0
         return k
     end

     # Both measure tests run over the word as it was on entry, so the
     # original end index is kept in j.
     j = k
     if b[k] == 'e'
         a = m(b, k0, j)
         if a > 1 || (a == 1 && !cvc(k - 1))
             k -= 1
         end
     end
     if b[k] == 'l' && doubleconsonant(b, k, k0) && m(b, k0, j) > 1
         k -= 1
     end
     return k
 end

 # In stem(b, i, j), b is the string to be stemmed and the word runs from
 # b[i] to b[j] inclusive. The stemmer adjusts the characters b[i] ... b[j]
 # and returns the new end-point of the word, k. Stemming never increases
 # word length, so i <= k <= j.
 #
 # (From the C original, where p was a char pointer, i was typically zero and
 # p[j+1] == '\0': to turn the stemmer into a module, declare 'stem' as
 # extern, and delete the remainder of this file.)

 # Stem the word stored in b[i:j] and return the new end index of the word.
 #
 #   b - string holding the word
 #   i - index of the first character of the word
 #   j - index of the last character of the word
 function stem(b::String, i::Integer, j::Integer)
     k0 = i # Start of string
     k = j  # End of string

     # DEPARTURE
     #
     # With this check, strings of length 1 or 2 don't go through the
     # stemming process, although no mention is made of this in the
     # published algorithm. Remove the check to match the published
     # algorithm.
     if k <= k0 + 1
         return k
     end

     # Each step is called by the name and argument list it is defined with
     # in this file. step1ab!() reports the new end of the word.
     k = step1ab!(b, k0, k)
     step1c!(b, k0, k)
     step2!(b, k0, k)
     # NOTE(review): nothing from step2!() onwards is used to update k here,
     # so suffixes removed there do not reach the returned index. Thread k
     # through once all steps share the (b, k0, k) signature.
     step3()
     step4()
     step5!(b)

     return k
 end
	# step1ab!() gets rid of plurals and -ed or -ing. e.g.
	#
	# caresses -> caress
	# ponies -> poni
	# ties -> ti
	# caress -> caress
	# cats -> cat
	#
	# feed -> feed
	# agreed -> agree
	# disabled -> disable
	#
	# matting -> mat
	# mating -> mate
	# meeting -> meet
	# milling -> mill
	# messing -> mess
	#
	# meetings -> meet
	#
	# TODO: Need to return j, k
	# Strip plural endings and -ed / -ing from the word stored in b[k0:k].
	#
	#   b  - word buffer
	#   k0 - index of the first character of the word
	#   k  - index of the last character of the word
	#
	# Returns the new end index of the word. Characters past the returned index
	# are left in the buffer and are meant to be ignored by the caller.
	#
	# NOTE(review): the calls below assume m() and vowelinstem() take
	# (buffer, first index, last index), as the argument lists used in this file
	# suggest -- confirm against the helper definitions.
	function step1ab!(b::ASCIIString,
	                  k0::Integer = 1,
	                  k::Integer = length(b))
	    # Nothing to do for an empty range.
	    if k < k0
	        return k
	    end

	    # Step 1a: plurals.
	    if b[k] == 's'
	        if ends(b, "sses", 4, k0, k)
	            # caresses -> caress: drop the trailing "es" by moving the end index
	            k -= 2
	        elseif ends(b, "ies", 3, k0, k)
	            # ponies -> poni
	            # The replacement is padded to the three characters it overwrites.
	            # NOTE(review): the dropped characters are blanked rather than
	            # excluded by moving k; confirm this is what setto!() expects.
	            setto!(b, "i  ", 3, k - 2, k)
	        elseif k > k0 && b[k - 1] != 's'
	            # cats -> cat, but caress -> caress
	            k -= 1
	        end
	    end

	    # Step 1b: -eed, -ed, -ing.
	    if ends(b, "eed", 3, k0, k)
	        # agreed -> agree, feed -> feed: the measure is taken over the stem
	        # in front of "eed", not over the whole word.
	        if m(b, k0, k - 3) > 0
	            k -= 1
	        end
	    else
	        # Length of the matched suffix, or 0 when neither suffix is present.
	        suffixlen = ends(b, "ed", 2, k0, k) ? 2 :
	                    (ends(b, "ing", 3, k0, k) ? 3 : 0)
	        # The vowel test looks only at the stem in front of the suffix.
	        if suffixlen > 0 && vowelinstem(b, k0, k - suffixlen)
	            # Remove the suffix before looking at what the stem ends with.
	            k -= suffixlen
	            # NOTE(review): the setto!() calls are kept as written; confirm
	            # their index arguments, and whether k must be advanced after
	            # a longer replacement is written.
	            if ends(b, "at", 2, k0, k)
	                setto!(b, "ate", 3, k - 2, k)
	            elseif ends(b, "bl", 2, k0, k)
	                setto!(b, "ble", 3, k - 2, k)
	            elseif ends(b, "iz", 2, k0, k)
	                setto!(b, "ize", 3, k - 2, k)
	            elseif doubleconsonant(b, k, k0)
	                # matting -> mat, but milling -> mill and messing -> mess
	                k -= 1
	                ch = b[k]
	                if ch == 'l' || ch == 's' || ch == 'z'
	                    k += 1
	                end
	            elseif m(b, k0, k) == 1 && cvc(k) # TODO: Be careful
	                setto!(b, "e", 1, k, k)
	            end
	        end
	    end
	    return k
	end

	# step1c!() turns terminal y to i when there is another vowel in the stem
	# Turn a terminal 'y' into 'i' when the stem in front of it contains a vowel.
	#
	#   b  - word buffer; modified in place through b.data
	#   k0 - index of the first character of the word
	#   k  - index of the last character of the word
	function step1c!(b::ASCIIString,
	                 k0::Integer = 1,
	                 k::Integer = length(b))
	    # The character at k is tested directly because that is the position
	    # overwritten below. `k > k0` also keeps a one-letter word untouched.
	    # NOTE(review): assumes vowelinstem(buffer, first, last) scans
	    # b[first:last]; passing k - 1 keeps the 'y' itself out of the test.
	    if k > k0 && b[k] == 'y' && vowelinstem(b, k0, k - 1)
	        b.data[k] = 'i'
	    end
	end


	# step2!() maps double suffixes to single ones, so -ization
	# ( = -ize plus -ation) maps to -ize, etc.
	# note that the string before the suffix must give m() > 0.

	# Map a double suffix onto a single one, e.g. -ization -> -ize.
	#
	#   b  - word buffer
	#   ko - index of the first character of the word
	#   k  - index of the last character of the word
	#
	# The penultimate character selects a group of candidate suffixes; within a
	# group the first suffix that matches is handed to r() and no other suffix
	# is tried.
	#
	# NOTE(review): ends() and r() are called without ko/k here, unlike the
	# ends() calls in step1ab!(); the r() calls are still marked TODO in this
	# file. Confirm both against the helper definitions.
	function step2!(b::ASCIIString, ko::Integer = 1, k::Integer = length(b))
	    # A word needs at least two characters to have a penultimate one.
	    if k <= ko
	        return
	    end

	    c = b[k - 1]
	    if c == 'a'
	        if ends(b, "ational", 7)
	            r(3, "ate") # TODO: Fix r calls
	        elseif ends(b, "tional", 6)
	            r(4, "tion") # TODO: Fix r calls
	        end
	    elseif c == 'c'
	        if ends(b, "enci", 4)
	            r(4, "ence") # TODO: Fix r calls
	        elseif ends(b, "anci", 4)
	            r(4, "ance") # TODO: Fix r calls
	        end
	    elseif c == 'e'
	        if ends(b, "izer", 4)
	            r(3, "ize") # TODO: Fix r calls
	        end
	    elseif c == 'l'
	        # DEPARTURE
	        # To match the published algorithm, replace the "bli" test with
	        # case 'l': if (ends(4, "abli")) { r(4, "able"); break; }
	        if ends(b, "bli", 3)
	            r(3, "ble") # TODO: Fix r calls
	        elseif ends(b, "alli", 4)
	            r(2, "al") # TODO: Fix r calls
	        elseif ends(b, "entli", 5)
	            r(3, "ent") # TODO: Fix r calls
	        elseif ends(b, "eli", 3)
	            r(1, "e") # TODO: Fix r calls
	        elseif ends(b, "ousli", 5)
	            r(3, "ous") # TODO: Fix r calls
	        end
	    elseif c == 'o'
	        if ends(b, "ization", 7)
	            r(3, "ize") # TODO: Fix r calls
	        elseif ends(b, "ation", 5)
	            r(3, "ate") # TODO: Fix r calls
	        elseif ends(b, "ator", 4)
	            r(3, "ate") # TODO: Fix r calls
	        end
	    elseif c == 's'
	        if ends(b, "alism", 5)
	            r(2, "al") # TODO: Fix r calls
	        elseif ends(b, "iveness", 7)
	            r(3, "ive") # TODO: Fix r calls
	        elseif ends(b, "fulness", 7)
	            r(3, "ful") # TODO: Fix r calls
	        elseif ends(b, "ousness", 7)
	            r(3, "ous") # TODO: Fix r calls
	        end
	    elseif c == 't'
	        if ends(b, "aliti", 5)
	            r(2, "al") # TODO: Fix r calls
	        elseif ends(b, "iviti", 5)
	            r(3, "ive") # TODO: Fix r calls
	        elseif ends(b, "biliti", 6)
	            r(3, "ble") # TODO: Fix r calls
	        end
	    elseif c == 'g'
	        # DEPARTURE
	        # To match the published algorithm, delete this branch
	        if ends(b, "logi", 4)
	            r(3, "log") # TODO: Fix r calls
	        end
	    end
	end

	# step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
	# Handle -icate, -ative, -alize, -iciti, -ical, -ful and -ness.
	#
	# The last character of the word selects a group of candidate suffixes;
	# within a group the first suffix that matches is handed to r() and no
	# other suffix is tried.
	#
	# NOTE(review): like step4(), this function takes no arguments and reads
	# `b` and `k` from the enclosing scope, and it calls ends()/r() in their
	# (length, string) form. Confirm against the helper definitions, and
	# consider giving it the (b, k0, k) signature used by step2!().
	function step3()
	    c = b[k]
	    if c == 'e'
	        if ends(5, "icate")
	            r(2, "ic")
	        elseif ends(5, "ative")
	            r(0, "")
	        elseif ends(5, "alize")
	            r(2, "al")
	        end
	    elseif c == 'i'
	        if ends(5, "iciti")
	            r(2, "ic")
	        end
	    elseif c == 'l'
	        if ends(4, "ical")
	            r(2, "ic")
	        elseif ends(3, "ful")
	            r(0, "")
	        end
	    elseif c == 's'
	        if ends(4, "ness")
	            r(0, "")
	        end
	    end
	end

	# step4() takes off -ant, -ence etc., in context <c>vcvc<v>.

	# Take off -ant, -ence etc. when the remaining stem has measure > 1.
	#
	# The penultimate character selects a group of candidate suffixes. If none
	# of them matches, the word is left alone. If one matches and m() > 1, the
	# end of the word moves back to `j`.
	#
	# Returns the resulting end index: `j` when a suffix is removed, otherwise
	# the current `k`. The index is returned rather than assigned because an
	# assignment to `k` inside the function would make `k` a local variable and
	# leave the `b[k - 1]` read below undefined.
	#
	# NOTE(review): `b`, `k` and `j` are read from the enclosing scope, and `j`
	# is presumably set by ends() to the last index of the stem in front of the
	# matched suffix -- confirm against the helper definitions.
	function step4()
	    c = b[k - 1]
	    # `||` short-circuits, so the first suffix that matches wins and the
	    # later ends() calls of the group are not made.
	    if c == 'a'
	        matched = ends(2, "al")
	    elseif c == 'c'
	        matched = ends(4, "ance") || ends(4, "ence")
	    elseif c == 'e'
	        matched = ends(2, "er")
	    elseif c == 'i'
	        matched = ends(2, "ic")
	    elseif c == 'l'
	        matched = ends(4, "able") || ends(4, "ible")
	    elseif c == 'n'
	        matched = ends(3, "ant") || ends(5, "ement") ||
	                  ends(4, "ment") || ends(3, "ent")
	    elseif c == 'o'
	        # -ion is only removed after 's' or 't'. The guard is `j >= 1`
	        # because indices are 1-based; b[0] would be out of bounds.
	        # The second test takes care of -ous.
	        matched = (ends(3, "ion") && j >= 1 && (b[j] == 's' || b[j] == 't')) ||
	                  ends(2, "ou")
	    elseif c == 's'
	        matched = ends(3, "ism")
	    elseif c == 't'
	        matched = ends(3, "ate") || ends(3, "iti")
	    elseif c == 'u'
	        matched = ends(3, "ous")
	    elseif c == 'v'
	        matched = ends(3, "ive")
	    elseif c == 'z'
	        matched = ends(3, "ize")
	    else
	        matched = false
	    end

	    return (matched && m() > 1) ? j : k
	end

	# step5!() removes a final -e if m() > 1, and changes -ll to -l if m() > 1
	# Remove a final -e if m() > 1, and change -ll to -l if m() > 1.
	#
	#   b  - word buffer
	#   k0 - index of the first character of the word (defaults to 1)
	#   k  - index of the last character of the word (defaults to length(b))
	#
	# Returns the new end index of the word, like step1ab!().
	#
	# NOTE(review): m() and doubleconsonant() are called with the argument
	# lists step1ab!() uses; cvc() is kept in the single-index form used
	# elsewhere in this file. Confirm all three against the helper definitions.
	function step5!(b::ASCIIString,
	                k0::Integer = 1,
	                k::Integer = length(b))
	    # Nothing to do for an empty range.
	    if k < k0
	        return k
	    end

	    # Both measure tests run over the word as it was on entry, so the
	    # original end index is kept in j.
	    j = k
	    if b[k] == 'e'
	        a = m(b, k0, j)
	        if a > 1 || (a == 1 && !cvc(k - 1))
	            k -= 1
	        end
	    end
	    if b[k] == 'l' && doubleconsonant(b, k, k0) && m(b, k0, j) > 1
	        k -= 1
	    end
	    return k
	end

	# In stem(b, i, j), b is the string to be stemmed and the word runs from
	# b[i] to b[j] inclusive. The stemmer adjusts the characters b[i] ... b[j]
	# and returns the new end-point of the word, k. Stemming never increases
	# word length, so i <= k <= j.
	#
	# (From the C original, where p was a char pointer, i was typically zero and
	# p[j+1] == '\0': to turn the stemmer into a module, declare 'stem' as
	# extern, and delete the remainder of this file.)

	# Stem the word stored in b[i:j] and return the new end index of the word.
	#
	#   b - string holding the word
	#   i - index of the first character of the word
	#   j - index of the last character of the word
	function stem(b::String, i::Integer, j::Integer)
	    k0 = i # Start of string
	    k = j  # End of string

	    # DEPARTURE
	    #
	    # With this check, strings of length 1 or 2 don't go through the
	    # stemming process, although no mention is made of this in the
	    # published algorithm. Remove the check to match the published
	    # algorithm.
	    if k <= k0 + 1
	        return k
	    end

	    # Each step is called by the name and argument list it is defined with
	    # in this file. step1ab!() reports the new end of the word.
	    k = step1ab!(b, k0, k)
	    step1c!(b, k0, k)
	    step2!(b, k0, k)
	    # NOTE(review): nothing from step2!() onwards is used to update k here,
	    # so suffixes removed there do not reach the returned index. Thread k
	    # through once all steps share the (b, k0, k) signature.
	    step3()
	    step4()
	    step5!(b)

	    return k
	end
No results found