Skip to content

Instantly share code, notes, and snippets.

@johnmyleswhite
Created September 6, 2013 16:07
Show Gist options
  • Select an option

  • Save johnmyleswhite/6466011 to your computer and use it in GitHub Desktop.

Select an option

Save johnmyleswhite/6466011 to your computer and use it in GitHub Desktop.
Porter stemmer translation (IN PROGRESS)
# step1ab!() gets rid of plurals and -ed or -ing. e.g.
#
#     caresses -> caress
#     ponies   -> poni
#     ties     -> ti
#     caress   -> caress
#     cats     -> cat
#
#     feed     -> feed
#     agreed   -> agree
#     disabled -> disable
#
#     matting  -> mat
#     mating   -> mate
#     meeting  -> meet
#     milling  -> mill
#     messing  -> mess
#
#     meetings -> meet
#
# Arguments: `b` is the buffer holding the word, `k0` the index of its first
# character and `k` the index of its last character.
#
# Returns the new end index. Characters are never deleted: the word is
# shortened by moving `k` to the left, and callers must ignore everything
# after the returned index.
#
# NOTE(review): the helpers below are not defined in the visible source. The
# contracts assumed here follow the call shapes already used in this file and
# the reference C implementation -- confirm against the real definitions:
#   ends(b, s, n, k0, k)      true when b[k0:k] ends with the n-character s
#   m(b, k0, j)               Porter measure of b[k0:j]
#   vowelinstem(b, k0, j)     true when b[k0:j] contains a vowel
#   doubleconsonant(b, k, k0) true when b[k-1:k] is a doubled consonant
#   cvc(k)                    consonant-vowel-consonant test ending at k
#   setto!(b, s, n, lo, hi)   overwrite b[lo:hi] with the n characters of s
function step1ab!(b::ASCIIString,
                  k0::Integer = 1,
                  k::Integer = length(b))
    # Nothing to do for an empty window (also avoids indexing b[0]).
    k0 <= k || return k

    # Step 1a: plurals.
    if b[k] == 's'
        if ends(b, "sses", 4, k0, k)
            k -= 2                              # sses -> ss
        elseif ends(b, "ies", 3, k0, k)
            setto!(b, "i", 1, k - 2, k - 2)     # ies -> i
            k -= 2
        elseif k > k0 && b[k - 1] != 's'
            k -= 1                              # s -> (nothing), but keep -ss
        end
    end

    # Step 1b: -eed, -ed, -ing.
    if ends(b, "eed", 3, k0, k)
        # The measure is taken on the stem *before* "eed", so that
        # "feed" stays "feed" while "agreed" becomes "agree".
        if m(b, k0, k - 3) > 0
            k -= 1
        end
    else
        # Length of the matched suffix, or 0 when neither suffix matches.
        n = 0
        if ends(b, "ed", 2, k0, k)
            n = 2
        elseif ends(b, "ing", 3, k0, k)
            n = 3
        end

        # The vowel test applies to both suffixes and to the stem only.
        if n > 0 && vowelinstem(b, k0, k - n)
            k -= n                              # drop the suffix ("debat")

            # Tidy up the stem that is left; each fix-up writes one position
            # past the current end, which is free because n >= 2 characters
            # were just dropped.
            if ends(b, "at", 2, k0, k)
                setto!(b, "ate", 3, k - 1, k + 1)
                k += 1
            elseif ends(b, "bl", 2, k0, k)
                setto!(b, "ble", 3, k - 1, k + 1)
                k += 1
            elseif ends(b, "iz", 2, k0, k)
                setto!(b, "ize", 3, k - 1, k + 1)
                k += 1
            elseif doubleconsonant(b, k, k0)
                # Undouble the final consonant unless it is l, s or z
                # ("matt" -> "mat", but "mill" and "mess" are kept).
                ch = b[k]
                if !(ch == 'l' || ch == 's' || ch == 'z')
                    k -= 1
                end
            elseif m(b, k0, k) == 1 && cvc(k)   # TODO: cvc() probably needs b and k0
                setto!(b, "e", 1, k + 1, k + 1) # "fil" -> "file"
                k += 1
            end
        end
    end

    return k
end
# step1c!() turns terminal y to i when there is another vowel in the stem.
#
# `b` is the buffer holding the word, `k0` the index of its first character and
# `k` the index of its last character. The buffer is modified in place through
# `b.data`; the word length does not change, so `k` is returned unchanged.
#
# NOTE(review): assumes ends(b, s, n, k0, k) tests whether b[k0:k] ends with the
# n-character suffix s, and vowelinstem(b, k0, j) tests b[k0:j] for a vowel --
# both helpers are defined outside the visible source; confirm.
function step1c!(b::ASCIIString,
                 k0::Integer = 1,
                 k::Integer = length(b))
    # The vowel must be in the stem *before* the y (hence k - 1); otherwise the
    # y itself could satisfy the test and "cry" would wrongly become "cri".
    if ends(b, "y", 1, k0, k) && vowelinstem(b, k0, k - 1)
        b.data[k] = 'i'
    end
    return k
end
# step2!() maps double suffices to single ones. so -ization
# ( = -ize plus -ation) maps to -ize etc.
# note that the string before the suffix must give m() > 0.
#
# `b` is the buffer holding the word, `k0` the index of its first character and
# `k` the index of its last character. Returns the new end index; characters
# beyond it must be ignored by the caller.
#
# The reference C code first switches on the penultimate character and then
# tests the suffixes of that group in order. Every suffix below fixes its own
# penultimate character, so scanning the groups in sequence and stopping at the
# first match is equivalent.
#
# NOTE(review): assumes ends(b, s, n, k0, k) tests whether b[k0:k] ends with the
# n-character suffix s, m(b, k0, j) is the Porter measure of b[k0:j], and
# setto!(b, s, n, lo, hi) overwrites b[lo:hi] with the n characters of s. These
# helpers are defined outside the visible source; confirm.
function step2!(b::ASCIIString, k0::Integer = 1, k::Integer = length(b))
    rules = (
        # penultimate 'a'
        ("ational", "ate"), ("tional", "tion"),
        # penultimate 'c'
        ("enci", "ence"), ("anci", "ance"),
        # penultimate 'e'
        ("izer", "ize"),
        # penultimate 'l'
        # DEPARTURE
        # To match the published algorithm, replace ("bli", "ble")
        # with ("abli", "able").
        ("bli", "ble"), ("alli", "al"), ("entli", "ent"), ("eli", "e"),
        ("ousli", "ous"),
        # penultimate 'o'
        ("ization", "ize"), ("ation", "ate"), ("ator", "ate"),
        # penultimate 's'
        ("alism", "al"), ("iveness", "ive"), ("fulness", "ful"),
        ("ousness", "ous"),
        # penultimate 't'
        ("aliti", "al"), ("iviti", "ive"), ("biliti", "ble"),
        # penultimate 'g'
        # DEPARTURE
        # To match the published algorithm, delete this rule.
        ("logi", "log"),
    )

    for (suffix, replacement) in rules
        n = length(suffix)
        # Skip suffixes longer than the word before asking ends().
        if n <= k - k0 + 1 && ends(b, suffix, n, k0, k)
            j = k - n                       # last index of the stem
            if m(b, k0, j) > 0
                len = length(replacement)
                setto!(b, replacement, len, j + 1, j + len)
                k = j + len
            end
            # The first matching suffix decides the outcome, even when the
            # measure test fails ("rational" must not fall through to -tional).
            break
        end
    end

    return k
end
# step3!() deals with -ic-, -full, -ness etc. similar strategy to step2!():
# the first matching suffix is replaced, provided the stem before it has
# measure > 0.
#
# `b` is the buffer holding the word, `k0` the index of its first character and
# `k` the index of its last character. Returns the new end index; characters
# beyond it must be ignored by the caller.
#
# The reference C code switches on the last character before testing the
# suffixes of that group. Every suffix below fixes its own last character, so
# scanning the groups in sequence and stopping at the first match is
# equivalent.
#
# NOTE(review): assumes ends(b, s, n, k0, k) tests whether b[k0:k] ends with the
# n-character suffix s, m(b, k0, j) is the Porter measure of b[k0:j], and
# setto!(b, s, n, lo, hi) overwrites b[lo:hi] with the n characters of s. These
# helpers are defined outside the visible source; confirm.
function step3!(b::ASCIIString, k0::Integer = 1, k::Integer = length(b))
    rules = (
        # last character 'e'
        ("icate", "ic"), ("ative", ""), ("alize", "al"),
        # last character 'i'
        ("iciti", "ic"),
        # last character 'l'
        ("ical", "ic"), ("ful", ""),
        # last character 's'
        ("ness", ""),
    )

    for (suffix, replacement) in rules
        n = length(suffix)
        # Skip suffixes longer than the word before asking ends().
        if n <= k - k0 + 1 && ends(b, suffix, n, k0, k)
            j = k - n                       # last index of the stem
            if m(b, k0, j) > 0
                len = length(replacement)
                if len > 0                  # an empty replacement just drops the suffix
                    setto!(b, replacement, len, j + 1, j + len)
                end
                k = j + len
            end
            break                           # the first matching suffix decides
        end
    end

    return k
end

# The unfinished draft declared this step as `step3`; keep that name working.
step3(b::ASCIIString, k0::Integer = 1, k::Integer = length(b)) = step3!(b, k0, k)
# step4!() takes off -ant, -ence etc., in context <c>vcvc<v>, i.e. only when
# the stem before the suffix has measure > 1.
#
# `b` is the buffer holding the word, `k0` the index of its first character and
# `k` the index of its last character. The buffer itself is not written to;
# the function returns the new end index and the caller must ignore the
# characters beyond it.
#
# The reference C code switches on the penultimate character and tests the
# suffixes of that group in order. Every suffix below fixes its own penultimate
# character, so scanning them in sequence and stopping at the first match is
# equivalent.
#
# NOTE(review): assumes ends(b, s, n, k0, k) tests whether b[k0:k] ends with the
# n-character suffix s and m(b, k0, j) is the Porter measure of b[k0:j]. Both
# helpers are defined outside the visible source; confirm.
function step4!(b::ASCIIString, k0::Integer = 1, k::Integer = length(b))
    suffixes = (
        "al",                               # penultimate 'a'
        "ance", "ence",                     # 'c'
        "er",                               # 'e'
        "ic",                               # 'i'
        "able", "ible",                     # 'l'
        "ant", "ement", "ment", "ent",      # 'n' (longest candidates first)
        "ion", "ou",                        # 'o' ("ou" takes care of -ous)
        "ism",                              # 's'
        "ate", "iti",                       # 't'
        "ous",                              # 'u'
        "ive",                              # 'v'
        "ize",                              # 'z'
    )

    for suffix in suffixes
        n = length(suffix)
        # Skip suffixes longer than the word before asking ends().
        if n <= k - k0 + 1 && ends(b, suffix, n, k0, k)
            j = k - n                       # last index of the stem

            # -ion is only removable after s or t ("adoption", "decision").
            # The parentheses around the || matter: && binds tighter than ||.
            removable = suffix != "ion" ||
                        (j >= k0 && (b[j] == 's' || b[j] == 't'))

            if removable && m(b, k0, j) > 1
                k = j
            end
            break                           # the first matching suffix decides
        end
    end

    return k
end

# The unfinished draft declared this step as `step4`; keep that name working.
step4(b::ASCIIString, k0::Integer = 1, k::Integer = length(b)) = step4!(b, k0, k)
# step5!() removes a final -e if m() > 1 (or if m() == 1 and the stem does not
# end consonant-vowel-consonant), and changes -ll to -l if m() > 1.
#
# `b` is the buffer holding the word, `k0` the index of its first character and
# `k` the index of its last character. The buffer itself is not written to;
# the function returns the new end index and the caller must ignore the
# characters beyond it.
#
# NOTE(review): assumes m(b, k0, j) is the Porter measure of b[k0:j],
# doubleconsonant(b, k, k0) tests b[k-1:k] for a doubled consonant, and cvc(k)
# is the consonant-vowel-consonant test ending at k. These helpers are defined
# outside the visible source; confirm.
function step5!(b::ASCIIString, k0::Integer = 1, k::Integer = length(b))
    # Nothing to do for an empty window (also avoids indexing b[0]).
    k0 <= k || return k

    # Both measure tests use the end of the word as it was on entry, even if
    # the final -e has been dropped by the time the -ll test runs.
    j = k

    if b[k] == 'e'
        a = m(b, k0, j)
        if a > 1 || (a == 1 && !cvc(k - 1))     # TODO: cvc() probably needs b and k0
            k -= 1
        end
    end

    if b[k] == 'l' && doubleconsonant(b, k, k0) && m(b, k0, j) > 1
        k -= 1
    end

    return k
end
# In stem(b, i, j), b is the buffer holding the word, and the string to be
# stemmed is from b[i] to b[j] inclusive. Typically i is 1 and j is length(b).
# The stemmer adjusts the characters b[i] ... b[j] in place and returns the new
# end-point of the string, k. Stemming never increases word length, so
# i <= k <= j; characters after b[k] are stale and must be ignored.
#
# Each step receives the current window (k0, k). Every step that can shorten
# or lengthen the stem is expected to return the updated end index, which is
# threaded into the next step.
function stem(b::String, i::Integer, j::Integer)
    k0 = i  # Start of string
    k = j   # End of string

    # DEPARTURE
    #
    # With this line, strings of length 1 or 2 don't go through the
    # stemming process, although no mention is made of this in the
    # published algorithm. Remove the line to match the published
    # algorithm.
    if k <= k0 + 1
        return k
    end

    k = step1ab!(b, k0, k)
    step1c!(b, k0, k)       # rewrites one character in place; k is unchanged
    k = step2!(b, k0, k)
    k = step3!(b, k0, k)
    k = step4!(b, k0, k)
    k = step5!(b, k0, k)

    return k
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment