Created
September 6, 2013 16:07
-
-
Save johnmyleswhite/6466011 to your computer and use it in GitHub Desktop.
Porter stemmer translation (IN PROGRESS)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# step1ab!() gets rid of plurals and -ed or -ing. e.g.
#
#     caresses  ->  caress
#     ponies    ->  poni
#     ties      ->  ti
#     caress    ->  caress
#     cats      ->  cat
#
#     feed      ->  feed
#     agreed    ->  agree
#     disabled  ->  disable
#
#     matting   ->  mat
#     mating    ->  mate
#     meeting   ->  meet
#     milling   ->  mill
#     messing   ->  mess
#
#     meetings  ->  meet
#
# Arguments:
#   b  - the word buffer being stemmed
#   k0 - index of the first character of the word (default 1)
#   k  - index of the last character of the word (default length(b))
#
# Returns the (possibly reduced) end index k. Characters past the returned
# index are ignored by the caller rather than deleted from b.
#
# TODO: Need to return j, k
# NOTE(review): the helpers ends, setto!, m, vowelinstem, doubleconsonant and
# cvc are defined elsewhere; their call forms are kept as the original wrote
# them and should be confirmed against the helper definitions.
function step1ab!(b::ASCIIString,
                  k0::Integer = 1,
                  k::Integer = length(b))
    # Plurals. The three suffix rules are mutually exclusive, so they form a
    # single if/elseif chain (at most one of them fires).
    if b[k] == 's'
        if ends(b, "sses", 4, k0, k)
            # Instead of deleting text, the trailing characters are just
            # ignored by moving the end index.
            # TODO: Consider adding whitespace instead
            k -= 2
        elseif ends(b, "ies", 3, k0, k)
            # Introduce whitespace?
            # NOTE(review): the literal is 2 characters but the declared
            # length is 3 - confirm the intended padding.
            setto!(b, "i ", 3, k - 2, k)
        elseif k > k0 && b[k - 1] != 's'
            # Plain plural "-s" (but not "-ss"); the k > k0 guard keeps the
            # look-behind inside the word.
            k -= 1
        end
    end

    if ends(b, "eed", 3, k0, k)
        if m(b, k0, k) > 0
            k -= 1
        end
    elseif (ends(b, "ed", 2, k0, k) || ends(b, "ing", 3, k0, k)) &&
           vowelinstem(b, k0, k)
        # The vowel test must gate both suffixes, hence the parentheses:
        # `a || b && c` parses as `a || (b && c)`.
        #
        # "debated"
        # "debating"
        # Here the index trick seems much more effective than whitespace
        # NOTE(review): the reference C code sets k = j (end of the stem)
        # here; this translation does not track j yet (see TODO above), so
        # the checks below still see the suffix. Confirm the index handling.
        if ends(b, "at", 2, k0, k)
            setto!(b, "ate", 3, k - 2, k)
        elseif ends(b, "bl", 2, k0, k)
            setto!(b, "ble", 3, k - 2, k)
        elseif ends(b, "iz", 2, k0, k)
            setto!(b, "ize", 3, k - 2, k)
        elseif doubleconsonant(b, k, k0)
            k -= 1 # Introduce whitespace here?
            ch = b[k]
            # Keep the doubled letter for -ll, -ss and -zz.
            if ch == 'l' || ch == 's' || ch == 'z'
                k += 1
            end
        elseif m(b, k0, k) == 1 && cvc(k) # TODO: Be careful
            setto!(b, "e", 1, k, k)
        end
    end

    return k
end
# step1c!() rewrites a terminal "y" as "i", but only when vowelinstem()
# also reports a vowel for the range k0..k.
#
# Arguments:
#   b  - the word buffer; its last character is overwritten in place
#   k0 - index of the first character of the word (default 1)
#   k  - index of the last character of the word (default length(b))
#
# Evaluates to 'i' when the substitution is made and to nothing otherwise.
function step1c!(b::ASCIIString,
                 k0::Integer = 1,
                 k::Integer = length(b))
    terminal_y = ends(b, "y", 1)
    # Short-circuit: vowelinstem() is only consulted for words ending in "y".
    if !(terminal_y && vowelinstem(b, k0, k))
        return nothing
    end
    return b.data[k] = 'i'
end
# step2!() maps double suffixes to single ones, so -ization
# ( = -ize plus -ation) maps to -ize etc.
# Note that the string before the suffix must give m() > 0.
#
# Arguments:
#   b  - the word buffer being stemmed
#   k0 - index of the first character of the word (default 1)
#   k  - index of the last character of the word (default length(b))
#
# The candidate rules are selected by the penultimate character b[k - 1].
# Within a group the rules are tried in order and only the first matching
# suffix is rewritten. Returns nothing.
#
# NOTE(review): ends() and r() are defined elsewhere; the (length, string)
# call forms below are the ones the original used. The m() > 0 condition
# is presumably enforced inside r() - confirm.
function step2!(b::ASCIIString, k0::Integer = 1, k::Integer = length(b))
    # A word needs at least two characters to have a penultimate one.
    k > k0 || return nothing

    penult = b[k - 1]
    rules = if penult == 'a'
        (("ational", "ate"), ("tional", "tion"))
    elseif penult == 'c'
        (("enci", "ence"), ("anci", "ance"))
    elseif penult == 'e'
        (("izer", "ize"),)
    elseif penult == 'l'
        # DEPARTURE
        # To match the published algorithm, replace the "bli" -> "ble"
        # rule with "abli" -> "able".
        (("bli", "ble"), ("alli", "al"), ("entli", "ent"), ("eli", "e"),
         ("ousli", "ous"))
    elseif penult == 'o'
        (("ization", "ize"), ("ation", "ate"), ("ator", "ate"))
    elseif penult == 's'
        (("alism", "al"), ("iveness", "ive"), ("fulness", "ful"),
         ("ousness", "ous"))
    elseif penult == 't'
        (("aliti", "al"), ("iviti", "ive"), ("biliti", "ble"))
    elseif penult == 'g'
        # DEPARTURE
        # To match the published algorithm, delete this rule.
        (("logi", "log"),)
    else
        ()
    end

    for (suffix, replacement) in rules
        if ends(b, suffix, length(suffix))
            r(length(replacement), replacement) # TODO: Fix r calls
            break # first matching suffix wins
        end
    end
    return nothing
end
# step3() deals with -ic-, -full, -ness etc. Similar strategy to step2:
# the candidate rules are selected by a single character (here the final
# one, b[k]), tried in order, and only the first matching suffix is
# rewritten. Returns nothing.
#
# NOTE(review): this function takes no arguments; `b` and `k` are free
# variables that must be supplied by the enclosing scope. ends() and r() are
# defined elsewhere and are called here in the (length, string) form.
# TODO: pass b, k0 and k explicitly once the helper signatures are settled.
function step3()
    final = b[k]
    rules = if final == 'e'
        (("icate", "ic"), ("ative", ""), ("alize", "al"))
    elseif final == 'i'
        (("iciti", "ic"),)
    elseif final == 'l'
        (("ical", "ic"), ("ful", ""))
    elseif final == 's'
        (("ness", ""),)
    else
        ()
    end

    for (suffix, replacement) in rules
        if ends(length(suffix), suffix)
            r(length(replacement), replacement)
            break # first matching suffix wins
        end
    end
    return nothing
end
# step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
#
# The candidate suffixes are selected by the penultimate character
# b[k - 1]. If none of them matches, the word is left alone. If one
# matches and m() > 1, the suffix is dropped.
#
# Returns the end index of the word: j when a suffix is dropped, k otherwise.
# The index is returned rather than assigned because assigning to k inside
# the function would create a new local instead of updating the caller's k.
#
# NOTE(review): this function takes no arguments; `b`, `k` and `j` are free
# variables that must be supplied by the enclosing scope, and `j` is
# presumably set by a successful ends() call - confirm against the helpers.
# TODO: pass b, k0 and k explicitly once the helper signatures are settled.
function step4()
    penult = b[k - 1]
    matched = if penult == 'a'
        ends(2, "al")
    elseif penult == 'c'
        ends(4, "ance") || ends(4, "ence")
    elseif penult == 'e'
        ends(2, "er")
    elseif penult == 'i'
        ends(2, "ic")
    elseif penult == 'l'
        ends(4, "able") || ends(4, "ible")
    elseif penult == 'n'
        ends(3, "ant") || ends(5, "ement") || ends(4, "ment") || ends(3, "ent")
    elseif penult == 'o'
        # -ion is only removed after 's' or 't'. The parentheses are needed
        # because && binds tighter than ||, and j >= 1 keeps b[j] in bounds
        # for 1-based indexing. The second alternative takes care of -ous.
        (ends(3, "ion") && j >= 1 && (b[j] == 's' || b[j] == 't')) ||
            ends(2, "ou")
    elseif penult == 's'
        ends(3, "ism")
    elseif penult == 't'
        ends(3, "ate") || ends(3, "iti")
    elseif penult == 'u'
        ends(3, "ous")
    elseif penult == 'v'
        ends(3, "ive")
    elseif penult == 'z'
        ends(3, "ize")
    else
        false
    end

    # No suffix matched: nothing to remove.
    matched || return k

    return m() > 1 ? j : k
end
# step5!() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
#
# Arguments:
#   b  - the word buffer being stemmed
#   k0 - index of the first character of the word (default 1)
#   k  - index of the last character of the word (default length(b))
#
# Returns the (possibly reduced) end index k. Nothing is deleted from b.
#
# NOTE(review): m, cvc and doubleconsonant are defined elsewhere. The m and
# doubleconsonant calls use the argument forms that step1ab!() uses; cvc is
# called as the original wrote it. Confirm all three against the helpers.
function step5!(b::ASCIIString,
                k0::Integer = 1,
                k::Integer = length(b))
    # The measure is always taken over the word as it was on entry, even if
    # the final -e is dropped below.
    j = k
    if b[k] == 'e'
        a = m(b, k0, j)
        if a > 1 || (a == 1 && !cvc(k - 1)) # TODO: Be careful with cvc
            k -= 1
        end
    end
    if b[k] == 'l' && doubleconsonant(b, k, k0) && m(b, k0, j) > 1
        k -= 1
    end
    return k
end
# In stem(b, i, j), b is the string holding the word, and the word to be
# stemmed runs from b[i] to b[j] inclusive. The step functions default to
# i = 1 and j = length(b). The stemmer runs the steps over b[i] ... b[j] and
# returns the new end-point of the word, k.
# Stemming never increases word length, so i <= k <= j.
#
# NOTE(review): only step1ab!() currently reports its new end index, so that
# is the only step whose result is threaded through here. step3() and
# step4() take no arguments, and step5!() is called with b alone.
# TODO: thread k through every step once they all return it.
function stem(b::String, i::Integer, j::Integer)
    k0 = i # Start of string
    k = j  # End of string

    # DEPARTURE
    #
    # With this check, strings of length 1 or 2 don't go through the
    # stemming process, although no mention is made of this in the
    # published algorithm. Remove the check to match the published
    # algorithm.
    if k <= k0 + 1
        return k
    end

    k = step1ab!(b, k0, k)
    step1c!(b, k0, k)
    step2!(b, k0, k)
    step3() # defined without the trailing `!`
    step4() # defined without the trailing `!`
    step5!(b)
    return k
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment