Last active
May 26, 2020 18:10
-
-
Save h8nor/21b8cc8da032701cd664526a6f9c02de to your computer and use it in GitHub Desktop.
Porter Stemmer RUS in VISUAL BASIC 6
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
' It follow the algorithm "stem_Unicode.sbl" definition presented in: | |
' Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3, | |
' pp 130-137, (http://snowball.tartarus.org/algorithms/russian/stemmer.html) | |
' TO USE THE PROGRAM CALL THE FUNCTION PORTERSTEMMERRU. | |
' THE WORD TO BE STEMMED SHOULD BE PASSED AS THE ARGUEMENT. | |
' THE STRING RETURNED BY THE FUNCTION IS THE STEMMED WORD. | |
Option Explicit | |
Option Base 0 | |
'12345678901234567890123456789012345bopoh13@ya67890123456789012345678901234567890 | |
Public Function PorterStemmerRU(ByVal word As String) As String ' Test failed | |
' Переписано с http://snowball.tartarus.org/algorithms/russian/stemmer.html | |
'2 Причастие совершенного вида | |
Const PERFECTiveGERUNDs As String = "[иы]в [иы]вши [иы]вшись" | |
'1 Возвратное | |
Const REFLEXive As String = "сь ся" | |
'2 Причастие | |
Const PARTICIPLEs As String = "[иы]вш ующ" | |
'1 Прилагательное | |
Const ADJECTive As String = "[ео]го [иы]ми [ео]му " _ | |
& "[еиоы]е [еиоы]й [еиоы]м [еоую]ю [иы]х [ая]я" | |
'2 Глагол | |
Const VERBs As String = "ен ена ено ены ишь ую ует уют ите [еу]йте " _ | |
& "[иы]ла [иы]ли [иы]ло [иы]ть [еу]й [иы]л [иы]м [иыя]т ю" | |
'1 Имя существительное | |
Const NOUN As String = "ией ием иям иях иями [ая]ми [ео]в [иь]е [еи]и " _ | |
& "[еио]й [аеоя]м [ая]х [иь]ю [иь]я а е и й о у ь ы ю я" | |
'1 Прилагательное привосходной степени | |
Const SUPERLATive As String = "ейш ейше" | |
'1 Словообразующее окончание в r2 | |
Const DERIVATIONAL As String = "ост ость" | |
Dim rV As Byte, r2 As Byte ' r1 As Byte | |
If Len(word) > 0 Then word = Replace(LCase(word), "*", "") Else Exit Function | |
' rV - начало области слова после первой гласной (Если гласных нет = 0) | |
' r1 - начало области слова "Гласная-Согласная" с начала слова | |
r2 = FindRegions(rV, word) ' r2 - начало области "Гласная-Согласная" после r1 | |
' [Шаг 1] Если существует окончание PERFECTIVE GERUND – удалить и завершить | |
If Not RemoveEndings(word, Array(Replace(PERFECTiveGERUNDs, "[иы]", ""), _ | |
PERFECTiveGERUNDs), rV) Then | |
' Если существует окончание REFLEXIVE – удалить | |
RemoveEndings word, REFLEXive, rV | |
' Удалить одно из окончаний и завершить: PARTICIPLE + ADJECTIVE, VERB, NOUN | |
If RemoveEndings(word, ADJECTive, rV) Then | |
RemoveEndings word, Array("ем нн вш ющ щ", PARTICIPLEs), rV | |
Else | |
If Not RemoveEndings(word, Array("ешь нно ем ли ны ть " _ | |
& "ете йте ла на ло но ет ют й л н", VERBs), rV) Then _ | |
RemoveEndings word, NOUN, rV ' не УЮТ и не МЛЕЮТ, но БЕСЕДуют | |
End If | |
End If | |
' [Шаг 2] Если слово окончивается на "и" - удалить | |
RemoveEndings word, "и", rV | |
' [ШАГ 3] Если существует окончание DERIVATIONAL в r2 - удалить | |
RemoveEndings word, DERIVATIONAL, r2 | |
' [ШАГ 4] Удалить одно из окончаний слова: (Н)Н + SUPERLATIVE, (Н)Н, Ь | |
RemoveEndings word, SUPERLATive, rV | |
If RemoveEndings(word, "нн", rV) Then word = word & "н" | |
RemoveEndings word, "ь", rV | |
PorterStemmerRU = word | |
End Function | |
Private Function RemoveEndings(ByRef word As String, ByVal regex As Variant, _ | |
ByVal region As Byte) As Boolean ' Удалить окончание (самое длинное) | |
Dim rAff As Byte, prefix As String, regMatch As Variant | |
prefix = Mid(word, 1, IIf(region, region, 1) - 1) ' prefix <- region | |
word = Mid(word, Len(prefix) + 1) | |
If IsArray(regex) Then | |
For Each regMatch In Split(regex(0)) | |
If word Like "*[ая]" & regMatch Then ' Если найден аффикс | |
word = Left(word, Len(word) - Len(regMatch)) | |
RemoveEndings = True: Exit For | |
End If | |
Next regMatch: regex = regex(1) | |
End If | |
If Not RemoveEndings Then | |
For Each regMatch In Split(regex) | |
rAff = InStr(regMatch, "]") + 1 ' rAff - начало области после [list] | |
On Error Resume Next | |
For region = 2 To rAff - 2 | |
If rAff < 2 Then region = 1: rAff = 2 ' Если нет [list] | |
If word Like "*" & Mid(regMatch, region, 1) & Mid(regMatch, rAff) Then | |
regMatch = Mid(regMatch, region, 1) & Mid(regMatch, rAff) | |
word = Left(word, Len(word) - Len(regMatch)) | |
RemoveEndings = True: Exit For | |
End If: If region = 1 Then Exit For | |
Next region: If RemoveEndings Then Exit For | |
On Error GoTo 0 | |
Next regMatch | |
End If: word = prefix & word | |
End Function | |
Private Function FindRegions(ByRef rV As Byte, ByVal word As String) As Byte | |
Dim prevChar As String, Char As String, state As Byte, i As Byte | |
If isVowel(Left(word, 1)) Then rV = 2: state = 1 ' После первой гласной | |
For i = 2 To Len(word) | |
prevChar = Mid(word, i - 1, 1): Char = Mid(word, i, 1) | |
Select Case state | |
Case 0: If isVowel(Char) Then rV = i + 1: state = 1 | |
Case 1: If Not isVowel(Char) And isVowel(prevChar) Then state = 2 | |
Case 2: If Not isVowel(Char) And isVowel(prevChar) Then _ | |
FindRegions = i + 1: Exit For | |
End Select | |
Next i | |
End Function | |
Private Function isVowel(ByVal Char As String) As Boolean | |
Const VOWEL As String = "[аеёиоуыэюя]" | |
isVowel = InStr(Mid(VOWEL, 2, Len(VOWEL) - 1), Char) | |
End Function |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment