Created
September 15, 2008 20:12
-
-
Save aurelian/10929 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| - | |
| - - o | |
| - a | |
| - m | |
| - l | |
| - i | |
| - v | |
| - s | |
| - e | |
| - c | |
| - - al | |
| - ai | |
| - pe | |
| - la | |
| - "\xC3\xAEn" | |
| - cu | |
| - de | |
| - ca | |
| - eu | |
| - tu | |
| - el | |
| - ea | |
| - ei | |
| - mi | |
| - "m\xC4\x83" | |
| - "\xC8\x9Bi" | |
| - te | |
| - "\xC3\xAEl" | |
| - "\xC3\xAEi" | |
| - ne | |
| - ni | |
| - "v\xC4\x83" | |
| - vi | |
| - le | |
| - li | |
| - se | |
| - ta | |
| - sa | |
| - ce | |
| - fi | |
| - am | |
| - au | |
| - "a\xC8\x99" | |
| - ar | |
| - oi | |
| - om | |
| - or | |
| - va | |
| - "\xC8\x99i" | |
| - ci | |
| - "c\xC4\x83" | |
| - nu | |
| - ul | |
| - ia | |
| - "\xC3\x8En" | |
| - "\xC8\x98i" | |
| - in | |
| - "s\xC4\x83" | |
| - da | |
| - si | |
| - un | |
| - "d\xC4\x83" | |
| - ii | |
| - an | |
| - il | |
| - ba | |
| - - cel | |
| - cea | |
| - cei | |
| - ale | |
| - sub | |
| - din | |
| - mai | |
| - cum | |
| - noi | |
| - voi | |
| - ele | |
| - mie | |
| - "\xC3\xAEmi" | |
| - "\xC8\x9Bie" | |
| - "\xC3\xAE\xC8\x9Bi" | |
| - lui | |
| - lor | |
| - "\xC3\xAE\xC8\x99i" | |
| - sie | |
| - meu | |
| - mea | |
| - mei | |
| - "t\xC4\x83u" | |
| - "t\xC4\x83i" | |
| - "s\xC4\x83u" | |
| - "s\xC4\x83i" | |
| - "\xC4\x83la" | |
| - "\xC4\x83ia" | |
| - aia | |
| - cui | |
| - una | |
| - alt | |
| - "c\xC3\xA2t" | |
| - tot | |
| - unu | |
| - doi | |
| - opt | |
| - era | |
| - fiu | |
| - fii | |
| - fie | |
| - fim | |
| - "a\xC8\x9Bi" | |
| - are | |
| - "o\xC8\x9Bi" | |
| - vei | |
| - vom | |
| - vor | |
| - dar | |
| - iar | |
| - sau | |
| - ori | |
| - "a\xC8\x99a" | |
| - pot | |
| - dat | |
| - pus | |
| - ani | |
| - mii | |
| - lei | |
| - i-a | |
| - l-a | |
| - isi | |
| - "i\xC8\x99i" | |
| - s-a | |
| - loc | |
| - asa | |
| - azi | |
| - cam | |
| - "c\xC3\xAEt" | |
| - fel | |
| - mod | |
| - - unui | |
| - unei | |
| - unor | |
| - cele | |
| - "f\xC4\x83r\xC4\x83" | |
| - spre | |
| - prin | |
| - "p\xC3\xA2n\xC4\x83" | |
| - "dup\xC4\x83" | |
| - mult | |
| - mine | |
| - tine | |
| - "nou\xC4\x83" | |
| - "vou\xC4\x83" | |
| - sine | |
| - mele | |
| - tale | |
| - sale | |
| - "\xC4\x83sta" | |
| - asta | |
| - alea | |
| - acel | |
| - acei | |
| - acea | |
| - cine | |
| - care | |
| - unde | |
| - "c\xC3\xA2nd" | |
| - unul | |
| - unii | |
| - alta | |
| - "alt\xC4\x83" | |
| - "al\xC8\x9Bi" | |
| - alte | |
| - vreo | |
| - "c\xC3\xA2t\xC4\x83" | |
| - "c\xC3\xA2\xC8\x9Bi" | |
| - "c\xC3\xA2te" | |
| - "at\xC3\xA2t" | |
| - "to\xC8\x9Bi" | |
| - ceva | |
| - doua | |
| - trei | |
| - "\xC8\x99ase" | |
| - noua | |
| - zece | |
| - sunt | |
| - "e\xC8\x99ti" | |
| - este | |
| - eram | |
| - erai | |
| - erau | |
| - "fi\xC8\x9Bi" | |
| - fost | |
| - avem | |
| - avea | |
| - "ve\xC8\x9Bi" | |
| - nici | |
| - "\xC3\xAEns\xC4\x83" | |
| - deci | |
| - "de\xC8\x99i" | |
| - "dac\xC4\x83" | |
| - ieri | |
| - mare | |
| - doar | |
| - spus | |
| - acum | |
| - face | |
| - avut | |
| - bine | |
| - "fa\xC8\x9B\xC4\x83" | |
| - "\xC3\xAEnc\xC4\x83" | |
| - nu-l | |
| - anii | |
| - zeci | |
| - de-a | |
| - fara | |
| - "f\xC4\x83ra" | |
| - "far\xC4\x83" | |
| - le-a | |
| - l-au | |
| - abia | |
| - pana | |
| - "p\xC3\xA2na" | |
| - "pan\xC4\x83" | |
| - i-au | |
| - s-au | |
| - si-a | |
| - "\xC8\x99i-a" | |
| - luat | |
| - "dou\xC4\x83" | |
| - pare | |
| - desi | |
| - sint | |
| - inca | |
| - "inc\xC4\x83" | |
| - cand | |
| - sa-l | |
| - "s\xC4\x83-l" | |
| - aici | |
| - atat | |
| - deja | |
| - dupa | |
| - mica | |
| - "mic\xC4\x83" | |
| - "dat\xC4\x83" | |
| - data | |
| - apoi | |
| - "at\xC3\xAEt" | |
| - ceea | |
| - "c\xC3\xAEnd" | |
| - "c\xC3\xAEte" | |
| - "c\xC3\xAE\xC8\x9Bi" | |
| - daca | |
| - "\xC3\xAEnca" | |
| - "\xC3\xAEntr" | |
| - l-am | |
| - "p\xC3\xAEn\xC4\x83" | |
| - plus | |
| - prea | |
| - s-ar | |
| - "s\xC4\x83-i" | |
| - "s\xC3\xAEnt" | |
| - - "ni\xC8\x99te" | |
| - celui | |
| - celei | |
| - celor | |
| - "c\xC4\x83tre" | |
| - "l\xC3\xA2ng\xC4\x83" | |
| - peste | |
| - "dec\xC3\xA2t" | |
| - "mult\xC4\x83" | |
| - "mul\xC8\x9Bi" | |
| - multe | |
| - "pu\xC8\x9Bin" | |
| - "sie\xC8\x99i" | |
| - "\xC4\x83\xC8\x99tia" | |
| - astea | |
| - acest | |
| - acela | |
| - "\xC4\x83luia" | |
| - aceia | |
| - "\xC4\x83lora" | |
| - aceea | |
| - "\xC4\x83leia" | |
| - acele | |
| - "c\xC4\x83rui" | |
| - "c\xC4\x83rei" | |
| - "c\xC4\x83ror" | |
| - cuiva | |
| - orice | |
| - unele | |
| - unuia | |
| - uneia | |
| - unora | |
| - altul | |
| - "al\xC8\x9Bii" | |
| - altui | |
| - altei | |
| - altor | |
| - vreun | |
| - "c\xC3\xA2tor" | |
| - "at\xC3\xA2ta" | |
| - "at\xC3\xA2\xC8\x9Bi" | |
| - "c\xC3\xA2tva" | |
| - "toat\xC4\x83" | |
| - toate | |
| - totul | |
| - nimic | |
| - patru | |
| - cinci | |
| - "\xC8\x99apte" | |
| - doime | |
| - ambii | |
| - prima | |
| - "era\xC8\x9Bi" | |
| - fiind | |
| - "ave\xC8\x9Bi" | |
| - aveam | |
| - aveai | |
| - aveau | |
| - "\xC3\xAEnc\xC3\xA2t" | |
| - poate | |
| - putea | |
| - chiar | |
| - "f\xC4\x83cut" | |
| - parte | |
| - spune | |
| - numai | |
| - le-au | |
| - "\xC8\x98i-au" | |
| - "\xC8\x99i-au" | |
| - "s\xC4\x83-\xC8\x99i" | |
| - sa-si | |
| - "s\xC4\x83-si" | |
| - "sa-\xC8\x99i" | |
| - "c\xC4\x83-\xC8\x99i" | |
| - "ca-\xC8\x99i" | |
| - ca-si | |
| - "c\xC4\x83-si" | |
| - langa | |
| - "l\xC3\xA2nga" | |
| - catre | |
| - facem | |
| - facut | |
| - multi | |
| - putin | |
| - acolo | |
| - altii | |
| - "adic\xC4\x83" | |
| - anume | |
| - atare | |
| - "at\xC3\xAEti" | |
| - cumva | |
| - "dec\xC3\xAEt" | |
| - dintr | |
| - "\xC3\xAEnc\xC3\xAEt" | |
| - "\xC3\xAEntre" | |
| - mereu | |
| - "poat\xC4\x83" | |
| - "s\xC4\x83-mi" | |
| - "s\xC4\x83-\xC8\x9Bi" | |
| - - despre | |
| - pentru | |
| - dintre | |
| - "\xC3\xAEnspre" | |
| - foarte | |
| - "pu\xC8\x9Bin\xC4\x83" | |
| - "pu\xC8\x9Bini" | |
| - "pu\xC8\x9Bine" | |
| - destul | |
| - destui | |
| - "\xC3\xAEnsumi" | |
| - "\xC3\xAEns\xC4\x83mi" | |
| - "\xC3\xAEnsu\xC8\x9Bi" | |
| - "\xC3\xAEns\xC4\x83\xC8\x9Bi" | |
| - "\xC3\xAEnsu\xC8\x99i" | |
| - "\xC3\xAEns\xC4\x83\xC8\x99i" | |
| - "\xC3\xAEn\xC8\x99ine" | |
| - "\xC3\xAEnsene" | |
| - "\xC3\xAEn\xC8\x99iv\xC4\x83" | |
| - "\xC3\xAEnsev\xC4\x83" | |
| - "\xC3\xAEn\xC8\x99i\xC8\x99i" | |
| - "\xC3\xAEnse\xC8\x99i" | |
| - "\xC3\xAEnsele" | |
| - nostru | |
| - "no\xC8\x99tri" | |
| - vostru | |
| - "vo\xC8\x99tri" | |
| - acesta | |
| - "\xC4\x83stuia" | |
| - "\xC4\x83stora" | |
| - "\xC4\x83steia" | |
| - "ace\xC8\x99ti" | |
| - aceste | |
| - acelui | |
| - acelea | |
| - acelor | |
| - acelei | |
| - cineva | |
| - oricui | |
| - altele | |
| - altuia | |
| - alteia | |
| - altora | |
| - vreuna | |
| - "c\xC3\xA2tora" | |
| - "at\xC3\xA2\xC8\x9Bia" | |
| - "at\xC3\xA2tea" | |
| - "at\xC3\xA2tor" | |
| - "oric\xC3\xA2t" | |
| - "c\xC3\xA2\xC8\x9Biva" | |
| - "c\xC3\xA2teva" | |
| - cutare | |
| - nimeni | |
| - treime | |
| - sutime | |
| - ambele | |
| - "\xC3\xAEndoit" | |
| - "\xC3\xAEnt\xC3\xA2ia" | |
| - primul | |
| - primii | |
| - primei | |
| - suntem | |
| - "avea\xC8\x9Bi" | |
| - "a\xC8\x99adar" | |
| - "totu\xC8\x99i" | |
| - atunci | |
| - astfel | |
| - "exist\xC4\x83" | |
| - asupra | |
| - doilea | |
| - ultima | |
| - intr-o | |
| - "\xC3\xAEntr-o" | |
| - niciun | |
| - nicuna | |
| - "facu\xC8\x9Bi" | |
| - facuti | |
| - cativa | |
| - "c\xC3\xA2tiva" | |
| - "ca\xC8\x9Biva" | |
| - putina | |
| - "pu\xC8\x9Bina" | |
| - "putin\xC4\x83" | |
| - altfel | |
| - "ast\xC4\x83zi" | |
| - "at\xC3\xAE\xC8\x9Bia" | |
| - "c\xC4\x83reia" | |
| - "c\xC4\x83rora" | |
| - "c\xC4\x83ruia" | |
| - "c\xC3\xAEteva" | |
| - "c\xC3\xAE\xC8\x9Biva" | |
| - "\xC3\xAEnapoi" | |
| - oarece | |
| - oricum | |
| - "s\xC3\xAEntem" | |
| - tocmai | |
| - uneori | |
| - - printre | |
| - "destul\xC4\x83" | |
| - destule | |
| - "noastr\xC4\x83" | |
| - noastre | |
| - "voastr\xC4\x83" | |
| - voastre | |
| - "ace\xC8\x99tia" | |
| - aceasta | |
| - acestea | |
| - acestui | |
| - acestor | |
| - "aceast\xC4\x83" | |
| - acestei | |
| - acelora | |
| - aceleia | |
| - "acela\xC8\x99i" | |
| - "aceia\xC8\x99i" | |
| - "aceea\xC8\x99i" | |
| - oricine | |
| - vreunul | |
| - vreunii | |
| - vreunui | |
| - vreunei | |
| - vreunor | |
| - oricare | |
| - fiecare | |
| - "at\xC3\xA2tora" | |
| - "oric\xC3\xA2t\xC4\x83" | |
| - "oric\xC3\xA2\xC8\x9Bi" | |
| - "oric\xC3\xA2te" | |
| - "c\xC3\xA2torva" | |
| - tuturor | |
| - altceva | |
| - "nim\xC4\x83nui" | |
| - "am\xC3\xA2ndoi" | |
| - ambilor | |
| - ambelor | |
| - "\xC3\xAEntreit" | |
| - "\xC3\xAEnsutit" | |
| - "\xC3\xAEnt\xC3\xA2iul" | |
| - primele | |
| - "sunte\xC8\x9Bi" | |
| - trebuie | |
| - aproape | |
| - miliard | |
| - ultimul | |
| - "\xC3\xAEntr-un" | |
| - intr-un | |
| - sunteti | |
| - "al\xC4\x83turi" | |
| - "\xC3\xAEnainte" | |
| - oarecui | |
| - "s\xC3\xAEnte\xC8\x9Bi" | |
| - - acestuia | |
| - acestora | |
| - acesteia | |
| - "acelea\xC8\x99i" | |
| - "cel\xC4\x83lalt" | |
| - "ceilal\xC8\x9Bi" | |
| - "cealalt\xC4\x83" | |
| - altcuiva | |
| - vreunele | |
| - vreunuia | |
| - vreuneia | |
| - vreunora | |
| - "oric\xC4\x83rui" | |
| - "oric\xC4\x83rei" | |
| - "oric\xC4\x83ror" | |
| - "fiec\xC4\x83rui" | |
| - "fiec\xC4\x83rei" | |
| - "oric\xC3\xA2tor" | |
| - oarecare | |
| - "am\xC3\xA2ndou\xC4\x83" | |
| - primului | |
| - primilor | |
| - primelor | |
| - niciulul | |
| - milioane | |
| - asemenea | |
| - deasupra | |
| - oarecine | |
| - printr-o | |
| - - "aceluia\xC8\x99i" | |
| - "acelora\xC8\x99i" | |
| - "aceleia\xC8\x99i" | |
| - celuilalt | |
| - celelalte | |
| - altcineva | |
| - "oric\xC4\x83ruia" | |
| - "oric\xC4\x83reia" | |
| - "oric\xC4\x83rora" | |
| - "fiec\xC4\x83ruia" | |
| - "fiec\xC4\x83reia" | |
| - "oric\xC3\xA2tora" | |
| - "am\xC3\xA2nduror" | |
| - "\xC3\xAEmpotriva" | |
| - niciodata | |
| - - "celorlal\xC8\x9Bi" | |
| - celeilalte | |
| - celorlalte | |
| - "am\xC3\xA2ndurora" | |
| - | |
| - | |
| - - "dumneavoastr\xC4\x83" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'yaml'

# Removes stop words from +text+.
#
# The text is lower-cased and split into words. A word is a run of Unicode
# word characters (letters, marks, digits, underscore); "-" and "'" are
# allowed inside a word, so forms such as "s-a" or "l-am" stay in one piece.
# Every word found in the stop-word list is dropped and the remaining words
# are joined with single spaces.
#
# The stop-word file must contain a YAML sequence indexed by word length in
# characters: entry N is the list of N-character stop words, or nil when no
# stop word has that length.
#
# NOTE(review): the bundled list spells non-ASCII words with "\xNN" escapes
# that look like raw UTF-8 bytes. A spec-compliant YAML parser reads each
# "\xNN" as its own code point, so those entries may never match real text.
# Verify, and re-encode the list as plain UTF-8 if that is the case.
#
# @param text [String] the text to clean
# @param stop_words_file [String] path of the YAML stop-word list; defaults to
#   the file name this method has always used
# @return [String] the lower-cased remaining words, separated by one space
def remove_stop_words(text, stop_words_file = 'ro-stop_words5.yml')
  # An empty YAML document loads as nil/false; treat it as "no stop words".
  stop_words = YAML.load_file(stop_words_file) || []

  # \w only matches ASCII on Ruby 1.9+, which made words with diacritics
  # vanish from the result. \p{Word} matches Unicode word characters. The
  # pattern requires a word character at both ends, so leading or trailing
  # "-" and "'" are never part of a word.
  words = text.downcase.scan(/\p{Word}(?:[\p{Word}\-']*\p{Word})?/u)

  kept = words.reject do |word|
    same_length = stop_words[word.length]
    same_length && same_length.include?(word)
  end
  kept.join(' ')
end
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Named HTML character entities mapped to their Unicode code points:
# the Latin-1 range (U+00A0..U+00FF) followed by the markup-significant
# characters.
$htmlenc2unicode = {
  "&nbsp;"   => 160, # U+00A0
  "&iexcl;"  => 161, # U+00A1
  "&cent;"   => 162, # U+00A2
  "&pound;"  => 163, # U+00A3
  "&curren;" => 164, # U+00A4
  "&yen;"    => 165, # U+00A5
  "&brvbar;" => 166, # U+00A6
  "&sect;"   => 167, # U+00A7
  "&uml;"    => 168, # U+00A8
  "&copy;"   => 169, # U+00A9
  "&ordf;"   => 170, # U+00AA
  "&laquo;"  => 171, # U+00AB
  "&not;"    => 172, # U+00AC
  "&shy;"    => 173, # U+00AD
  "&reg;"    => 174, # U+00AE
  "&macr;"   => 175, # U+00AF
  "&deg;"    => 176, # U+00B0
  "&plusmn;" => 177, # U+00B1
  "&sup2;"   => 178, # U+00B2
  "&sup3;"   => 179, # U+00B3
  "&acute;"  => 180, # U+00B4
  "&micro;"  => 181, # U+00B5
  "&para;"   => 182, # U+00B6
  "&middot;" => 183, # U+00B7
  "&cedil;"  => 184, # U+00B8
  "&sup1;"   => 185, # U+00B9
  "&ordm;"   => 186, # U+00BA
  "&raquo;"  => 187, # U+00BB
  "&frac14;" => 188, # U+00BC
  "&frac12;" => 189, # U+00BD
  "&frac34;" => 190, # U+00BE
  "&iquest;" => 191, # U+00BF
  "&Agrave;" => 192, # U+00C0
  "&Aacute;" => 193, # U+00C1
  "&Acirc;"  => 194, # U+00C2
  "&Atilde;" => 195, # U+00C3
  "&Auml;"   => 196, # U+00C4
  "&Aring;"  => 197, # U+00C5
  "&AElig;"  => 198, # U+00C6
  "&Ccedil;" => 199, # U+00C7
  "&Egrave;" => 200, # U+00C8
  "&Eacute;" => 201, # U+00C9
  "&Ecirc;"  => 202, # U+00CA
  "&Euml;"   => 203, # U+00CB
  "&Igrave;" => 204, # U+00CC
  "&Iacute;" => 205, # U+00CD
  "&Icirc;"  => 206, # U+00CE
  "&Iuml;"   => 207, # U+00CF
  "&ETH;"    => 208, # U+00D0
  "&Ntilde;" => 209, # U+00D1
  "&Ograve;" => 210, # U+00D2
  "&Oacute;" => 211, # U+00D3
  "&Ocirc;"  => 212, # U+00D4
  "&Otilde;" => 213, # U+00D5
  "&Ouml;"   => 214, # U+00D6
  "&times;"  => 215, # U+00D7
  "&Oslash;" => 216, # U+00D8
  "&Ugrave;" => 217, # U+00D9
  "&Uacute;" => 218, # U+00DA
  "&Ucirc;"  => 219, # U+00DB
  "&Uuml;"   => 220, # U+00DC
  "&Yacute;" => 221, # U+00DD
  "&THORN;"  => 222, # U+00DE
  "&szlig;"  => 223, # U+00DF
  "&agrave;" => 224, # U+00E0
  "&aacute;" => 225, # U+00E1
  "&acirc;"  => 226, # U+00E2
  "&atilde;" => 227, # U+00E3
  "&auml;"   => 228, # U+00E4
  "&aring;"  => 229, # U+00E5
  "&aelig;"  => 230, # U+00E6
  "&ccedil;" => 231, # U+00E7
  "&egrave;" => 232, # U+00E8
  "&eacute;" => 233, # U+00E9
  "&ecirc;"  => 234, # U+00EA
  "&euml;"   => 235, # U+00EB
  "&igrave;" => 236, # U+00EC
  "&iacute;" => 237, # U+00ED
  "&icirc;"  => 238, # U+00EE
  "&iuml;"   => 239, # U+00EF
  "&eth;"    => 240, # U+00F0
  "&ntilde;" => 241, # U+00F1
  "&ograve;" => 242, # U+00F2
  "&oacute;" => 243, # U+00F3
  "&ocirc;"  => 244, # U+00F4
  "&otilde;" => 245, # U+00F5
  "&ouml;"   => 246, # U+00F6
  "&divide;" => 247, # U+00F7
  "&oslash;" => 248, # U+00F8
  "&ugrave;" => 249, # U+00F9
  "&uacute;" => 250, # U+00FA
  "&ucirc;"  => 251, # U+00FB
  "&uuml;"   => 252, # U+00FC
  "&yacute;" => 253, # U+00FD
  "&thorn;"  => 254, # U+00FE
  "&yuml;"   => 255, # U+00FF
  "&quot;"   => 34,  # U+0022
  "&amp;"    => 38,  # U+0026
  "&lt;"     => 60,  # U+003C
  "&gt;"     => 62,  # U+003E
  "&apos;"   => 39   # U+0027
}

# Code-point substitutions applied after entity decoding
# (Windows-specific fixes may be added later).
$post_process = {
  160 => 32,  # U+00A0 (no-break space)   => U+0020 (plain space)
  350 => 536, # U+015E (S with cedilla)   => U+0218 (S with comma below)
  351 => 537, # U+015F (s with cedilla)   => U+0219 (s with comma below)
  354 => 538, # U+0162 (T with cedilla)   => U+021A (T with comma below)
  355 => 539  # U+0163 (t with cedilla)   => U+021B (t with comma below)
}

# Decodes the named HTML entities listed in $htmlenc2unicode into UTF-8
# characters, then applies the $post_process code-point substitutions.
#
# The string is modified in place and also returned.
#
# @param str [String] text that may contain named HTML entities; mutated
# @return [String] the same String object, decoded
def translate2utf8(str)
  # Decode every entity in one pass. Replacing the keys one after another
  # would decode twice: "&amp;lt;" would become "&lt;" and then "<".
  entity_pattern = Regexp.union($htmlenc2unicode.keys)
  str.gsub!(entity_pattern) { |entity| [$htmlenc2unicode[entity]].pack("U") }

  # No target code point is also a source, so the order here is irrelevant.
  $post_process.each do |from, to|
    str.gsub!([from].pack("U"), [to].pack("U"))
  end
  str
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment