Created
February 12, 2020 21:54
-
-
Save jmcastagnetto/3b0776f7558621e5d06a2a0981b20c2e to your computer and use it in GitHub Desktop.
Stemming with SnowballC vs hunspell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ref: https://github.com/juliasilge/tidytext/issues/17
| library(dplyr) | |
| library(hunspell) | |
| library(SnowballC) | |
# Input data: a one-column tibble of inflected Spanish word forms that all
# share the root "celebr-", used to compare the two stemmers.
w <- tibble(palabras = c(
  "celebra", "celebré", "celebraré", "celebramos",
  "celebrado", "celebraremos", "celebró"
))
# Stem the Spanish words with SnowballC: wordStem() is applied to the whole
# `palabras` column and the result is stored in a new `stem` column.
df1 <- mutate(w, stem = wordStem(palabras, language = "spanish"))
df1
# Recorded output:
# A tibble: 7 x 2
#   palabras     stem
#   <chr>        <chr>
# 1 celebra      celebr
# 2 celebré      celebr
# 3 celebraré    celebr
# 4 celebramos   celebr
# 5 celebrado    celebr
# 6 celebraremos celebr
# 7 celebró      celebr
# The same words stemmed with hunspell, using the "es_PE" dictionary (it has
# to be available to hunspell on this machine).
# hunspell_stem() produces a list-column that may hold more than one stem per
# word (see "celebrado" below), so it is expanded to one row per
# (word, stem) pair.
# unnest() comes from tidyr, which this script never attaches; the call is
# namespace-qualified so it works in a clean session instead of failing with
# "could not find function".
df2 <- w %>%
  mutate(
    stem = hunspell_stem(palabras, dict = dictionary("es_PE"))
  ) %>%
  tidyr::unnest(stem)
df2
# Recorded output:
# A tibble: 8 x 2
#   palabras     stem
#   <chr>        <chr>
# 1 celebra      celebrar
# 2 celebré      celebrar
# 3 celebraré    celebrar
# 4 celebramos   celebrar
# 5 celebrado    celebrado
# 6 celebrado    celebrar
# 7 celebraremos celebrar
# 8 celebró      celebrar
# hunspell's result arguably makes more sense: it returns real dictionary
# words ("celebrar", "celebrado") rather than the truncated root "celebr".
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment