Skip to content

Instantly share code, notes, and snippets.

@ymattu
Created August 13, 2017 05:01
Show Gist options
  • Save ymattu/119e0e89dbf029d2b28028d5f66540cb to your computer and use it in GitHub Desktop.
Save ymattu/119e0e89dbf029d2b28028d5f66540cb to your computer and use it in GitHub Desktop.
# Define the {Rcpp} wrapper around MeCab.
# PKG_LIBS is set so that the compiled C++ code is linked against libmecab.
Sys.setenv("PKG_LIBS" = "-lmecab")

# callRcppMecab(str, tagger_opt)
#   str        : a single string to analyse.
#   tagger_opt : a single string of options handed to MeCab::createTagger().
# Returns a data frame with one row per node (BOS/EOS nodes are skipped) and
# two character columns: `surface` and `feature` (the raw feature string that
# MeCab attaches to the node).
# Raises an R error if the tagger cannot be created.
callRcppMecab <- Rcpp::cppFunction(
  code = '
    Rcpp::DataFrame executeMecab(std::string str, std::string tagger_opt) {
      std::vector<std::string> surface, feature;

      MeCab::Tagger *tagger = MeCab::createTagger(tagger_opt.c_str());
      if (tagger == NULL) {
        // createTagger returns NULL on failure; report it as an R error
        // instead of dereferencing a null pointer.
        const char *err = MeCab::getTaggerError();
        Rcpp::stop(std::string("Failed to create MeCab tagger: ") +
                   (err != NULL ? err : "unknown error"));
      }

      try {
        // parseToNode may return NULL; the loop condition covers that case.
        for (const MeCab::Node *node = tagger->parseToNode(str.c_str());
             node != NULL; node = node->next) {
          if (node->stat != MECAB_BOS_NODE && node->stat != MECAB_EOS_NODE) {
            // node->surface is not NUL-terminated per node, so the length
            // must be given explicitly.
            surface.push_back(std::string(node->surface, node->length));
            feature.push_back(std::string(node->feature));
          }
        }
      } catch (...) {
        // Release the tagger before propagating the error.
        delete tagger;
        throw;
      }
      delete tagger;

      return Rcpp::DataFrame::create(
        Rcpp::Named("surface") = surface,
        Rcpp::Named("feature") = feature,
        Rcpp::Named("stringsAsFactors") = false
      );
    }',
  includes = c("#include <mecab.h>")
)
# Run morphological analysis and keep only the surface forms whose
# part-of-speech matches a pattern.
#
# @param string A single value (coerced to character) to analyse.
# @param tagger_param Named list of tagger options. Only the entries named
#   "l" and "d" with a non-NULL value are used; each is passed to the tagger
#   as "-<name> <value>" (the example in this file passes a dictionary path
#   as `d`).
# @param extract_pattern Regular expression matched against the first
#   comma-separated field of each feature string (column `pos`).
# @return A single string: the matching surface forms joined by a space, or
#   "" when nothing matches.
mecab_wakati <- function (string, tagger_param = list(l = 2, d = NULL), extract_pattern) {
  if (length(string) != 1L) {
    stop("`string` must be a single value.", call. = FALSE)
  }

  # Build the option string. Start from an empty vector so that the result is
  # always a length-one string, even when no usable option is supplied.
  opt_parts <- character(0)
  for (tg in names(tagger_param)) {
    value <- tagger_param[[tg]]
    if (!is.null(value) && tg %in% c("l", "d")) {
      opt_parts <- c(opt_parts, paste0("-", tg), as.character(value))
    }
  }
  tagger_opt_str <- paste(opt_parts, collapse = " ")

  morphemes <- callRcppMecab(
    str = as.character(string),
    tagger_opt = tagger_opt_str
  )

  # Split the feature string into its comma-separated fields; only `pos` is
  # used below, the remaining names mirror the original column layout.
  parsed <- tidyr::separate(
    morphemes,
    col = feature,
    into = c("pos", "pos1", "pos2", "pos3", "ctype", "cform", "baseform", "orth", "pron"),
    sep = ",", fill = "right"
  )

  keep <- stringr::str_detect(string = parsed$pos, pattern = extract_pattern)
  if (!any(keep)) {
    return("")
  }

  # as.character() guards against factor columns.
  stringr::str_c(as.character(parsed$surface[keep]), collapse = " ")
}
# Example: extract nouns and adjectives from a sentence using the
# mecab-ipadic-neologd dictionary at the given path.
mecab_wakati(
  string = "私は大学へ行く",
  tagger_param = list(d = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"),
  extract_pattern = "名詞|形容詞"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment