# Source: GitHub gist ymattu/119e0e89dbf029d2b28028d5f66540cb (created August 13, 2017 05:01).
# Note: GitHub flagged this file as containing hidden or bidirectional Unicode
# text; open it in an editor that reveals hidden Unicode characters to review it.
# {Rcpp} wrapper definition for MeCab.
#
# Compiles the inline C++ function `executeMecab()` and binds it to
# `callRcppMecab`.
#
# @param str        Text to analyse (a single string).
# @param tagger_opt Option string handed to `MeCab::createTagger()`.
# @return A data frame with one row per node (BOS/EOS nodes excluded) and two
#   columns: `surface` (the node's surface string) and `feature` (MeCab's
#   feature string for the node).

# Link against libmecab when Rcpp builds the inline code below.
Sys.setenv("PKG_LIBS" = "-lmecab")
callRcppMecab <- Rcpp::cppFunction(
  code = '
    Rcpp::DataFrame executeMecab(std::string str, std::string tagger_opt) {
      MeCab::Tagger *tagger = MeCab::createTagger(tagger_opt.c_str());
      // createTagger() yields NULL when the options are invalid (for example
      // an unreadable dictionary path); raise an R error instead of
      // dereferencing the null pointer.
      if (tagger == NULL) {
        const char *err = MeCab::getTaggerError();
        Rcpp::stop(std::string("MeCab tagger could not be created: ") +
                   (err != NULL ? err : "unknown error"));
      }

      std::vector<std::string> surface, feature;
      for (const MeCab::Node *node = tagger->parseToNode(str.c_str());
           node != NULL; node = node->next) {
        // Skip the beginning-of-sentence / end-of-sentence marker nodes.
        if (node->stat != MECAB_BOS_NODE && node->stat != MECAB_EOS_NODE) {
          // node->surface is not NUL-terminated per node, so copy exactly
          // node->length bytes.
          surface.push_back(std::string(node->surface, node->length));
          feature.push_back(std::string(node->feature));
        }
      }
      delete tagger;

      return Rcpp::DataFrame::create(
        Rcpp::Named("surface") = surface,
        Rcpp::Named("feature") = feature
      );
    }',
  includes = c("#include <mecab.h>")
)
# From a morphological analysis result, extract only the surface forms whose
# part of speech matches the given pattern, joined by single spaces.
#
# @param string Input sentence to analyse; coerced with as.character() and
#   required to be a single value.
# @param tagger_param Named list of MeCab tagger options. Only the entries
#   named "l" and "d" are used; NULL / zero-length entries are skipped.
# @param extract_pattern Pattern (regular expression) matched against the
#   first comma-separated field of each node's feature string.
# @return A single string of the matching surface forms separated by " ",
#   or "" when no node matches.
mecab_wakati <- function (string, tagger_param = list(l = 2, d = NULL), extract_pattern) {
  string <- as.character(x = string)
  if (length(x = string) != 1L) {
    stop("`string` must be a single value.", call. = FALSE)
  }

  # Keep only the supported options that actually carry a value.
  supported <- c("l", "d")
  opts <- tagger_param[names(x = tagger_param) %in% supported]
  opts <- Filter(f = function(value) length(x = value) > 0L, x = opts)

  if (length(x = opts) > 0L) {
    values <- vapply(
      X = opts,
      FUN = function(value) as.character(x = value)[[1L]],
      FUN.VALUE = character(1L)
    )
    # Builds e.g. "-l 2 -d /path/to/dic".
    tagger_opt_str <- paste0("-", names(x = opts), " ", values, collapse = " ")
  } else {
    # Always hand the C++ side a length-one string, even with no options.
    tagger_opt_str <- ""
  }

  parsed <- callRcppMecab(str = string, tagger_opt = tagger_opt_str)

  # as.character() guards against the columns arriving as factors.
  surface <- as.character(x = parsed[["surface"]])
  feature <- as.character(x = parsed[["feature"]])

  # Only the first feature field is needed for filtering, so take it directly
  # instead of splitting the whole feature string into columns.
  pos <- stringr::str_extract(string = feature, pattern = "^[^,]*")
  matched <- surface[stringr::str_detect(string = pos, pattern = extract_pattern)]

  if (length(x = matched) < 1L) {
    return("")
  }
  stringr::str_c(matched, collapse = " ")
}
# Example: keep only nouns and adjectives from a sample sentence.
# NOTE(review): the dictionary path below is hard-coded; confirm it matches
# the local mecab-ipadic-neologd installation.
mecab_wakati(string = "私は大学へ行く",
             tagger_param = list(d = "/usr/local/lib/mecab/dic/mecab-ipadic-neologd"),
             extract_pattern = "名詞|形容詞")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment