Skip to content

Instantly share code, notes, and snippets.

@hecomi
Created February 7, 2012 14:43
Show Gist options
  • Save hecomi/1759995 to your computer and use it in GitHub Desktop.
Save hecomi/1759995 to your computer and use it in GitHub Desktop.
文章をローマ字読みに変換
#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/regex.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <unicode/translit.h>
#include <mecab.h>
namespace adaptors = boost::adaptors;
using adaptors::transformed;
namespace qi = boost::spirit::qi;
using qi::standard_wide::char_;
using qi::_1;
using qi::_val;
/* ------------------------------------------------------------------------- */
// 文字列 --> カタカナ
// (e.g. イースター島 --> イースタートー)
/* ------------------------------------------------------------------------- */
// Functor: extracts the katakana pronunciation from one MeCab feature string.
//
// The feature string is treated as a comma-separated record. Field 8 is
// returned; when field 1 equals "格助詞" (case-marking particle) an '@' marker
// is appended to the returned text.
// NOTE(review): the field positions assume an IPADIC-style feature layout
// (field 1 = part-of-speech subcategory, field 8 = pronunciation) — confirm
// against the dictionary actually in use.
//
// Records with fewer than nine fields (e.g. words the dictionary has no
// pronunciation for) yield an empty string instead of indexing out of range.
// Quoted fields containing commas are not handled.
struct mecab_result_to_kana
{
    typedef std::string result_type;

    result_type operator() (const result_type& str) const {
        typedef std::vector<result_type>::size_type index_type;
        const index_type subcategory_index   = 1;
        const index_type pronunciation_index = 8;

        // Split on ','. Empty fields are kept so that positions stay aligned.
        std::vector<result_type> fields;
        result_type::size_type begin = 0;
        for (;;) {
            const result_type::size_type comma = str.find(',', begin);
            if (comma == result_type::npos) {
                fields.push_back(str.substr(begin));
                break;
            }
            fields.push_back(str.substr(begin, comma - begin));
            begin = comma + 1;
        }

        // No pronunciation field available: nothing to pronounce.
        if (fields.size() <= pronunciation_index) {
            return result_type();
        }

        result_type kana = fields[pronunciation_index];
        if (fields[subcategory_index] == "格助詞") {
            kana += "@";
        }

        // Debug trace of the extracted reading.
        std::cout << kana << '\n';
        return kana;
    }
};
/* ------------------------------------------------------------------------- */
// カタカナ --> ローマ字
// (e.g. イースター島 --> i:suta:to:)
/* ------------------------------------------------------------------------- */
struct kana2yomi
{
typedef std::string result_type;
result_type operator() (const result_type& str) const {
UnicodeString input = str.c_str();
// 「ン」をマーキング
input.findAndReplace("ン", "[ン]");
// カタカナ --> Latin 変換
UErrorCode error = U_ZERO_ERROR;
boost::shared_ptr<Transliterator> t(
Transliterator::createInstance("Katakana-Latin", UTRANS_FORWARD, error)
);
t->transliterate(input);
// 伸ばす音の表記変更 + マーキングしたンをNにする + 「つ」を「q」にする
std::map<UnicodeString, UnicodeString> long_map = {
{"\u0101","a:"},
{"\u0113","i:"},
{"\u012B","u:"},
{"\u014D","e:"},
{"\u014D","o:"},
{"[n]", "N"},
{"~", "q"}
};
for (const auto& x : long_map) {
input.findAndReplace(x.first, x.second);
}
// 変換結果取得
size_t length = input.length();
char* result = new char[length + 1];
input.extract(0, length, result, "utf8");
std::cout << result << std::endl;
return result;
}
};
/* ------------------------------------------------------------------------- */
// ローマ字を Julius の voca ファイル用に整形
// (e.g. i:suta:to: --> i: s u t a: t o:)
/* ------------------------------------------------------------------------- */
struct insert_space
{
typedef std::string result_type;
result_type operator() (const result_type& str) const {
std::string result(str);
std::map<std::string, std::string> regex_map =
{
{"[aiueoNq]:?", "$0 "},
{"[^aiueoNq]{1,2}", "$0 "},
{"[^a-zN:@]", ""},
{"\\s+", " "},
};
for (const auto& x : regex_map) {
boost::regex r(x.first);
result = boost::regex_replace(result, r, x.second, boost::format_all);
}
return result;
}
};
/* ------------------------------------------------------------------------- */
// Main
/* ------------------------------------------------------------------------- */
// Entry point: analyzes a sentence with MeCab and prints its romanized,
// space-separated reading.
//
// The sentence is taken from argv[1] when given; otherwise the built-in
// sample sentence is used. Returns 0 on success, 1 if MeCab cannot be
// initialized or fails to parse the input.
int main(int argc, char* argv[])
{
    const std::string input = (argc > 1) ? argv[1] : "三位になった";

    // Morphological analysis with MeCab.
    boost::shared_ptr<MeCab::Tagger> tagger(MeCab::createTagger(""));
    if (!tagger) {
        std::cerr << "failed to create MeCab tagger" << std::endl;
        return 1;
    }
    const MeCab::Node* node = tagger->parseToNode(input.c_str());
    if (!node) {
        std::cerr << "MeCab parse failed: " << tagger->what() << std::endl;
        return 1;
    }

    // Collect the feature string of every node except the first and last
    // nodes of the list (the loop starts at the second node and stops before
    // the node that has no successor).
    std::vector<std::string> features;
    for (node = node->next; node && node->next; node = node->next) {
        features.push_back(node->feature);
        std::cout << node->feature << std::endl;
    }

    // feature string --> pronunciation --> romanized reading --> spaced units
    std::string s;
    for (
        const std::string& x
            : features | transformed(mecab_result_to_kana())
                       | transformed(kana2yomi())
                       | transformed(insert_space())
    ) {
        s += x;
    }
    std::cout << s << std::endl;
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment