Created
February 7, 2012 14:43
-
-
Save hecomi/1759995 to your computer and use it in GitHub Desktop.
文章をローマ字読みに変換
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/regex.hpp>
#include <boost/range/adaptor/transformed.hpp>
#include <unicode/translit.h>
#include <mecab.h>
namespace adaptors = boost::adaptors; | |
using adaptors::transformed; | |
namespace qi = boost::spirit::qi; | |
using qi::standard_wide::char_; | |
using qi::_1; | |
using qi::_val; | |
/* ------------------------------------------------------------------------- */ | |
// 文字列 --> カタカナ | |
// (e.g. イースター島 --> イースタートー) | |
/* ------------------------------------------------------------------------- */ | |
struct mecab_result_to_kana | |
{ | |
typedef std::string result_type; | |
result_type operator() (const result_type& str) const { | |
std::vector<result_type> v; | |
result_type::const_iterator | |
first = str.begin(), | |
last = str.end(); | |
qi::parse(first, last, +(char_-',')%',', v); | |
if (v[1] == "格助詞") { | |
v[8] += "@"; | |
} | |
std::cout << v[8] << std::endl; | |
return v[8]; | |
} | |
}; | |
/* ------------------------------------------------------------------------- */ | |
// カタカナ --> ローマ字 | |
// (e.g. イースター島 --> i:suta:to:) | |
/* ------------------------------------------------------------------------- */ | |
struct kana2yomi | |
{ | |
typedef std::string result_type; | |
result_type operator() (const result_type& str) const { | |
UnicodeString input = str.c_str(); | |
// 「ン」をマーキング | |
input.findAndReplace("ン", "[ン]"); | |
// カタカナ --> Latin 変換 | |
UErrorCode error = U_ZERO_ERROR; | |
boost::shared_ptr<Transliterator> t( | |
Transliterator::createInstance("Katakana-Latin", UTRANS_FORWARD, error) | |
); | |
t->transliterate(input); | |
// 伸ばす音の表記変更 + マーキングしたンをNにする + 「つ」を「q」にする | |
std::map<UnicodeString, UnicodeString> long_map = { | |
{"\u0101","a:"}, | |
{"\u0113","i:"}, | |
{"\u012B","u:"}, | |
{"\u014D","e:"}, | |
{"\u014D","o:"}, | |
{"[n]", "N"}, | |
{"~", "q"} | |
}; | |
for (const auto& x : long_map) { | |
input.findAndReplace(x.first, x.second); | |
} | |
// 変換結果取得 | |
size_t length = input.length(); | |
char* result = new char[length + 1]; | |
input.extract(0, length, result, "utf8"); | |
std::cout << result << std::endl; | |
return result; | |
} | |
}; | |
/* ------------------------------------------------------------------------- */ | |
// ローマ字を Julius の voca ファイル用に整形 | |
// (e.g. i:suta:to: --> i: s u t a: t o:) | |
/* ------------------------------------------------------------------------- */ | |
struct insert_space | |
{ | |
typedef std::string result_type; | |
result_type operator() (const result_type& str) const { | |
std::string result(str); | |
std::map<std::string, std::string> regex_map = | |
{ | |
{"[aiueoNq]:?", "$0 "}, | |
{"[^aiueoNq]{1,2}", "$0 "}, | |
{"[^a-zN:@]", ""}, | |
{"\\s+", " "}, | |
}; | |
for (const auto& x : regex_map) { | |
boost::regex r(x.first); | |
result = boost::regex_replace(result, r, x.second, boost::format_all); | |
} | |
return result; | |
} | |
}; | |
/* ------------------------------------------------------------------------- */ | |
// Main | |
/* ------------------------------------------------------------------------- */ | |
int main(int argc, char* argv[]) | |
{ | |
// MeCab による形態素解析 | |
std::string input = "三位になった"; | |
boost::shared_ptr<MeCab::Tagger> tagger(MeCab::createTagger("")); | |
const MeCab::Node* node = tagger->parseToNode(input.c_str()); | |
// 結果をコンテナに突っ込む | |
std::vector<std::string> features; | |
for (node = node->next; node->next; node = node->next) { | |
features.push_back(node->feature); | |
std::cout << node->feature << std::endl; | |
} | |
// 発音箇所だけ取り出す | |
std::string s; | |
for ( | |
const std::string& x | |
: features | transformed(mecab_result_to_kana()) | |
| transformed(kana2yomi()) | |
| transformed(insert_space()) | |
) { | |
s += x; | |
} | |
std::cout << s << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment