Last active
July 23, 2017 00:04
-
-
Save northface/ee3ecc95f4c320e663e76d3daaa34d9a to your computer and use it in GitHub Desktop.
濁点「゛」・半濁点「゜」のある文字が一文字として扱われない時に置換するための方法(例:「ば」(1文字)→「は゛」(2文字)のようなケース)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"toc": "true" | |
}, | |
"source": [ | |
"# Table of Contents\n", | |
" <p><div class=\"lev1 toc-item\"><a href=\"#事前調査\" data-toc-modified-id=\"事前調査-1\"><span class=\"toc-item-num\">1 </span>事前調査</a></div><div class=\"lev1 toc-item\"><a href=\"#置換辞書作成\" data-toc-modified-id=\"置換辞書作成-2\"><span class=\"toc-item-num\">2 </span>置換辞書作成</a></div><div class=\"lev1 toc-item\"><a href=\"#ファイル内の置換\" data-toc-modified-id=\"ファイル内の置換-3\"><span class=\"toc-item-num\">3 </span>ファイル内の置換</a></div><div class=\"lev2 toc-item\"><a href=\"#テキストファイルの場合\" data-toc-modified-id=\"テキストファイルの場合-31\"><span class=\"toc-item-num\">3.1 </span>テキストファイルの場合</a></div><div class=\"lev2 toc-item\"><a href=\"#Excelファイルの場合\" data-toc-modified-id=\"Excelファイルの場合-32\"><span class=\"toc-item-num\">3.2 </span>Excelファイルの場合</a></div>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 事前調査" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"UTF-8\n" | |
] | |
} | |
], | |
"source": [ | |
"import sys\n", | |
"print(sys.stdout.encoding)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# -*- coding: utf-8 -*- \n", | |
"t= \"テ\"\n", | |
"t1 = \"デ1\" #濁点が分かれていないっぽいやつ \n", | |
"t2 = \"デ2\" #濁点が分かれているっぽいやつ\n", | |
"h=\"ヘ\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\x86'" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\x871'" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t1.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\x86\\xe3\\x82\\x992'" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"t2.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"bytes" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"type(t1.encode('utf-8'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'デ'" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b'\\xe3\\x83\\x87'.decode()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'デ'" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b'\\xe3\\x83\\x86\\xe3\\x82\\x99'.decode()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'テ'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b'\\xe3\\x83\\x86'.decode()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'゙'" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b'\\xe3\\x82\\x99'.decode()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\x98'" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"h.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x81\\x86\\xe3\\x82\\x9b'" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'う゛'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x81\\x86'" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'う'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'ゔ'" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b'\\xe3\\x81\\x86\\xe3\\x82\\x99'.decode() #「う」に点々" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x82\\xa6'" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'ウ'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\xb4'" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'ヴ'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'ヴ'" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"b'\\xe3\\x82\\xa6\\xe3\\x82\\x99'.decode()#「ウ」に点々" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# 置換辞書作成" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Build the replacement table: decomposed kana (base + combining mark) -> precomposed kana.\n", | |
"# Reference: http://d.hatena.ne.jp/kamosawa/20151015\n", | |
"# (Hatena blog post by kamosawa, 2015-10-15, on joining separately encoded dakuten/handakuten)\n", | |
"#\n", | |
"# The pairs are derived from the Unicode database rather than from hand-typed kana lists\n", | |
"# and `ord(c) + 1` / `ord(c) + 2` offsets. This also covers characters the offset trick\n", | |
"# cannot reach, such as hiragana/katakana VU and the voiced iteration marks.\n", | |
"import unicodedata\n", | |
"\n", | |
"COMBINING_DAKUTEN = '\\u3099'     # COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK\n", | |
"COMBINING_HANDAKUTEN = '\\u309a'  # COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK\n", | |
"\n", | |
"repdict = {}\n", | |
"for codepoint in range(0x3041, 0x3100):  # Hiragana and Katakana blocks\n", | |
"    composed = chr(codepoint)\n", | |
"    decomposed = unicodedata.normalize('NFD', composed)\n", | |
"    # Keep only characters that split into exactly one base kana plus one combining mark.\n", | |
"    if len(decomposed) == 2 and decomposed[1] in (COMBINING_DAKUTEN, COMBINING_HANDAKUTEN):\n", | |
"        repdict[decomposed] = composed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'が': 'が',\n", | |
" 'ぎ': 'ぎ',\n", | |
" 'ぐ': 'ぐ',\n", | |
" 'げ': 'げ',\n", | |
" 'ご': 'ご',\n", | |
" 'ざ': 'ざ',\n", | |
" 'じ': 'じ',\n", | |
" 'ず': 'ず',\n", | |
" 'ぜ': 'ぜ',\n", | |
" 'ぞ': 'ぞ',\n", | |
" 'だ': 'だ',\n", | |
" 'ぢ': 'ぢ',\n", | |
" 'づ': 'づ',\n", | |
" 'で': 'で',\n", | |
" 'ど': 'ど',\n", | |
" 'ば': 'ば',\n", | |
" 'ぱ': 'ぱ',\n", | |
" 'び': 'び',\n", | |
" 'ぴ': 'ぴ',\n", | |
" 'ぶ': 'ぶ',\n", | |
" 'ぷ': 'ぷ',\n", | |
" 'べ': 'べ',\n", | |
" 'ぺ': 'ぺ',\n", | |
" 'ぼ': 'ぼ',\n", | |
" 'ぽ': 'ぽ',\n", | |
" 'ヴ': 'ヴ',\n", | |
" 'ガ': 'ガ',\n", | |
" 'ギ': 'ギ',\n", | |
" 'グ': 'グ',\n", | |
" 'ゲ': 'ゲ',\n", | |
" 'ゴ': 'ゴ',\n", | |
" 'ザ': 'ザ',\n", | |
" 'ジ': 'ジ',\n", | |
" 'ズ': 'ズ',\n", | |
" 'ゼ': 'ゼ',\n", | |
" 'ゾ': 'ゾ',\n", | |
" 'ダ': 'ダ',\n", | |
" 'ヂ': 'ヂ',\n", | |
" 'ヅ': 'ヅ',\n", | |
" 'デ': 'デ',\n", | |
" 'ド': 'ド',\n", | |
" 'バ': 'バ',\n", | |
" 'パ': 'パ',\n", | |
" 'ビ': 'ビ',\n", | |
" 'ピ': 'ピ',\n", | |
" 'ブ': 'ブ',\n", | |
" 'プ': 'プ',\n", | |
" 'ベ': 'ベ',\n", | |
" 'ペ': 'ペ',\n", | |
" 'ボ': 'ボ',\n", | |
" 'ポ': 'ポ'}" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"repdict" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# ファイル内の置換" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## テキストファイルの場合" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Text file case: read the whole file as UTF-8 instead of relying on the platform's\n", | |
"# default encoding; the context manager closes the file even if reading fails.\n", | |
"with open('dakutensample.txt', encoding='utf-8') as f:\n", | |
"    contents = f.read()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'ゼロからはじめるデータサイエンス_oreilly-978-4-87311-786-7e.pdf\\n• ユーザーは定期的にオンライン状態のデータ同期を実行することにより、オフライン状態でもデータの登録・編集を含めたシステムの利用が可能とする \\n• PC環境はWebアプリケーションを前提とするが、クライアント側アプリケーションのインストールが必要な場合は明記すること '" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"contents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Swap every decomposed kana sequence in the text for its precomposed single character.\n", | |
"for decomposed, composed in repdict.items():\n", | |
"    contents = contents.replace(decomposed, composed)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Write the converted text as UTF-8 (not the platform default encoding); the context\n", | |
"# manager flushes and closes the file even if the write fails.\n", | |
"with open('dakutenresults.txt', 'w', encoding='utf-8') as w:\n", | |
"    w.write(contents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'ゼロからはじめるデータサイエンス_oreilly-978-4-87311-786-7e.pdf\\n• ユーザーは定期的にオンライン状態のデータ同期を実行することにより、オフライン状態でもデータの登録・編集を含めたシステムの利用が可能とする \\n• PC環境はWebアプリケーションを前提とするが、クライアント側アプリケーションのインストールが必要な場合は明記すること '" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"contents" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"実際に確かめてみる(置換前は基底文字+結合濁点 U+3099 の2コードポイント、置換後は合成済みの1コードポイントになっていることを UTF-8 のバイト列で確認する)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x82\\xb5\\xe3\\x82\\x99'" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'ザ'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x82\\xb6'" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'ザ'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Excelファイルの場合" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from openpyxl import load_workbook" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def henkan(ws, mapping=None):\n", | |
"    \"\"\"Replace decomposed kana with precomposed kana in every text cell of a worksheet.\n", | |
"\n", | |
"    Args:\n", | |
"        ws: Worksheet-like object whose ``rows`` attribute yields sequences of cells\n", | |
"            exposing a read/write ``value`` attribute.\n", | |
"        mapping: Optional dict of {decomposed sequence: precomposed character}.\n", | |
"            When omitted, the notebook-level ``repdict`` is used.\n", | |
"\n", | |
"    Returns:\n", | |
"        None. Cells are modified in place.\n", | |
"    \"\"\"\n", | |
"    if mapping is None:\n", | |
"        mapping = repdict\n", | |
"\n", | |
"    for row in ws.rows:\n", | |
"        for cell in row:\n", | |
"            text = cell.value\n", | |
"            # Only strings can hold kana. Empty, numeric, boolean and date cells have no\n", | |
"            # compatible str.replace, so they are left exactly as they are.\n", | |
"            if not isinstance(text, str):\n", | |
"                continue\n", | |
"            fixed = text\n", | |
"            for decomposed, composed in mapping.items():\n", | |
"                fixed = fixed.replace(decomposed, composed)\n", | |
"            # Write back only real changes so untouched cells are never reassigned.\n", | |
"            if fixed != text:\n", | |
"                cell.value = fixed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"wb = load_workbook(filename = \"excelsample.xlsx\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Run the kana conversion on every sheet, then save the workbook under a new name.\n", | |
"for sheet_name in wb.sheetnames:\n", | |
"    henkan(wb[sheet_name])\n", | |
"\n", | |
"wb.save(filename='results.xlsx')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\x90'" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'バ'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"b'\\xe3\\x83\\x9d'" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'ポ'.encode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.0" | |
}, | |
"toc": { | |
"colors": { | |
"hover_highlight": "#DAA520", | |
"running_highlight": "#FF0000", | |
"selected_highlight": "#FFD700" | |
}, | |
"moveMenuLeft": true, | |
"nav_menu": { | |
"height": "102px", | |
"width": "252px" | |
}, | |
"navigate_menu": true, | |
"number_sections": true, | |
"sideBar": true, | |
"threshold": 4, | |
"toc_cell": true, | |
"toc_section_display": "block", | |
"toc_window_display": true, | |
"widenNotebook": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment