Last active
October 30, 2022 12:34
-
-
Save NeroHin/f65214266f3a35027a381fb2832467e6 to your computer and use it in GitHub Desktop.
regex_sub_python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import re" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 綜合題" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"91.8>92>92.3( .)\n", | |
"91.8>92>92.3\n", | |
"91.8\n" | |
] | |
} | |
], | |
"source": [ | |
"all_mix = '91.8>92>92.3(by Dr.林)'\n", | |
"\n", | |
"# 先去除中英文混合的情況\n", | |
"del_eng_ch = re.sub('[a-zA-Z\\u4e00-\\u9fff]+', '', all_mix)\n", | |
"print(del_eng_ch)\n", | |
"\n", | |
"# output: 91.8>92>92.3( .)\n", | |
"\n", | |
"# 去除 () 內的內容\n", | |
"del_bracket = re.sub('\\(.*?\\)', '', del_eng_ch)\n", | |
"print(del_bracket)\n", | |
"\n", | |
"# output: 91.8>92>92.3\n", | |
"\n", | |
"# 去除 > 內的內容\n", | |
"print(del_bracket.split('>')[0])\n", | |
"\n", | |
"# output: 91.8" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 處理英文" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"只處理英文\n", | |
"透析中止( .林)\n", | |
"處理()裡的內容\n", | |
"透析中止\n" | |
] | |
} | |
], | |
"source": [ | |
"ch_symbol = '透析中止(by Dr.林)'\n", | |
"print('只處理英文')\n", | |
"print(re.sub(r'[a-zA-Z]', '', ch_symbol))\n", | |
"\n", | |
"# output : 透析中止( .林)\n", | |
"\n", | |
"print('處理()裡的內容')\n", | |
"print(re.sub(r'[a-zA-Z]', '', ch_symbol).strip().split('(')[0])\n", | |
"\n", | |
"# output : 透析中止\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 處理中文加數字" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"處理中文加數字\n", | |
"55.5改為55.6\n", | |
"先將中文替換成符號\n", | |
"55.5>55.6\n", | |
"再將符號切割\n", | |
"55.6\n" | |
] | |
} | |
], | |
"source": [ | |
"ch_symbol = '55.5改為55.6'\n", | |
"\n", | |
"print('處理中文加數字')\n", | |
"print(ch_symbol)\n", | |
"print('先將中文替換成符號')\n", | |
"print(re.sub(r'[\\u4e00-\\u9fff]+','>', ch_symbol))\n", | |
"\n", | |
"# output: 55.5>55.6\n", | |
"\n", | |
"print('再將符號切割')\n", | |
"print(re.sub(r'[\\u4e00-\\u9fff]+','>', ch_symbol).split('>')[-1])\n", | |
"\n", | |
"# output: 55.6" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 處理中英數混合例子" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"全部去掉\n", | |
"54.3\n", | |
"保留數字和指定英文 kg\n", | |
"54.3kg\n" | |
] | |
} | |
], | |
"source": [ | |
"ch_eng = 'TRY改為54.3'\n", | |
"ch_eng_2 = 'TRY改為54.3kg'\n", | |
"\n", | |
"# 全部去掉\n", | |
"print('全部去掉')\n", | |
"print(re.sub(r'[^0-9.]', '', ch_eng)) \n", | |
"\n", | |
"# output: 54.3\n", | |
" \n", | |
"# 保留數字\n", | |
"print('保留數字和指定英文 kg')\n", | |
"print(re.sub(r'[^0-9kg.]', '', ch_eng_2))\n", | |
"\n", | |
"# output: 54.3kg" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 去除符號和文字" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"去掉文字和符號\n", | |
"3.53.332.8\n", | |
"先去掉文字\n", | |
"3.5>3.3>3>2.8\n", | |
"再以 > 分割\n", | |
"2.8\n" | |
] | |
} | |
], | |
"source": [ | |
"num_symbol = '3.5>3.3>3>2.8(final)'\n", | |
"\n", | |
"# 去掉符號\n", | |
"print('去掉文字和符號')\n", | |
"print(re.sub(r'[^0-9.]', '', num_symbol))\n", | |
"# output: 3.53.332.8\n", | |
"\n", | |
"# 去掉符號並以 > 分割\n", | |
"print('先去掉文字')\n", | |
"split_symbol = re.sub(r'[^0-9.>]', '', num_symbol)\n", | |
"print(split_symbol)\n", | |
"\n", | |
"# output: 3.5>3.3>3>2.8\n", | |
"\n", | |
"print('再以 > 分割')\n", | |
"print(split_symbol.split('>')[-1])\n", | |
"\n", | |
"# output: 2.8\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 結合 Pandas 和 Series" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.2>1.3>1.4(by 護理長)</td>\n", | |
" <td>1.2>1.3>1.6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1.2>1.3>1.5</td>\n", | |
" <td>1.2>1.3>1.7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B\n", | |
"0 1.2>1.3>1.4(by 護理長) 1.2>1.3>1.6\n", | |
"1 1.2>1.3>1.5 1.2>1.3>1.7" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame({\n", | |
" 'A' : ['1.2>1.3>1.4(by 護理長)', '1.2>1.3>1.5', '1.2>1.3>1.9'],\n", | |
" 'B' : ['1.2>1.3>1.6', '1.2>1.3>1.7', '1.2>1.3>1.8(by 護理長)'],\n", | |
"})\n", | |
"\n", | |
"df.head(2)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Pandas Series" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 1.4\n", | |
"1 1.5\n", | |
"2 1.9\n", | |
"Name: A, dtype: object" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"strip_split_symbol = lambda x: re.sub(r'[^0-9.>]', '', x).split('>')[-1]\n", | |
"\n", | |
"df['A'] = df['A'].apply(strip_split_symbol)\n", | |
"\n", | |
"df.A" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Pandas DataFrame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# with dataframe\n", | |
"for col in df.columns:\n", | |
" df[col] = df[col].apply(strip_split_symbol)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.4</td>\n", | |
" <td>1.6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1.5</td>\n", | |
" <td>1.7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1.9</td>\n", | |
" <td>1.8</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" A B\n", | |
"0 1.4 1.6\n", | |
"1 1.5 1.7\n", | |
"2 1.9 1.8" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.8.12 ('api_server')", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.12" | |
}, | |
"orig_nbformat": 4, | |
"vscode": { | |
"interpreter": { | |
"hash": "8ba975e9754e85fb30fc4fec1d82e9cb6c3031376e8f82bbcac6e0a1936ff599" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment