Last active
June 17, 2020 03:50
-
-
Save simon2016bht/e51f5940ff931434c4fdd7bda719d3df to your computer and use it in GitHub Desktop.
tag_HSK_level.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.1" | |
}, | |
"colab": { | |
"name": "tag_HSK_level.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/simon2016bht/e51f5940ff931434c4fdd7bda719d3df/tag_hsk_level-1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tx9nlTGBFdOh", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 266 | |
}, | |
"outputId": "c6ec94fd-f658-4f2d-fafb-a56f87fb7c93" | |
}, | |
"source": [ | |
"print(\"Input some Chinese text:\")\n", | |
"text=input()\n", | |
"# text='一家人在吃饭。 儿子问:“爸爸,虫子能吃吗?” 爸爸说:“儿子, 妈妈做的饭菜好吃吗?” 儿子说:“很好吃!” 爸爸说:“那么,你好好吃饭。 吃饭的时候,不要说话。好吗?” 儿子说:“好的。”'\n", | |
"# text='在一个商店,他看到一只小猫,对不起,大家到了清华大学'\n", | |
"# print(text)\n", | |
"# ==============================\n", | |
"\n", | |
"!pip install wget termcolor jieba\n", | |
"import json\n", | |
"import termcolor\n", | |
"import jieba\n", | |
"import wget\n", | |
"from pathlib import Path\n", | |
"import shutil\n", | |
"import os.path\n", | |
"\n", | |
"# check if the HSK files exist \n", | |
"Path(\"./assets\").mkdir(parents=True, exist_ok=True)\n", | |
"if not os.path.exists('assets/hsk-level-1.json'):\n", | |
" wget.download('https://raw.githubusercontent.com/simon2016bht/TagHskWords/master/assets/hsk-level-1.json')\n", | |
" shutil.move('./hsk-level-1.json', 'assets/hsk-level-1.json')\n", | |
"if not os.path.exists('assets/hsk-level-2.json'):\n", | |
" wget.download('https://raw.githubusercontent.com/simon2016bht/TagHskWords/master/assets/hsk-level-2.json')\n", | |
" shutil.move('./hsk-level-2.json', 'assets/hsk-level-2.json')\n", | |
"if not os.path.exists('assets/hsk-level-3.json'):\n", | |
" wget.download('https://raw.githubusercontent.com/simon2016bht/TagHskWords/master/assets/hsk-level-3.json')\n", | |
" shutil.move('./hsk-level-3.json', 'assets/hsk-level-3.json')\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"\n", | |
"# load file contents into lists\n", | |
"with open('assets/hsk-level-1.json') as file:\n", | |
" hsk1_data = json.load(file)\n", | |
"hsk1_words = []\n", | |
"for item in hsk1_data:\n", | |
" hsk1_words.append(item['hanzi'])\n", | |
"\n", | |
"with open('assets/hsk-level-2.json') as file:\n", | |
" hsk2_data = json.load(file)\n", | |
"hsk2_words = []\n", | |
"for item in hsk2_data:\n", | |
" hsk2_words.append(item['hanzi'])\n", | |
"\n", | |
"with open('assets/hsk-level-3.json') as file:\n", | |
" hsk3_data = json.load(file)\n", | |
"hsk3_words = []\n", | |
"for item in hsk3_data:\n", | |
" hsk3_words.append(item['hanzi'])\n", | |
"# =================================\n", | |
"\n", | |
"# tag words which is of HSK 1,2,3\n", | |
"tagged_words_hsk1=[]\n", | |
"tagged_words_hsk2=[]\n", | |
"tagged_words_hsk3=[]\n", | |
"\n", | |
"# using jieba for word segmentation\n", | |
"for word in jieba.cut(text, cut_all=False):\n", | |
" #cut word in small pieces\n", | |
"# print(word,len(word))\n", | |
" # for each word output from jieba, check the subset of it\n", | |
" subset_of_word=[]\n", | |
" if len(word) >= 4:\n", | |
"# print(word,4)\n", | |
" subset_of_word.append(word[0])\n", | |
" subset_of_word.append(word[1])\n", | |
" subset_of_word.append(word[2])\n", | |
" subset_of_word.append(word[3])\n", | |
" subset_of_word.append(word[0:2])\n", | |
" subset_of_word.append(word[1:3])\n", | |
" subset_of_word.append(word[2:4])\n", | |
" subset_of_word.append(word[0:3])\n", | |
" subset_of_word.append(word[1:4])\n", | |
" elif len(word) >= 3:\n", | |
"# print(word,3)\n", | |
" subset_of_word.append(word[0])\n", | |
" subset_of_word.append(word[1])\n", | |
" subset_of_word.append(word[2])\n", | |
" subset_of_word.append(word[0:2])\n", | |
" subset_of_word.append(word[1:3])\n", | |
" elif len(word)>=2:\n", | |
"# print(word,2)\n", | |
" subset_of_word.append(word[0])\n", | |
" subset_of_word.append(word[1])\n", | |
"\n", | |
"# check the word directly from jieba \n", | |
" if word in hsk1_words and word not in tagged_words_hsk1:\n", | |
" tagged_words_hsk1.append(word)\n", | |
" elif word in hsk2_words and word not in tagged_words_hsk2:\n", | |
" tagged_words_hsk2.append(word)\n", | |
" elif word in hsk3_words and word not in tagged_words_hsk3:\n", | |
" tagged_words_hsk3.append(word)\n", | |
" \n", | |
" \n", | |
"# also check subset of the word \n", | |
" for i in subset_of_word:\n", | |
"# print(i)\n", | |
" if i in hsk1_words and i not in tagged_words_hsk1:\n", | |
" tagged_words_hsk1.append(i)\n", | |
" if i in hsk2_words and i not in tagged_words_hsk2:\n", | |
" tagged_words_hsk2.append(i)\n", | |
" if i in hsk3_words and i not in tagged_words_hsk3:\n", | |
" tagged_words_hsk3.append(i)\n", | |
"# print(\"=======================\")\n", | |
"# print(\"HSK1:\",tagged_words_hsk1)\n", | |
"# print(\"HSK2:\",tagged_words_hsk2)\n", | |
"# print(\"HSK3:\",tagged_words_hsk3)\n", | |
"\n", | |
"# ====================================\n", | |
"\n", | |
"\n", | |
"# Create list of flags for each HSK level\n", | |
"\n", | |
"# initialize flag as list of 0\n", | |
"hsk1_flag=[0]*len(text)\n", | |
"hsk2_flag=[0]*len(text)\n", | |
"hsk3_flag=[0]*len(text)\n", | |
"\n", | |
"\n", | |
"## flag a slice of list according to the length of the HSK word\n", | |
"def tag(flag_list_name,starting_position, length, hsk_level):\n", | |
" for i in range(length):\n", | |
" flag_list_name[starting_position+i]=hsk_level\n", | |
" None\n", | |
"\n", | |
"# going through the text\n", | |
"for cursor_position in enumerate(text):\n", | |
" # test word from one syllable to 4 syllables, flag of longer word will override short word in the same level\n", | |
" window=text[cursor_position[0]:cursor_position[0]+4]\n", | |
" # check if the word size is as expected; avoid out of range problems at the end of the text\n", | |
" if len(window) != 4:\n", | |
" None\n", | |
" elif window in tagged_words_hsk1:\n", | |
" tag(hsk1_flag,cursor_position[0],4,1)\n", | |
" elif window in tagged_words_hsk2:\n", | |
" tag(hsk2_flag,cursor_position[0],4,2)\n", | |
" elif window in tagged_words_hsk3:\n", | |
" tag(hsk3_flag,cursor_position[0],4,3)\n", | |
" \n", | |
" window=text[cursor_position[0]:cursor_position[0]+3] \n", | |
" if len(window) != 3:\n", | |
" None \n", | |
" elif window in tagged_words_hsk1:\n", | |
"# print(window) \n", | |
" tag(hsk1_flag,cursor_position[0],3,1)\n", | |
" elif window in tagged_words_hsk2:\n", | |
" tag(hsk2_flag,cursor_position[0],3,2)\n", | |
" elif window in tagged_words_hsk3:\n", | |
" tag(hsk3_flag,cursor_position[0],3,3)\n", | |
"\n", | |
" window=text[cursor_position[0]:cursor_position[0]+2]\n", | |
" if len(window) != 2:\n", | |
" None\n", | |
" elif window in tagged_words_hsk1:\n", | |
"# print(window) \n", | |
" tag(hsk1_flag,cursor_position[0],2,1)\n", | |
" elif window in tagged_words_hsk2:\n", | |
" tag(hsk2_flag,cursor_position[0],2,2)\n", | |
" elif window in tagged_words_hsk3:\n", | |
" tag(hsk3_flag,cursor_position[0],2,3)\n", | |
"\n", | |
" window=text[cursor_position[0]:cursor_position[0]+1] \n", | |
" if window in tagged_words_hsk1:\n", | |
" tag(hsk1_flag,cursor_position[0],1,1)\n", | |
" elif window in tagged_words_hsk2:\n", | |
" tag(hsk2_flag,cursor_position[0],1,2)\n", | |
" elif window in tagged_words_hsk3:\n", | |
" tag(hsk3_flag,cursor_position[0],1,3)\n", | |
"\n", | |
"\n", | |
"# # check tagging result for each HSK level\n", | |
"# for i in enumerate(text):\n", | |
"# print(i[0],text[i[0]],hsk1_flag[i[0]], hsk2_flag[i[0]], hsk3_flag[i[0]])\n", | |
"\n", | |
"\n", | |
"# ======================================\n", | |
"\n", | |
"## combine flags and assign font color and background color to each character\n", | |
"# Available text colors: red, green, yellow, blue, magenta, cyan, white.\n", | |
"HSK1_color = 'red'\n", | |
"HSK2_color = 'green'\n", | |
"HSK3_color = 'yellow'\n", | |
"\n", | |
"combined_flag = []\n", | |
"for i in enumerate(text):\n", | |
" d = {'character':text[i[0]],'font_color':None, 'bg_color':None}\n", | |
" combined_flag.append(d)\n", | |
"\n", | |
"for (cursor_position,character) in enumerate(text):\n", | |
"# print(cursor_position, character,hsk1_flag[cursor_position])\n", | |
" if hsk1_flag[cursor_position] != 0:\n", | |
" combined_flag[cursor_position]['font_color'] = HSK1_color\n", | |
" # for higher HSK level word, first check if it is already tagged. If so, using background color.\n", | |
" if hsk2_flag[cursor_position] != 0:\n", | |
" if combined_flag[cursor_position]['font_color'] == None:\n", | |
" combined_flag[cursor_position]['font_color'] = HSK2_color\n", | |
" elif combined_flag[cursor_position]['bg_color'] == None:\n", | |
" combined_flag[cursor_position]['bg_color'] = 'on_' + HSK2_color\n", | |
"\n", | |
" if hsk3_flag[cursor_position] != 0:\n", | |
" if combined_flag[cursor_position]['font_color'] == None:\n", | |
" combined_flag[cursor_position]['font_color'] = HSK3_color\n", | |
" elif combined_flag[cursor_position]['bg_color'] == None:\n", | |
" combined_flag[cursor_position]['bg_color'] = 'on_' + HSK3_color\n", | |
" \n", | |
"# =======================================\n", | |
"\n", | |
"\n", | |
"# output text according to the combined flag\n", | |
"print(\"Colored text (red for HSK1, green for HSK2, yellow for HSK3):\\n---\")\n", | |
"for i in enumerate(text):\n", | |
"# print(i,combined_flag[i[0]]['font_color'], combined_flag[i[0]]['bg_color'])\n", | |
" colored_word = termcolor.colored(i[1], color=combined_flag[i[0]]['font_color'], on_color=combined_flag[i[0]]['bg_color'])\n", | |
" print(colored_word, end=\"\")\n", | |
"print(\"\\n---\")\n", | |
"print('HSK3 words:', tagged_words_hsk3)\n", | |
"print('HSK2 words:', tagged_words_hsk2)\n", | |
"print('HSK1 words:', tagged_words_hsk1)" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Input some Chinese text:\n", | |
"我说爸爸的手表可不可以跑慢一点 给他更多的时间可以多看一点, 早上可以晚一点,起床多睡一点, (少累一点)我只希望这一点一点 我说爸爸的手表可不可以跑慢一点 给他更多的时间到处多转一些, 可以少担一点,我的心放松一点 我只希望这一点一点\n", | |
"Requirement already satisfied: wget in /usr/local/lib/python3.6/dist-packages (3.2)\n", | |
"Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (1.1.0)\n", | |
"Requirement already satisfied: jieba in /usr/local/lib/python3.6/dist-packages (0.42.1)\n", | |
"Colored text (red for HSK1, green for HSK2, yellow for HSK3):\n", | |
"---\n", | |
"\u001b[31m我\u001b[0m\u001b[31m说\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m的\u001b[0m\u001b[32m手\u001b[0m\u001b[32m表\u001b[0m可\u001b[0m\u001b[31m不\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[32m跑\u001b[0m\u001b[32m慢\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[32m给\u001b[0m\u001b[31m他\u001b[0m\u001b[33m更\u001b[0m\u001b[31m多\u001b[0m\u001b[31m的\u001b[0m\u001b[42m\u001b[31m时\u001b[0m\u001b[32m间\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[31m多\u001b[0m\u001b[31m看\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m \u001b[0m\u001b[32m早\u001b[0m\u001b[42m\u001b[31m上\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[32m晚\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m\u001b[32m起\u001b[0m\u001b[32m床\u001b[0m\u001b[31m多\u001b[0m\u001b[31m睡\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m \u001b[0m(\u001b[0m\u001b[31m少\u001b[0m\u001b[32m累\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m)\u001b[0m\u001b[31m我\u001b[0m\u001b[33m只\u001b[0m\u001b[32m希\u001b[0m\u001b[32m望\u001b[0m\u001b[31m这\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[31m我\u001b[0m\u001b[31m说\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m的\u001b[0m\u001b[32m手\u001b[0m\u001b[32m表\u001b[0m可\u001b[0m\u001b[31m不\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[32m跑\u001b[0m\u001b[32m慢\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[32m给\u001b[0m\u001b[31m他\u001b[0m\u001b[33m更\u001b[0m\u001b[31m多\u001b[0m\u001b[31m的\u001b[0m\u001b[42m\u001b[31m时\u001b[0m\u001b[32m间\u001b[0m\u001b[32m到\u001b[0m处\u001b[0m\u001b[31m多\u001b[0m转\u001b[0m\u001b[31m一\u001b[0m\u001b[31m些\u001b[0m,\u001b[0m \u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[31m少\u001b[0m担\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m\u001b[31m我\u001b[0m\u001b[31m的\u001b[0m心\u001b[0m\u001b[33m放\u001b[0m松\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[31m我\u001b[0m\u001b[33m只\u001b[0m\u001b[32m希\u001b[0m\u001b[32m望\u001b[0m\u001b[31m这\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m\n", | |
"---\n", | |
"HSK3 words: ['更', '只', '放']\n", | |
"HSK2 words: ['手表', '手', '可以', '跑', '慢', '给', '时间', '间', '早上', '早', '晚', '起床', '起', '累', '希望', '到']\n", | |
"HSK1 words: ['我', '说', '爸爸', '的', '不', '一', '点', '他', '多', '时', '看', '上', '睡', '少', '这', '些']\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Bzw1nausIT0t", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "uB95a01iIUWB", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment