2021.03.15. KcBERT MLM Finetune with Petition Dataset
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "2021.03.15. KcBERT MLM Finetune with Petition Dataset",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"machine_shape": "hm",
"authorship_tag": "ABX9TyPkJnRAn/rZZxXG+EirhdSe",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"9faa5adcf806489793a7d149b1b42b65": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_aa687d20633e4f4983e8050973f179d6",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_a8fd2c7b26f94d3b92f8907c2107ff55",
"IPY_MODEL_05b38759833a41d3bfb17475d3ad7f01",
"IPY_MODEL_2c9648eda92f477f83700bd0db7e473c"
]
}
},
"aa687d20633e4f4983e8050973f179d6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a8fd2c7b26f94d3b92f8907c2107ff55": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_98872de6b08d42329de1d89dac042d33",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_77b77cb3df4a42cf9ef7e2226d800121"
}
},
"05b38759833a41d3bfb17475d3ad7f01": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_0bf215884d134117acd1ab1bcf87d4b7",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 20,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 20,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_96830e13310f444bb887c797f95b0a6f"
}
},
"2c9648eda92f477f83700bd0db7e473c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c12e47c0f60049c181b6eeac324e7c0d",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 20/20 [00:09<00:00, 2.55it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_089f6882066b46eaa9cc92fd043db7d9"
}
},
"98872de6b08d42329de1d89dac042d33": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"77b77cb3df4a42cf9ef7e2226d800121": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0bf215884d134117acd1ab1bcf87d4b7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"96830e13310f444bb887c797f95b0a6f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c12e47c0f60049c181b6eeac324e7c0d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"089f6882066b46eaa9cc92fd043db7d9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7a0d1e56284b417c8daff6471e4dee81": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_e1df92b7c6eb468eb4a9c73aff0d828a",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_5796de99da2b473a8c73bfe696825dc7",
"IPY_MODEL_7f97c030c7534310a7d47dbf422f56bd",
"IPY_MODEL_930cd013a7bf45b2849b8963f8c7f7a4"
]
}
},
"e1df92b7c6eb468eb4a9c73aff0d828a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5796de99da2b473a8c73bfe696825dc7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5d0dd0a218ef4d75926dd84812d7f7ee",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3ebf901c3e5d4f49b7ac8e65bac8f3be"
}
},
"7f97c030c7534310a7d47dbf422f56bd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_98600149fa204fe8ae64cb288548f5c3",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 3704,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 3704,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d89775056ac8482c9b52c638215036bb"
}
},
"930cd013a7bf45b2849b8963f8c7f7a4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b851d679ea904e4f874794aee2b9c741",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 3704/3704 [01:24<00:00, 57.08it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b55df5f244f744389204e6446cd74e16"
}
},
"5d0dd0a218ef4d75926dd84812d7f7ee": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"3ebf901c3e5d4f49b7ac8e65bac8f3be": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"98600149fa204fe8ae64cb288548f5c3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"d89775056ac8482c9b52c638215036bb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b851d679ea904e4f874794aee2b9c741": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b55df5f244f744389204e6446cd74e16": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Beomi/cc87708b978a4f4ed87793762c019866/2021-03-15-kcbert-mlm-finetune-with-petition-dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XxaDw3JIXrDf"
},
"source": [
"# 필요한 패키지 설치\n",
"\n",
"- Korpora: 데이터셋 다운로드\n",
"- emoji: 이모지코드\n",
"- soynlp: Preprocesisng\n",
"- kss: 한국어 문장 분리기\n",
"- transformers: MLM 학습 및 데이터셋\n",
" - datasets\n",
" - protobuf\n",
" - sentencepiece"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lqA2SU2sWqjR"
},
"source": [
"!pip install -q Korpora emoji soynlp kss transformers \"datasets >= 1.1.3\" \"sentencepiece != 0.1.92\" protobuf"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0nAbFcVFXzV4"
},
"source": [
"# 예시용 데이터셋 다운로드\n",
"\n",
"- 여기서는 Korean petitions dataset(국민청원 데이터셋)을 사용\n",
"- 전체 중 동의 수가 1000건 초과인 본문만 사용"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3Z7ji3QnW7JG"
},
"source": [
"from Korpora import Korpora"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "erTvWvZCW91J",
"outputId": "b673e35a-14f2-4acd-9a4a-f9c01bd22c42"
},
"source": [
"Korpora.fetch('korean_petitions', root_dir='./Korpora')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-08\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-09\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-10\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-11\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-12\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-01\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-02\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-03\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-04\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-05\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-06\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-07\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-08\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-09\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-10\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-11\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-12\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-01\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-02\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-03\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Gk4EXcX3W-LO"
},
"source": [
"from glob import glob"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XPQDC74FXAxb",
"outputId": "f87e9593-81c5-4104-eeed-37ad5917c290"
},
"source": [
"dataset = glob('./Korpora/korean_petitions/petitions*')\n",
"dataset"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['./Korpora/korean_petitions/petitions_2018-07',\n",
" './Korpora/korean_petitions/petitions_2019-01',\n",
" './Korpora/korean_petitions/petitions_2017-11',\n",
" './Korpora/korean_petitions/petitions_2018-01',\n",
" './Korpora/korean_petitions/petitions_2017-09',\n",
" './Korpora/korean_petitions/petitions_2018-06',\n",
" './Korpora/korean_petitions/petitions_2018-10',\n",
" './Korpora/korean_petitions/petitions_2018-11',\n",
" './Korpora/korean_petitions/petitions_2017-12',\n",
" './Korpora/korean_petitions/petitions_2019-02',\n",
" './Korpora/korean_petitions/petitions_2018-02',\n",
" './Korpora/korean_petitions/petitions_2018-08',\n",
" './Korpora/korean_petitions/petitions_2018-04',\n",
" './Korpora/korean_petitions/petitions_2019-03',\n",
" './Korpora/korean_petitions/petitions_2018-05',\n",
" './Korpora/korean_petitions/petitions_2018-09',\n",
" './Korpora/korean_petitions/petitions_2017-10',\n",
" './Korpora/korean_petitions/petitions_2017-08',\n",
" './Korpora/korean_petitions/petitions_2018-12',\n",
" './Korpora/korean_petitions/petitions_2018-03']"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ssdr7HBMX8Xo"
},
"source": [
"# 데이터 로딩\n",
"\n",
"- pandas로 `content` 부분만 읽어 파일로 만들기\n",
"- kss로 각 청원 게시글 내 문장 분리 \n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eS_NOWeWXB-x"
},
"source": [
"import pandas as pd\n",
"from tqdm.auto import tqdm"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"9faa5adcf806489793a7d149b1b42b65",
"aa687d20633e4f4983e8050973f179d6",
"a8fd2c7b26f94d3b92f8907c2107ff55",
"05b38759833a41d3bfb17475d3ad7f01",
"2c9648eda92f477f83700bd0db7e473c",
"98872de6b08d42329de1d89dac042d33",
"77b77cb3df4a42cf9ef7e2226d800121",
"0bf215884d134117acd1ab1bcf87d4b7",
"96830e13310f444bb887c797f95b0a6f",
"c12e47c0f60049c181b6eeac324e7c0d",
"089f6882066b46eaa9cc92fd043db7d9"
]
},
"id": "oXnOcNi8XDC8",
"outputId": "369b48b0-5b1c-4378-d62c-99e2ba20c58f"
},
"source": [
"df = pd.concat([pd.read_json(i, lines=True) for i in tqdm(dataset)])"
],
"execution_count": 7,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9faa5adcf806489793a7d149b1b42b65",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/20 [00:00<?, ?it/s]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 289
},
"id": "yH94XSQfXEHU",
"outputId": "fb7c84c2-7307-4544-adb6-49ce19066472"
},
"source": [
"df.head()"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>category</th>\n",
" <th>begin</th>\n",
" <th>end</th>\n",
" <th>content</th>\n",
" <th>num_agree</th>\n",
" <th>petition_idx</th>\n",
" <th>status</th>\n",
" <th>title</th>\n",
" <th>replies</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>일자리</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>포괄임금제 및 52시간 근로제 취지와 다른 악의적 적용 사례를 들어 폐지 및 보완을...</td>\n",
" <td>406</td>\n",
" <td>291662</td>\n",
" <td>청원종료</td>\n",
" <td>포괄임금제 폐지 요청 및 주52시간 관련</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>보건복지</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>의료사고로 의사의오진으로 병명이 바뀌어서 생명에 지장이 왔다갔다한 경우, 옮긴병원에...</td>\n",
" <td>14</td>\n",
" <td>291663</td>\n",
" <td>청원종료</td>\n",
" <td>억울한 의료 사고에 대한 내용</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>외교/통일/국방</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>대한민국 세계유일의 분단국가이자 휴전중인 상황에서 대한민국의 젊은 청춘들은 국방의 ...</td>\n",
" <td>62</td>\n",
" <td>291664</td>\n",
" <td>청원종료</td>\n",
" <td>형편없는 대한민국 국군 의료시스템 혁신적인 개선이 필요합니다.</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>육아/교육</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>어제 교육부의 2022년 수능개편안을 보았습니다. 정말 어처구니 없는 내용이었습니다...</td>\n",
" <td>12</td>\n",
" <td>291666</td>\n",
" <td>청원종료</td>\n",
" <td>수능 개편 반대합니다.</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>외교/통일/국방</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>국민의 4대의무중 국방의무가 있습니다 대한민국의 남자라면 대부분 국가의 부름을받습니...</td>\n",
" <td>18</td>\n",
" <td>291667</td>\n",
" <td>청원종료</td>\n",
" <td>국방의무를 다합시다. 대체복무 반대</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" category begin ... title replies\n",
"0 일자리 2018-07-01 ... 포괄임금제 폐지 요청 및 주52시간 관련 NaN\n",
"1 보건복지 2018-07-01 ... 억울한 의료 사고에 대한 내용 NaN\n",
"2 외교/통일/국방 2018-07-01 ... 형편없는 대한민국 국군 의료시스템 혁신적인 개선이 필요합니다. NaN\n",
"3 육아/교육 2018-07-01 ... 수능 개편 반대합니다. NaN\n",
"4 외교/통일/국방 2018-07-01 ... 국방의무를 다합시다. 대체복무 반대 NaN\n",
"\n",
"[5 rows x 9 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_EmRjLbOXFHX",
"outputId": "8e9eea74-7311-4841-fa1a-abf978790d1c"
},
"source": [
"len(df)"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"433631"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "PZAqRMwBXG1i"
},
"source": [
"agreed_df = df[df['num_agree'] > 1000]"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "O5eeT1UZXUUn",
"outputId": "58736a25-c0a5-4e05-8622-73a0ceacf9ca"
},
"source": [
"len(agreed_df)"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3704"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1T8OvUwDXVM3"
},
"source": [
"import re\n",
"import emoji\n",
"from soynlp.normalizer import repeat_normalize\n",
"\n",
"emojis = ''.join(emoji.UNICODE_EMOJI.keys())\n",
"pattern = re.compile(f'[^ .,?!/@$%~%·∼()\\x00-\\x7Fㄱ-ㅣ가-힣{emojis}]+')\n",
"url_pattern = re.compile(\n",
" r'https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)')\n",
"\n",
"def clean(x):\n",
" x = pattern.sub(' ', x)\n",
" x = url_pattern.sub('', x)\n",
" x = x.strip()\n",
" x = repeat_normalize(x, num_repeats=2)\n",
" return x"
],
"execution_count": 12,
"outputs": []
},
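{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `clean` (not part of the original notebook): the sample string below is an illustrative assumption, showing disallowed symbols and URLs being stripped and repeated characters collapsed."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Illustrative example only: check the cleaning behaviour on a made-up string\n",
"clean('청원에 동의합니다!!! ㅋㅋㅋㅋㅋ 자세한 내용은 https://example.com 참고 ★☆★')"
],
"execution_count": null,
"outputs": []
},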
{
"cell_type": "code",
"metadata": {
"id": "z59q9G4gXWVh"
},
"source": [
"contents = agreed_df['content'].map(clean).to_list()"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "V-VI74f3XXV-"
},
"source": [
"from kss import split_sentences"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fJtIEVBpY58Z"
},
"source": [
"import os"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 66,
"referenced_widgets": [
"7a0d1e56284b417c8daff6471e4dee81",
"e1df92b7c6eb468eb4a9c73aff0d828a",
"5796de99da2b473a8c73bfe696825dc7",
"7f97c030c7534310a7d47dbf422f56bd",
"930cd013a7bf45b2849b8963f8c7f7a4",
"5d0dd0a218ef4d75926dd84812d7f7ee",
"3ebf901c3e5d4f49b7ac8e65bac8f3be",
"98600149fa204fe8ae64cb288548f5c3",
"d89775056ac8482c9b52c638215036bb",
"b851d679ea904e4f874794aee2b9c741",
"b55df5f244f744389204e6446cd74e16"
]
},
"id": "i0RriVk2XY05",
"outputId": "fc1e1665-e4de-45d1-d1a8-31661bea2624"
},
"source": [
"if not os.path.exists('korean_petitions_safe.txt'):\n",
" with open('korean_petitions_safe.txt', 'w') as f:\n",
" for doc in tqdm(contents):\n",
" for line in split_sentences(doc, safe=True):\n",
" f.write(line+'\\n')\n",
" f.write('\\n')"
],
"execution_count": 15,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7a0d1e56284b417c8daff6471e4dee81",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/3704 [00:00<?, ?it/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"end\n"
],
"name": "stdout"
}
]
},
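{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at the generated training file (not part of the original notebook); this assumes `korean_petitions_safe.txt` was written by the cell above."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Count lines and peek at the first few sentences of the MLM training file\n",
"!wc -l korean_petitions_safe.txt\n",
"!head -n 5 korean_petitions_safe.txt"
],
"execution_count": null,
"outputs": []
},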
{
"cell_type": "markdown",
"metadata": {
"id": "9KWyr7_LYE8q"
},
"source": [
"# KcBERT-base MLM Finetune 학습하기 (GPU)\n",
"\n",
"- Huggingface Transformers에서 제공하는 `run_mlm.py` 파일을 이용해 KcBERT weight과 vocab을 이용해 MLM 학습 "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "q9Rta_FPXZ0L",
"outputId": "c457dd55-cadf-4b84-c9fc-881b80f5e61f"
},
"source": [
"!mkdir ./test-mlm"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
"mkdir: cannot create directory ‘./test-mlm’: File exists\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bumVyJhPXcZe",
"outputId": "d3301679-1405-4055-d4b3-fae3d2535173"
},
"source": [
"!wget -nc https://raw.githubusercontent.com/huggingface/transformers/4c32f9f26e6a84f0d9843fec8757e6ce640bb44e/examples/language-modeling/run_mlm.py"
],
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"text": [
"File ‘run_mlm.py’ already there; not retrieving.\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q5E3GLcOYs9y",
"outputId": "3601ca5d-b277-481f-92ee-078f636f3d79"
},
"source": [
"!nvidia-smi"
],
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"text": [
"Mon Mar 15 05:38:33 2021 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.56 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla V100-SXM2... Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 33C P0 24W / 300W | 0MiB / 16160MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ikFSD-VzXbVS",
"outputId": "c7ce54b2-68b0-4038-d430-9a6107145491"
},
"source": [
"!python run_mlm.py \\\n",
" --model_name_or_path beomi/kcbert-base \\\n",
" --train_file korean_petitions_safe.txt \\\n",
" --do_train \\\n",
" --output_dir ./test-mlm"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"2021-03-15 05:38:37.559396: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n",
"03/15/2021 05:38:38 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"03/15/2021 05:38:38 - INFO - __main__ - Training/evaluation parameters TrainingArguments(output_dir=./test-mlm, overwrite_output_dir=False, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Mar15_05-38-38_ae41adae085d, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-mlm, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=False, deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, _n_gpu=1)\n",
"03/15/2021 05:38:39 - WARNING - datasets.builder - Using custom data configuration default-b170ee81aef401ea\n",
"Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-b170ee81aef401ea/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57...\n",
"Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-b170ee81aef401ea/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57. Subsequent calls will reuse this data.\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:39,663 >> https://huggingface.co/beomi/kcbert-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpn3xpel08\n",
"Downloading: 100% 619/619 [00:00<00:00, 586kB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:39,932 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:39,933 >> creating metadata file for /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|configuration_utils.py:449] 2021-03-15 05:38:39,933 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|configuration_utils.py:485] 2021-03-15 05:38:39,933 >> Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"directionality\": \"bidi\",\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 300,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"pooler_fc_size\": 768,\n",
" \"pooler_num_attention_heads\": 12,\n",
" \"pooler_num_fc_layers\": 3,\n",
" \"pooler_size_per_head\": 128,\n",
" \"pooler_type\": \"first_token_transform\",\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.3.3\",\n",
" \"type_vocab_size\": 2,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 30000\n",
"}\n",
"\n",
"[INFO|configuration_utils.py:449] 2021-03-15 05:38:40,201 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|configuration_utils.py:485] 2021-03-15 05:38:40,202 >> Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"directionality\": \"bidi\",\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 300,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"pooler_fc_size\": 768,\n",
" \"pooler_num_attention_heads\": 12,\n",
" \"pooler_num_fc_layers\": 3,\n",
" \"pooler_size_per_head\": 128,\n",
" \"pooler_type\": \"first_token_transform\",\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.3.3\",\n",
" \"type_vocab_size\": 2,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 30000\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1688] 2021-03-15 05:38:40,202 >> Model name 'beomi/kcbert-base' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). Assuming 'beomi/kcbert-base' is a path, a model identifier, or url to a directory containing tokenizer files.\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:40,472 >> https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpn5ot4e1n\n",
"Downloading: 100% 250k/250k [00:00<00:00, 736kB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:41,081 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:41,081 >> creating metadata file for /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:42,152 >> https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpw_x2bo_z\n",
"Downloading: 100% 49.0/49.0 [00:00<00:00, 43.4kB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:42,422 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:42,422 >> creating metadata file for /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/added_tokens.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/special_tokens_map.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,423 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:42,739 >> https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpv6awhvr7\n",
"Downloading: 100% 438M/438M [00:15<00:00, 28.2MB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:58,916 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:58,917 >> creating metadata file for /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n",
"[INFO|modeling_utils.py:1027] 2021-03-15 05:38:58,917 >> loading weights file https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n",
"[WARNING|modeling_utils.py:1135] 2021-03-15 05:39:02,868 >> Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"[INFO|modeling_utils.py:1152] 2021-03-15 05:39:02,868 >> All the weights of BertForMaskedLM were initialized from the model checkpoint at beomi/kcbert-base.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.\n",
" 0% 0/71 [00:00<?, ?ba/s][WARNING|tokenization_utils_base.py:3213] 2021-03-15 05:39:02,958 >> Token indices sequence length is longer than the specified maximum sequence length for this model (329 > 300). Running this sequence through the model will result in indexing errors\n",
"100% 71/71 [00:03<00:00, 19.30ba/s]\n",
"100% 71/71 [00:20<00:00, 3.43ba/s]\n",
"[INFO|trainer.py:432] 2021-03-15 05:39:33,825 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.\n",
"[INFO|trainer.py:837] 2021-03-15 05:39:33,828 >> ***** Running training *****\n",
"[INFO|trainer.py:838] 2021-03-15 05:39:33,828 >> Num examples = 7159\n",
"[INFO|trainer.py:839] 2021-03-15 05:39:33,828 >> Num Epochs = 3\n",
"[INFO|trainer.py:840] 2021-03-15 05:39:33,828 >> Instantaneous batch size per device = 8\n",
"[INFO|trainer.py:841] 2021-03-15 05:39:33,828 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n",
"[INFO|trainer.py:842] 2021-03-15 05:39:33,828 >> Gradient Accumulation steps = 1\n",
"[INFO|trainer.py:843] 2021-03-15 05:39:33,828 >> Total optimization steps = 2685\n",
"{'loss': 2.7071, 'learning_rate': 4.068901303538175e-05, 'epoch': 0.56}\n",
" 19% 500/2685 [01:39<07:14, 5.03it/s][INFO|trainer.py:1408] 2021-03-15 05:41:13,197 >> Saving model checkpoint to ./test-mlm/checkpoint-500\n",
"[INFO|configuration_utils.py:304] 2021-03-15 05:41:13,199 >> Configuration saved in ./test-mlm/checkpoint-500/config.json\n",
"[INFO|modeling_utils.py:817] 2021-03-15 05:41:14,457 >> Model weights saved in ./test-mlm/checkpoint-500/pytorch_model.bin\n",
"{'loss': 2.5856, 'learning_rate': 3.13780260707635e-05, 'epoch': 1.12}\n",
" 37% 1000/2685 [03:23<05:34, 5.03it/s][INFO|trainer.py:1408] 2021-03-15 05:42:57,398 >> Saving model checkpoint to ./test-mlm/checkpoint-1000\n",
"[INFO|configuration_utils.py:304] 2021-03-15 05:42:57,399 >> Configuration saved in ./test-mlm/checkpoint-1000/config.json\n",
"[INFO|modeling_utils.py:817] 2021-03-15 05:42:58,615 >> Model weights saved in ./test-mlm/checkpoint-1000/pytorch_model.bin\n",
"{'loss': 2.5064, 'learning_rate': 2.206703910614525e-05, 'epoch': 1.68}\n",
" 56% 1500/2685 [05:07<03:53, 5.07it/s][INFO|trainer.py:1408] 2021-03-15 05:44:41,722 >> Saving model checkpoint to ./test-mlm/checkpoint-1500\n",
"[INFO|configuration_utils.py:304] 2021-03-15 05:44:41,723 >> Configuration saved in ./test-mlm/checkpoint-1500/config.json\n",
"[INFO|modeling_utils.py:817] 2021-03-15 05:44:42,948 >> Model weights saved in ./test-mlm/checkpoint-1500/pytorch_model.bin\n",
" 61% 1634/2685 [05:39<03:29, 5.02it/s]"
],
"name": "stdout"
}
]
},
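{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using the fine-tuned model (sketch)\n",
"\n",
"- A minimal sketch, not part of the original notebook: load the MLM-finetuned weights from `./test-mlm` with the Transformers `fill-mask` pipeline and predict a masked token. The example sentence is an illustrative assumption, and this presumes the training run above has finished writing its final checkpoint to `./test-mlm`."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"from transformers import pipeline\n",
"\n",
"# Load the finetuned weights; reuse the original KcBERT tokenizer in case\n",
"# run_mlm.py did not save tokenizer files alongside the model.\n",
"fill_mask = pipeline('fill-mask', model='./test-mlm', tokenizer='beomi/kcbert-base')\n",
"\n",
"# Illustrative masked sentence in the petition domain\n",
"fill_mask('국민 여러분의 [MASK]을 부탁드립니다.')"
],
"execution_count": null,
"outputs": []
},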
{
"cell_type": "code",
"metadata": {
"id": "fDq7EV6XYTho"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}