2021.03.15. KcBERT MLM Finetune with Petition Dataset
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "2021.03.15. KcBERT MLM Finetune with Petition Dataset",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"machine_shape": "hm",
"authorship_tag": "ABX9TyPkJnRAn/rZZxXG+EirhdSe",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"9faa5adcf806489793a7d149b1b42b65": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_aa687d20633e4f4983e8050973f179d6",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_a8fd2c7b26f94d3b92f8907c2107ff55",
"IPY_MODEL_05b38759833a41d3bfb17475d3ad7f01",
"IPY_MODEL_2c9648eda92f477f83700bd0db7e473c"
]
}
},
"aa687d20633e4f4983e8050973f179d6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a8fd2c7b26f94d3b92f8907c2107ff55": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_98872de6b08d42329de1d89dac042d33",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_77b77cb3df4a42cf9ef7e2226d800121"
}
},
"05b38759833a41d3bfb17475d3ad7f01": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_0bf215884d134117acd1ab1bcf87d4b7",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 20,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 20,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_96830e13310f444bb887c797f95b0a6f"
}
},
"2c9648eda92f477f83700bd0db7e473c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c12e47c0f60049c181b6eeac324e7c0d",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 20/20 [00:09<00:00, 2.55it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_089f6882066b46eaa9cc92fd043db7d9"
}
},
"98872de6b08d42329de1d89dac042d33": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"77b77cb3df4a42cf9ef7e2226d800121": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"0bf215884d134117acd1ab1bcf87d4b7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"96830e13310f444bb887c797f95b0a6f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c12e47c0f60049c181b6eeac324e7c0d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"089f6882066b46eaa9cc92fd043db7d9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7a0d1e56284b417c8daff6471e4dee81": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_e1df92b7c6eb468eb4a9c73aff0d828a",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_5796de99da2b473a8c73bfe696825dc7",
"IPY_MODEL_7f97c030c7534310a7d47dbf422f56bd",
"IPY_MODEL_930cd013a7bf45b2849b8963f8c7f7a4"
]
}
},
"e1df92b7c6eb468eb4a9c73aff0d828a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"5796de99da2b473a8c73bfe696825dc7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_5d0dd0a218ef4d75926dd84812d7f7ee",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_3ebf901c3e5d4f49b7ac8e65bac8f3be"
}
},
"7f97c030c7534310a7d47dbf422f56bd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_98600149fa204fe8ae64cb288548f5c3",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 3704,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 3704,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d89775056ac8482c9b52c638215036bb"
}
},
"930cd013a7bf45b2849b8963f8c7f7a4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b851d679ea904e4f874794aee2b9c741",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 3704/3704 [01:24<00:00, 57.08it/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_b55df5f244f744389204e6446cd74e16"
}
},
"5d0dd0a218ef4d75926dd84812d7f7ee": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"3ebf901c3e5d4f49b7ac8e65bac8f3be": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"98600149fa204fe8ae64cb288548f5c3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"d89775056ac8482c9b52c638215036bb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b851d679ea904e4f874794aee2b9c741": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"b55df5f244f744389204e6446cd74e16": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/Beomi/cc87708b978a4f4ed87793762c019866/2021-03-15-kcbert-mlm-finetune-with-petition-dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XxaDw3JIXrDf"
},
"source": [
"# 필요한 패키지 설치\n",
"\n",
"- Korpora: 데이터셋 다운로드\n",
"- emoji: 이모지코드\n",
"- soynlp: Preprocesisng\n",
"- kss: 한국어 문장 분리기\n",
"- transformers: MLM 학습 및 데이터셋\n",
" - datasets\n",
" - protobuf\n",
" - sentencepiece"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lqA2SU2sWqjR"
},
"source": [
"!pip install -q Korpora emoji soynlp kss transformers \"datasets >= 1.1.3\" \"sentencepiece != 0.1.92\" protobuf"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0nAbFcVFXzV4"
},
"source": [
"# 예시용 데이터셋 다운로드\n",
"\n",
"- 여기서는 Korean petitions dataset(국민청원 데이터셋)을 사용\n",
"- 전체 중 동의 수가 1000건 초과인 본문만 사용"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3Z7ji3QnW7JG"
},
"source": [
"from Korpora import Korpora"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "erTvWvZCW91J",
"outputId": "b673e35a-14f2-4acd-9a4a-f9c01bd22c42"
},
"source": [
"Korpora.fetch('korean_petitions', root_dir='./Korpora')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-08\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-09\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-10\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-11\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-12\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-01\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-02\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-03\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-04\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-05\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-06\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-07\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-08\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-09\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-10\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-11\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-12\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-01\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-02\n",
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-03\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Gk4EXcX3W-LO"
},
"source": [
"from glob import glob"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XPQDC74FXAxb",
"outputId": "f87e9593-81c5-4104-eeed-37ad5917c290"
},
"source": [
"dataset = glob('./Korpora/korean_petitions/petitions*')\n",
"dataset"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['./Korpora/korean_petitions/petitions_2018-07',\n",
" './Korpora/korean_petitions/petitions_2019-01',\n",
" './Korpora/korean_petitions/petitions_2017-11',\n",
" './Korpora/korean_petitions/petitions_2018-01',\n",
" './Korpora/korean_petitions/petitions_2017-09',\n",
" './Korpora/korean_petitions/petitions_2018-06',\n",
" './Korpora/korean_petitions/petitions_2018-10',\n",
" './Korpora/korean_petitions/petitions_2018-11',\n",
" './Korpora/korean_petitions/petitions_2017-12',\n",
" './Korpora/korean_petitions/petitions_2019-02',\n",
" './Korpora/korean_petitions/petitions_2018-02',\n",
" './Korpora/korean_petitions/petitions_2018-08',\n",
" './Korpora/korean_petitions/petitions_2018-04',\n",
" './Korpora/korean_petitions/petitions_2019-03',\n",
" './Korpora/korean_petitions/petitions_2018-05',\n",
" './Korpora/korean_petitions/petitions_2018-09',\n",
" './Korpora/korean_petitions/petitions_2017-10',\n",
" './Korpora/korean_petitions/petitions_2017-08',\n",
" './Korpora/korean_petitions/petitions_2018-12',\n",
" './Korpora/korean_petitions/petitions_2018-03']"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ssdr7HBMX8Xo"
},
"source": [
"# 데이터 로딩\n",
"\n",
"- pandas로 `content` 부분만 읽어 파일로 만들기\n",
"- kss로 각 청원 게시글 내 문장 분리 \n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eS_NOWeWXB-x"
},
"source": [
"import pandas as pd\n",
"from tqdm.auto import tqdm"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"9faa5adcf806489793a7d149b1b42b65",
"aa687d20633e4f4983e8050973f179d6",
"a8fd2c7b26f94d3b92f8907c2107ff55",
"05b38759833a41d3bfb17475d3ad7f01",
"2c9648eda92f477f83700bd0db7e473c",
"98872de6b08d42329de1d89dac042d33",
"77b77cb3df4a42cf9ef7e2226d800121",
"0bf215884d134117acd1ab1bcf87d4b7",
"96830e13310f444bb887c797f95b0a6f",
"c12e47c0f60049c181b6eeac324e7c0d",
"089f6882066b46eaa9cc92fd043db7d9"
]
},
"id": "oXnOcNi8XDC8",
"outputId": "369b48b0-5b1c-4378-d62c-99e2ba20c58f"
},
"source": [
"df = pd.concat([pd.read_json(i, lines=True) for i in tqdm(dataset)])"
],
"execution_count": 7,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9faa5adcf806489793a7d149b1b42b65",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/20 [00:00<?, ?it/s]"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 289
},
"id": "yH94XSQfXEHU",
"outputId": "fb7c84c2-7307-4544-adb6-49ce19066472"
},
"source": [
"df.head()"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>category</th>\n",
" <th>begin</th>\n",
" <th>end</th>\n",
" <th>content</th>\n",
" <th>num_agree</th>\n",
" <th>petition_idx</th>\n",
" <th>status</th>\n",
" <th>title</th>\n",
" <th>replies</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>일자리</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>포괄임금제 및 52시간 근로제 취지와 다른 악의적 적용 사례를 들어 폐지 및 보완을...</td>\n",
" <td>406</td>\n",
" <td>291662</td>\n",
" <td>청원종료</td>\n",
" <td>포괄임금제 폐지 요청 및 주52시간 관련</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>보건복지</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>의료사고로 의사의오진으로 병명이 바뀌어서 생명에 지장이 왔다갔다한 경우, 옮긴병원에...</td>\n",
" <td>14</td>\n",
" <td>291663</td>\n",
" <td>청원종료</td>\n",
" <td>억울한 의료 사고에 대한 내용</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>외교/통일/국방</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>대한민국 세계유일의 분단국가이자 휴전중인 상황에서 대한민국의 젊은 청춘들은 국방의 ...</td>\n",
" <td>62</td>\n",
" <td>291664</td>\n",
" <td>청원종료</td>\n",
" <td>형편없는 대한민국 국군 의료시스템 혁신적인 개선이 필요합니다.</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>육아/교육</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>어제 교육부의 2022년 수능개편안을 보았습니다. 정말 어처구니 없는 내용이었습니다...</td>\n",
" <td>12</td>\n",
" <td>291666</td>\n",
" <td>청원종료</td>\n",
" <td>수능 개편 반대합니다.</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>외교/통일/국방</td>\n",
" <td>2018-07-01</td>\n",
" <td>2018-07-31</td>\n",
" <td>국민의 4대의무중 국방의무가 있습니다 대한민국의 남자라면 대부분 국가의 부름을받습니...</td>\n",
" <td>18</td>\n",
" <td>291667</td>\n",
" <td>청원종료</td>\n",
" <td>국방의무를 다합시다. 대체복무 반대</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" category begin ... title replies\n",
"0 일자리 2018-07-01 ... 포괄임금제 폐지 요청 및 주52시간 관련 NaN\n",
"1 보건복지 2018-07-01 ... 억울한 의료 사고에 대한 내용 NaN\n",
"2 외교/통일/국방 2018-07-01 ... 형편없는 대한민국 국군 의료시스템 혁신적인 개선이 필요합니다. NaN\n",
"3 육아/교육 2018-07-01 ... 수능 개편 반대합니다. NaN\n",
"4 외교/통일/국방 2018-07-01 ... 국방의무를 다합시다. 대체복무 반대 NaN\n",
"\n",
"[5 rows x 9 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_EmRjLbOXFHX",
"outputId": "8e9eea74-7311-4841-fa1a-abf978790d1c"
},
"source": [
"len(df)"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"433631"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "PZAqRMwBXG1i"
},
"source": [
"agreed_df = df[df['num_agree'] > 1000]"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "O5eeT1UZXUUn",
"outputId": "58736a25-c0a5-4e05-8622-73a0ceacf9ca"
},
"source": [
"len(agreed_df)"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3704"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1T8OvUwDXVM3"
},
"source": [
"import re\n",
"import emoji\n",
"from soynlp.normalizer import repeat_normalize\n",
"\n",
"emojis = ''.join(emoji.UNICODE_EMOJI.keys())\n",
"pattern = re.compile(f'[^ .,?!/@$%~%·∼()\\x00-\\x7Fㄱ-ㅣ가-힣{emojis}]+')\n",
"url_pattern = re.compile(\n",
" r'https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)')\n",
"\n",
"def clean(x):\n",
" x = pattern.sub(' ', x)\n",
" x = url_pattern.sub('', x)\n",
" x = x.strip()\n",
" x = repeat_normalize(x, num_repeats=2)\n",
" return x"
],
"execution_count": 12,
"outputs": []
},
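{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `clean` (not part of the original notebook): the sample string below is an illustrative assumption, showing disallowed symbols and URLs being stripped and repeated characters collapsed."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Illustrative example only: check the cleaning behaviour on a made-up string\n",
"clean('청원에 동의합니다!!! ㅋㅋㅋㅋㅋ 자세한 내용은 https://example.com 참고 ★☆★')"
],
"execution_count": null,
"outputs": []
},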
{
"cell_type": "code",
"metadata": {
"id": "z59q9G4gXWVh"
},
"source": [
"contents = agreed_df['content'].map(clean).to_list()"
],
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "V-VI74f3XXV-"
},
"source": [
"from kss import split_sentences"
],
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fJtIEVBpY58Z"
},
"source": [
"import os"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 66,
"referenced_widgets": [
"7a0d1e56284b417c8daff6471e4dee81",
"e1df92b7c6eb468eb4a9c73aff0d828a",
"5796de99da2b473a8c73bfe696825dc7",
"7f97c030c7534310a7d47dbf422f56bd",
"930cd013a7bf45b2849b8963f8c7f7a4",
"5d0dd0a218ef4d75926dd84812d7f7ee",
"3ebf901c3e5d4f49b7ac8e65bac8f3be",
"98600149fa204fe8ae64cb288548f5c3",
"d89775056ac8482c9b52c638215036bb",
"b851d679ea904e4f874794aee2b9c741",
"b55df5f244f744389204e6446cd74e16"
]
},
"id": "i0RriVk2XY05",
"outputId": "fc1e1665-e4de-45d1-d1a8-31661bea2624"
},
"source": [
"if not os.path.exists('korean_petitions_safe.txt'):\n",
" with open('korean_petitions_safe.txt', 'w') as f:\n",
" for doc in tqdm(contents):\n",
" for line in split_sentences(doc, safe=True):\n",
" f.write(line+'\\n')\n",
" f.write('\\n')"
],
"execution_count": 15,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7a0d1e56284b417c8daff6471e4dee81",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
" 0%| | 0/3704 [00:00<?, ?it/s]"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"end\n"
],
"name": "stdout"
}
]
},
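{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at the generated training file (not part of the original notebook); this assumes `korean_petitions_safe.txt` was written by the cell above."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Count lines and peek at the first few sentences of the MLM training file\n",
"!wc -l korean_petitions_safe.txt\n",
"!head -n 5 korean_petitions_safe.txt"
],
"execution_count": null,
"outputs": []
},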
{
"cell_type": "markdown",
"metadata": {
"id": "9KWyr7_LYE8q"
},
"source": [
"# KcBERT-base MLM Finetune 학습하기 (GPU)\n",
"\n",
"- Huggingface Transformers에서 제공하는 `run_mlm.py` 파일을 이용해 KcBERT weight과 vocab을 이용해 MLM 학습 "
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "q9Rta_FPXZ0L",
"outputId": "c457dd55-cadf-4b84-c9fc-881b80f5e61f"
},
"source": [
"!mkdir ./test-mlm"
],
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": [
"mkdir: cannot create directory ‘./test-mlm’: File exists\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bumVyJhPXcZe",
"outputId": "d3301679-1405-4055-d4b3-fae3d2535173"
},
"source": [
"!wget -nc https://raw.githubusercontent.com/huggingface/transformers/4c32f9f26e6a84f0d9843fec8757e6ce640bb44e/examples/language-modeling/run_mlm.py"
],
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"text": [
"File ‘run_mlm.py’ already there; not retrieving.\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Q5E3GLcOYs9y",
"outputId": "3601ca5d-b277-481f-92ee-078f636f3d79"
},
"source": [
"!nvidia-smi"
],
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"text": [
"Mon Mar 15 05:38:33 2021 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.56 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla V100-SXM2... Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 33C P0 24W / 300W | 0MiB / 16160MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ikFSD-VzXbVS",
"outputId": "c7ce54b2-68b0-4038-d430-9a6107145491"
},
"source": [
"!python run_mlm.py \\\n",
" --model_name_or_path beomi/kcbert-base \\\n",
" --train_file korean_petitions_safe.txt \\\n",
" --do_train \\\n",
" --output_dir ./test-mlm"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"2021-03-15 05:38:37.559396: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n",
"03/15/2021 05:38:38 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n",
"03/15/2021 05:38:38 - INFO - __main__ - Training/evaluation parameters TrainingArguments(output_dir=./test-mlm, overwrite_output_dir=False, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Mar15_05-38-38_ae41adae085d, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-mlm, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=False, deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, _n_gpu=1)\n",
"03/15/2021 05:38:39 - WARNING - datasets.builder - Using custom data configuration default-b170ee81aef401ea\n",
"Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-b170ee81aef401ea/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57...\n",
"Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-b170ee81aef401ea/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57. Subsequent calls will reuse this data.\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:39,663 >> https://huggingface.co/beomi/kcbert-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpn3xpel08\n",
"Downloading: 100% 619/619 [00:00<00:00, 586kB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:39,932 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:39,933 >> creating metadata file for /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|configuration_utils.py:449] 2021-03-15 05:38:39,933 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|configuration_utils.py:485] 2021-03-15 05:38:39,933 >> Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"directionality\": \"bidi\",\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 300,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"pooler_fc_size\": 768,\n",
" \"pooler_num_attention_heads\": 12,\n",
" \"pooler_num_fc_layers\": 3,\n",
" \"pooler_size_per_head\": 128,\n",
" \"pooler_type\": \"first_token_transform\",\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.3.3\",\n",
" \"type_vocab_size\": 2,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 30000\n",
"}\n",
"\n",
"[INFO|configuration_utils.py:449] 2021-03-15 05:38:40,201 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n",
"[INFO|configuration_utils.py:485] 2021-03-15 05:38:40,202 >> Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"directionality\": \"bidi\",\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 300,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"pooler_fc_size\": 768,\n",
" \"pooler_num_attention_heads\": 12,\n",
" \"pooler_num_fc_layers\": 3,\n",
" \"pooler_size_per_head\": 128,\n",
" \"pooler_type\": \"first_token_transform\",\n",
" \"position_embedding_type\": \"absolute\",\n",
" \"transformers_version\": \"4.3.3\",\n",
" \"type_vocab_size\": 2,\n",
" \"use_cache\": true,\n",
" \"vocab_size\": 30000\n",
"}\n",
"\n",
"[INFO|tokenization_utils_base.py:1688] 2021-03-15 05:38:40,202 >> Model name 'beomi/kcbert-base' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). Assuming 'beomi/kcbert-base' is a path, a model identifier, or url to a directory containing tokenizer files.\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:40,472 >> https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpn5ot4e1n\n",
"Downloading: 100% 250k/250k [00:00<00:00, 736kB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:41,081 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:41,081 >> creating metadata file for /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:42,152 >> https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpw_x2bo_z\n",
"Downloading: 100% 49.0/49.0 [00:00<00:00, 43.4kB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:42,422 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:42,422 >> creating metadata file for /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/added_tokens.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/special_tokens_map.json from cache at None\n",
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,423 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n",
"[INFO|file_utils.py:1302] 2021-03-15 05:38:42,739 >> https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpv6awhvr7\n",
"Downloading: 100% 438M/438M [00:15<00:00, 28.2MB/s]\n",
"[INFO|file_utils.py:1306] 2021-03-15 05:38:58,916 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n",
"[INFO|file_utils.py:1309] 2021-03-15 05:38:58,917 >> creating metadata file for /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n",
"[INFO|modeling_utils.py:1027] 2021-03-15 05:38:58,917 >> loading weights file https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n",
"[WARNING|modeling_utils.py:1135] 2021-03-15 05:39:02,868 >> Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"[INFO|modeling_utils.py:1152] 2021-03-15 05:39:02,868 >> All the weights of BertForMaskedLM were initialized from the model checkpoint at beomi/kcbert-base.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.\n",
" 0% 0/71 [00:00<?, ?ba/s][WARNING|tokenization_utils_base.py:3213] 2021-03-15 05:39:02,958 >> Token indices sequence length is longer than the specified maximum sequence length for this model (329 > 300). Running this sequence through the model will result in indexing errors\n",
"100% 71/71 [00:03<00:00, 19.30ba/s]\n",
"100% 71/71 [00:20<00:00, 3.43ba/s]\n",
"[INFO|trainer.py:432] 2021-03-15 05:39:33,825 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.\n",
"[INFO|trainer.py:837] 2021-03-15 05:39:33,828 >> ***** Running training *****\n",
"[INFO|trainer.py:838] 2021-03-15 05:39:33,828 >> Num examples = 7159\n",
"[INFO|trainer.py:839] 2021-03-15 05:39:33,828 >> Num Epochs = 3\n",
"[INFO|trainer.py:840] 2021-03-15 05:39:33,828 >> Instantaneous batch size per device = 8\n",
"[INFO|trainer.py:841] 2021-03-15 05:39:33,828 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n",
"[INFO|trainer.py:842] 2021-03-15 05:39:33,828 >> Gradient Accumulation steps = 1\n",
"[INFO|trainer.py:843] 2021-03-15 05:39:33,828 >> Total optimization steps = 2685\n",
"{'loss': 2.7071, 'learning_rate': 4.068901303538175e-05, 'epoch': 0.56}\n",
" 19% 500/2685 [01:39<07:14, 5.03it/s][INFO|trainer.py:1408] 2021-03-15 05:41:13,197 >> Saving model checkpoint to ./test-mlm/checkpoint-500\n",
"[INFO|configuration_utils.py:304] 2021-03-15 05:41:13,199 >> Configuration saved in ./test-mlm/checkpoint-500/config.json\n",
"[INFO|modeling_utils.py:817] 2021-03-15 05:41:14,457 >> Model weights saved in ./test-mlm/checkpoint-500/pytorch_model.bin\n",
"{'loss': 2.5856, 'learning_rate': 3.13780260707635e-05, 'epoch': 1.12}\n",
" 37% 1000/2685 [03:23<05:34, 5.03it/s][INFO|trainer.py:1408] 2021-03-15 05:42:57,398 >> Saving model checkpoint to ./test-mlm/checkpoint-1000\n",
"[INFO|configuration_utils.py:304] 2021-03-15 05:42:57,399 >> Configuration saved in ./test-mlm/checkpoint-1000/config.json\n",
"[INFO|modeling_utils.py:817] 2021-03-15 05:42:58,615 >> Model weights saved in ./test-mlm/checkpoint-1000/pytorch_model.bin\n",
"{'loss': 2.5064, 'learning_rate': 2.206703910614525e-05, 'epoch': 1.68}\n",
" 56% 1500/2685 [05:07<03:53, 5.07it/s][INFO|trainer.py:1408] 2021-03-15 05:44:41,722 >> Saving model checkpoint to ./test-mlm/checkpoint-1500\n",
"[INFO|configuration_utils.py:304] 2021-03-15 05:44:41,723 >> Configuration saved in ./test-mlm/checkpoint-1500/config.json\n",
"[INFO|modeling_utils.py:817] 2021-03-15 05:44:42,948 >> Model weights saved in ./test-mlm/checkpoint-1500/pytorch_model.bin\n",
" 61% 1634/2685 [05:39<03:29, 5.02it/s]"
],
"name": "stdout"
}
]
},
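{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using the fine-tuned model (sketch)\n",
"\n",
"- A minimal sketch, not part of the original notebook: load the MLM-finetuned weights from `./test-mlm` with the Transformers `fill-mask` pipeline and predict a masked token. The example sentence is an illustrative assumption, and this presumes the training run above has finished writing its final checkpoint to `./test-mlm`."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"from transformers import pipeline\n",
"\n",
"# Load the finetuned weights; reuse the original KcBERT tokenizer in case\n",
"# run_mlm.py did not save tokenizer files alongside the model.\n",
"fill_mask = pipeline('fill-mask', model='./test-mlm', tokenizer='beomi/kcbert-base')\n",
"\n",
"# Illustrative masked sentence in the petition domain\n",
"fill_mask('국민 여러분의 [MASK]을 부탁드립니다.')"
],
"execution_count": null,
"outputs": []
},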
{
"cell_type": "code",
"metadata": {
"id": "fDq7EV6XYTho"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}