Last active
August 16, 2021 05:42
-
-
Save Beomi/972c6442a9c15a22dfd1903d0bb0f577 to your computer and use it in GitHub Desktop.
2021.03.15. KcBERT MLM Finetune with Petition Dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "2021.03.15. KcBERT MLM Finetune with Petition Dataset", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"machine_shape": "hm", | |
"authorship_tag": "ABX9TyPThxv5ZZ2ERKRzbL6DStPo", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU", | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"24b919ba13f34f6aa148dc6435e450d5": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_e184b36746744b1ea8faf0fece80a377", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_7f1d80c730ca40f1ab8609986f9b7079", | |
"IPY_MODEL_72aad332eb6b4534b8d9bce460905849", | |
"IPY_MODEL_41e24ce51d304a89bffe7b7be661ba54" | |
] | |
} | |
}, | |
"e184b36746744b1ea8faf0fece80a377": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"7f1d80c730ca40f1ab8609986f9b7079": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_100c672a20e7416bacbdecf6a7417111", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": "100%", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_ece5002e3e064a73ba3be461907b4a9e" | |
} | |
}, | |
"72aad332eb6b4534b8d9bce460905849": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_ef854ae062a3411aa407fc8ef36c2fa9", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 20, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 20, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_e69ee077a90e4d93852685ec424f6b1f" | |
} | |
}, | |
"41e24ce51d304a89bffe7b7be661ba54": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_12837fed7e2a46fea6eb1b2175c35c2f", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 20/20 [00:09<00:00, 2.08it/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_00cebb6154394ee884baf32814059c1e" | |
} | |
}, | |
"100c672a20e7416bacbdecf6a7417111": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"ece5002e3e064a73ba3be461907b4a9e": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"ef854ae062a3411aa407fc8ef36c2fa9": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"e69ee077a90e4d93852685ec424f6b1f": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"12837fed7e2a46fea6eb1b2175c35c2f": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"00cebb6154394ee884baf32814059c1e": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"a619d5c077494c628bfd589032f0057b": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_b687225b255a4b0a9945fb5a5b219f48", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_ec0bea3e70644604b3e8253d0d2ae528", | |
"IPY_MODEL_bb0da81e63fb4b1b8f29d0b80d86248d", | |
"IPY_MODEL_211d736187bf49dd930bf6c61e3619b6" | |
] | |
} | |
}, | |
"b687225b255a4b0a9945fb5a5b219f48": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"ec0bea3e70644604b3e8253d0d2ae528": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_9e88a788fead47a1a90af692d94942ff", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 0%", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_3b7c4bc39fca494dba816c92c0f32cc8" | |
} | |
}, | |
"bb0da81e63fb4b1b8f29d0b80d86248d": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_fb9c04093d6d4850b5a8f35da73ecb56", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "danger", | |
"max": 3704, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 13, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_91e2e116a06849079b045fe5e916c54f" | |
} | |
}, | |
"211d736187bf49dd930bf6c61e3619b6": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_36bc81bec3644b8b99f98524eb7492b1", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 13/3704 [00:01<13:01, 4.72it/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_7713dc447df04ebe8c42fbe717a3767b" | |
} | |
}, | |
"9e88a788fead47a1a90af692d94942ff": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"3b7c4bc39fca494dba816c92c0f32cc8": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"fb9c04093d6d4850b5a8f35da73ecb56": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"91e2e116a06849079b045fe5e916c54f": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"36bc81bec3644b8b99f98524eb7492b1": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"model_module_version": "1.5.0", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"7713dc447df04ebe8c42fbe717a3767b": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"model_module_version": "1.2.0", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
} | |
} | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Beomi/972c6442a9c15a22dfd1903d0bb0f577/2021-03-15-kcbert-mlm-finetune-with-petition-dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XxaDw3JIXrDf" | |
}, | |
"source": [ | |
"# 필요한 패키지 설치\n", | |
"\n", | |
"- Korpora: 데이터셋 다운로드\n", | |
"- emoji: 이모지코드\n", | |
"- soynlp: Preprocesisng\n", | |
"- kss: 한국어 문장 분리기\n", | |
"- transformers: MLM 학습 및 데이터셋\n", | |
" - datasets\n", | |
" - protobuf\n", | |
" - sentencepiece" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "lqA2SU2sWqjR", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "a592d3c8-0d98-4871-ba23-41bf6bd95c12" | |
}, | |
"source": [ | |
"!pip install -q Korpora emoji soynlp \"kss<2.6\" transformers \"datasets >= 1.1.3\" \"sentencepiece != 0.1.92\" protobuf" | |
], | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\u001b[?25l\r\u001b[K |█████ | 10 kB 31.3 MB/s eta 0:00:01\r\u001b[K |██████████ | 20 kB 18.5 MB/s eta 0:00:01\r\u001b[K |███████████████ | 30 kB 11.0 MB/s eta 0:00:01\r\u001b[K |███████████████████▉ | 40 kB 8.8 MB/s eta 0:00:01\r\u001b[K |████████████████████████▉ | 51 kB 5.0 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▉ | 61 kB 5.5 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 65 kB 2.5 MB/s \n", | |
"\u001b[?25h" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "0nAbFcVFXzV4" | |
}, | |
"source": [ | |
"# 예시용 데이터셋 다운로드\n", | |
"\n", | |
"- 여기서는 Korean petitions dataset(국민청원 데이터셋)을 사용\n", | |
"- 전체 중 동의 수가 1000건 초과인 본문만 사용" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "3Z7ji3QnW7JG" | |
}, | |
"source": [ | |
"from Korpora import Korpora" | |
], | |
"execution_count": 27, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "erTvWvZCW91J", | |
"outputId": "54739e2f-aa45-48ab-bcf4-859005a8ed0b" | |
}, | |
"source": [ | |
"Korpora.fetch('korean_petitions', root_dir='./Korpora')" | |
], | |
"execution_count": 28, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-08\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-09\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-10\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-11\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-12\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-01\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-02\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-03\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-04\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-05\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-06\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-07\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-08\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-09\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-10\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-11\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-12\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-01\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-02\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-03\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Gk4EXcX3W-LO" | |
}, | |
"source": [ | |
"from glob import glob" | |
], | |
"execution_count": 29, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "XPQDC74FXAxb", | |
"outputId": "85fbe915-26c5-45f0-d91a-1a3835d00e21" | |
}, | |
"source": [ | |
"dataset = glob('./Korpora/korean_petitions/petitions*')\n", | |
"dataset" | |
], | |
"execution_count": 30, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['./Korpora/korean_petitions/petitions_2018-06',\n", | |
" './Korpora/korean_petitions/petitions_2018-05',\n", | |
" './Korpora/korean_petitions/petitions_2018-01',\n", | |
" './Korpora/korean_petitions/petitions_2017-11',\n", | |
" './Korpora/korean_petitions/petitions_2017-09',\n", | |
" './Korpora/korean_petitions/petitions_2018-10',\n", | |
" './Korpora/korean_petitions/petitions_2019-03',\n", | |
" './Korpora/korean_petitions/petitions_2019-02',\n", | |
" './Korpora/korean_petitions/petitions_2018-03',\n", | |
" './Korpora/korean_petitions/petitions_2017-08',\n", | |
" './Korpora/korean_petitions/petitions_2019-01',\n", | |
" './Korpora/korean_petitions/petitions_2018-11',\n", | |
" './Korpora/korean_petitions/petitions_2018-09',\n", | |
" './Korpora/korean_petitions/petitions_2018-07',\n", | |
" './Korpora/korean_petitions/petitions_2017-10',\n", | |
" './Korpora/korean_petitions/petitions_2018-12',\n", | |
" './Korpora/korean_petitions/petitions_2018-02',\n", | |
" './Korpora/korean_petitions/petitions_2018-08',\n", | |
" './Korpora/korean_petitions/petitions_2018-04',\n", | |
" './Korpora/korean_petitions/petitions_2017-12']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 30 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Ssdr7HBMX8Xo" | |
}, | |
"source": [ | |
"# 데이터 로딩\n", | |
"\n", | |
"- pandas로 `content` 부분만 읽어 파일로 만들기\n", | |
"- kss로 각 청원 게시글 내 문장 분리 \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eS_NOWeWXB-x" | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"from tqdm.auto import tqdm" | |
], | |
"execution_count": 31, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 49, | |
"referenced_widgets": [ | |
"24b919ba13f34f6aa148dc6435e450d5", | |
"e184b36746744b1ea8faf0fece80a377", | |
"7f1d80c730ca40f1ab8609986f9b7079", | |
"72aad332eb6b4534b8d9bce460905849", | |
"41e24ce51d304a89bffe7b7be661ba54", | |
"100c672a20e7416bacbdecf6a7417111", | |
"ece5002e3e064a73ba3be461907b4a9e", | |
"ef854ae062a3411aa407fc8ef36c2fa9", | |
"e69ee077a90e4d93852685ec424f6b1f", | |
"12837fed7e2a46fea6eb1b2175c35c2f", | |
"00cebb6154394ee884baf32814059c1e" | |
] | |
}, | |
"id": "oXnOcNi8XDC8", | |
"outputId": "0a3b73c5-973a-44bd-899b-5fe559d63523" | |
}, | |
"source": [ | |
"df = pd.concat([pd.read_json(i, lines=True) for i in tqdm(dataset)])" | |
], | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "24b919ba13f34f6aa148dc6435e450d5", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
" 0%| | 0/20 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
}, | |
"id": "yH94XSQfXEHU", | |
"outputId": "6154d041-bc63-4f8f-dc76-082c0e1bc94b" | |
}, | |
"source": [ | |
"df.head()" | |
], | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>category</th>\n", | |
" <th>begin</th>\n", | |
" <th>end</th>\n", | |
" <th>content</th>\n", | |
" <th>num_agree</th>\n", | |
" <th>petition_idx</th>\n", | |
" <th>status</th>\n", | |
" <th>title</th>\n", | |
" <th>replies</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>미래</td>\n", | |
" <td>2018-06-01</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>중국과 5년간 일때문에 교류를 하면서, 통역을 맡은 아가씨들이 모두 조선족아가씨 였...</td>\n", | |
" <td>9</td>\n", | |
" <td>257860</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>조선족은 중국사람 입니다!</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>기타</td>\n", | |
" <td>2018-06-01</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>안녕하세요 저는 18세 남자입니다 요즘 페북이나 청와대 홈페이지를 통해 청원들을 보...</td>\n", | |
" <td>3</td>\n", | |
" <td>257861</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>청원의 문제점</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>문화/예술/체육/언론</td>\n", | |
" <td>2018-06-01</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>어제 오늘 중계보고 있으려니 속터지네요 감독작전도 무대책 ᆢ리시브 기본도 없는 선수...</td>\n", | |
" <td>0</td>\n", | |
" <td>257862</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>여자대표 철수해주세요</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>정치개혁</td>\n", | |
" <td>2018-06-01</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>대통령님 덕분에 우리나라가 좋아졌다는걸 느껴요 항상 국민 옆에 계셔야 해요!</td>\n", | |
" <td>0</td>\n", | |
" <td>257863</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>문대통령님 오랫동안 대통령하시면 안될까요?</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>육아/교육</td>\n", | |
" <td>2018-06-01</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>급식이 너무 맛이 없고 가격도 비싸서 재원생의 원성이 자자합니다. 알고보니 급식업체...</td>\n", | |
" <td>34</td>\n", | |
" <td>257864</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>시대인재학원 급식 좀 맛있게 해주세요</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" category begin end ... status title replies\n", | |
"0 미래 2018-06-01 2018-07-01 ... 청원종료 조선족은 중국사람 입니다! NaN\n", | |
"1 기타 2018-06-01 2018-07-01 ... 청원종료 청원의 문제점 NaN\n", | |
"2 문화/예술/체육/언론 2018-06-01 2018-07-01 ... 청원종료 여자대표 철수해주세요 NaN\n", | |
"3 정치개혁 2018-06-01 2018-07-01 ... 청원종료 문대통령님 오랫동안 대통령하시면 안될까요? NaN\n", | |
"4 육아/교육 2018-06-01 2018-07-01 ... 청원종료 시대인재학원 급식 좀 맛있게 해주세요 NaN\n", | |
"\n", | |
"[5 rows x 9 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 33 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "_EmRjLbOXFHX", | |
"outputId": "3b8fe782-d0ce-450e-aabe-8fde6ce995c5" | |
}, | |
"source": [ | |
"len(df)" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"433631" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 34 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PZAqRMwBXG1i" | |
}, | |
"source": [ | |
"agreed_df = df[df['num_agree'] > 1000]" | |
], | |
"execution_count": 35, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "O5eeT1UZXUUn", | |
"outputId": "5fadd314-f8db-4145-8253-a1b1f574a50e" | |
}, | |
"source": [ | |
"len(agreed_df)" | |
], | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"3704" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 36 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1T8OvUwDXVM3" | |
}, | |
"source": [ | |
"import re\n", | |
"import emoji\n", | |
"from soynlp.normalizer import repeat_normalize\n", | |
"\n", | |
"emojis = ''.join(emoji.UNICODE_EMOJI.keys())\n", | |
"pattern = re.compile(f'[^ .,?!/@$%~%·∼()\\x00-\\x7Fㄱ-ㅣ가-힣{emojis}]+')\n", | |
"url_pattern = re.compile(\n", | |
" r'https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)')\n", | |
"\n", | |
"def clean(x):\n", | |
" x = pattern.sub(' ', x)\n", | |
" x = url_pattern.sub('', x)\n", | |
" x = x.strip()\n", | |
" x = repeat_normalize(x, num_repeats=2)\n", | |
" return x" | |
], | |
"execution_count": 37, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "z59q9G4gXWVh" | |
}, | |
"source": [ | |
"contents = agreed_df['content'].map(clean).to_list()" | |
], | |
"execution_count": 38, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "V-VI74f3XXV-" | |
}, | |
"source": [ | |
"from kss import split_sentences" | |
], | |
"execution_count": 39, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fJtIEVBpY58Z" | |
}, | |
"source": [ | |
"import os" | |
], | |
"execution_count": 40, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0H8sIa3CehbT", | |
"outputId": "c84a9404-d592-4d16-a21e-9a175c8ccec1", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"source": [ | |
"split_sentences(\"안녕하세요. 오늘은 날씨가 좋더라구요.\")" | |
], | |
"execution_count": 41, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['안녕하세요.', '오늘은 날씨가 좋더라구요.']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 41 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 322, | |
"referenced_widgets": [ | |
"a619d5c077494c628bfd589032f0057b", | |
"b687225b255a4b0a9945fb5a5b219f48", | |
"ec0bea3e70644604b3e8253d0d2ae528", | |
"bb0da81e63fb4b1b8f29d0b80d86248d", | |
"211d736187bf49dd930bf6c61e3619b6", | |
"9e88a788fead47a1a90af692d94942ff", | |
"3b7c4bc39fca494dba816c92c0f32cc8", | |
"fb9c04093d6d4850b5a8f35da73ecb56", | |
"91e2e116a06849079b045fe5e916c54f", | |
"36bc81bec3644b8b99f98524eb7492b1", | |
"7713dc447df04ebe8c42fbe717a3767b" | |
] | |
}, | |
"id": "i0RriVk2XY05", | |
"outputId": "ef307a1d-123c-4682-ae72-d2d4d04ddde7" | |
}, | |
"source": [ | |
"with open('korean_petitions_safe.txt', 'w') as f:\n", | |
" for doc in tqdm(contents):\n", | |
" if doc:\n", | |
" for line in split_sentences(doc):\n", | |
" f.write(line+'\\n')\n", | |
" f.write('\\n')\n", | |
" f.close()" | |
], | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "a619d5c077494c628bfd589032f0057b", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
" 0%| | 0/3704 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "error", | |
"ename": "TypeError", | |
"evalue": "ignored", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-43-c1df52e7c6de>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdoc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msplit_sentences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'\\n'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/kss/kss.py\u001b[0m in \u001b[0;36msplit_sentences\u001b[0;34m(text, use_heuristic, max_recover_step, max_recover_length, ignore_quotes_or_brackets)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mneed_to_replace_zwsp\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34mf\"\\u200b{s}\\u200b\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 162\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 163\u001b[0m \u001b[0mprev_1\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0mprev_2\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/kss/kss.py\u001b[0m in \u001b[0;36m_split_sentences\u001b[0;34m(text, use_heuristic, max_recover_step, max_recover_length, ignore_quotes_or_brackets, recover_step)\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0mcur_sentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdo_trim_sent_push_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_sentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mTable\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcur_stat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mprev_1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mID\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEXT1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 324\u001b[0m \u001b[0mcur_sentence\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mprev_1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mdo_trim_sent_push_results\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcur_sentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/kss/base.py\u001b[0m in \u001b[0;36mprocess\u001b[0;34m(self, cur_chr, prev_1, prev_2, prev_3, prev_4, single_stack, double_stack)\u001b[0m\n", | |
"\u001b[0;31mTypeError\u001b[0m: do_push_pop_symbol() missing 1 required positional argument: 'current_ch'" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9KWyr7_LYE8q" | |
}, | |
"source": [ | |
"# KcBERT-base MLM Finetune 학습하기 (GPU)\n", | |
"\n", | |
"- Huggingface Transformers에서 제공하는 `run_mlm.py` 스크립트로 KcBERT의 weight와 vocab을 불러와 MLM 학습을 진행합니다." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "q9Rta_FPXZ0L", | |
"outputId": "458e40e2-bd56-4696-bac0-f96e1b21aa74" | |
}, | |
"source": [ | |
"!mkdir -p ./test-mlm" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"mkdir: cannot create directory ‘./test-mlm’: File exists\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "bumVyJhPXcZe", | |
"outputId": "ca4b7f32-5b78-4235-eae8-d7982c5c0c4c" | |
}, | |
"source": [ | |
"!wget -O run_mlm.py https://raw.githubusercontent.com/huggingface/transformers/72aee83ced5f31302c5e331d896412737287f976/examples/pytorch/language-modeling/run_mlm.py" | |
], | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2021-08-16 05:39:13-- https://raw.githubusercontent.com/huggingface/transformers/72aee83ced5f31302c5e331d896412737287f976/examples/pytorch/language-modeling/run_mlm.py\n", | |
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", | |
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 24078 (24K) [text/plain]\n", | |
"Saving to: ‘run_mlm.py’\n", | |
"\n", | |
"\rrun_mlm.py 0%[ ] 0 --.-KB/s \rrun_mlm.py 100%[===================>] 23.51K --.-KB/s in 0.002s \n", | |
"\n", | |
"2021-08-16 05:39:14 (15.1 MB/s) - ‘run_mlm.py’ saved [24078/24078]\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Q5E3GLcOYs9y", | |
"outputId": "169e7894-4f23-4b83-deaa-9711260add5b" | |
}, | |
"source": [ | |
"!nvidia-smi" | |
], | |
"execution_count": 19, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Mon Aug 16 05:39:14 2021 \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 470.42.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", | |
"|-------------------------------+----------------------+----------------------+\n", | |
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|===============================+======================+======================|\n", | |
"| 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 |\n", | |
"| N/A 40C P0 27W / 250W | 0MiB / 16280MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-------------------------------+----------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=============================================================================|\n", | |
"| No running processes found |\n", | |
"+-----------------------------------------------------------------------------+\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "eYJkBneJd9wd", | |
"outputId": "37d181a6-2c10-41b2-ba97-918566f4c8bd" | |
}, | |
"source": [ | |
"!ls" | |
], | |
"execution_count": 20, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"korean_petitions_safe.txt\tKorpora sample_data\n", | |
"korean_petitions_safe.txt.lock\trun_mlm.py test-mlm\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4763DVvUeBPm" | |
}, | |
"source": [ | |
"!head -n 10 korean_petitions_safe.txt" | |
], | |
"execution_count": 21, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "ikFSD-VzXbVS", | |
"outputId": "9bc3b559-6771-46d8-a8a9-55376e8653b3" | |
}, | |
"source": [ | |
"!python run_mlm.py \\\n", | |
" --model_name_or_path beomi/kcbert-base \\\n", | |
" --train_file korean_petitions_safe.txt \\\n", | |
" --do_train \\\n", | |
" --output_dir ./test-mlm" | |
], | |
"execution_count": 22, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"2021-08-16 05:39:15.893093: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", | |
"08/16/2021 05:39:17 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", | |
"08/16/2021 05:39:17 - INFO - __main__ - Training/evaluation parameters TrainingArguments(\n", | |
"_n_gpu=1,\n", | |
"adafactor=False,\n", | |
"adam_beta1=0.9,\n", | |
"adam_beta2=0.999,\n", | |
"adam_epsilon=1e-08,\n", | |
"dataloader_drop_last=False,\n", | |
"dataloader_num_workers=0,\n", | |
"dataloader_pin_memory=True,\n", | |
"ddp_find_unused_parameters=None,\n", | |
"debug=[],\n", | |
"deepspeed=None,\n", | |
"disable_tqdm=False,\n", | |
"do_eval=False,\n", | |
"do_predict=False,\n", | |
"do_train=True,\n", | |
"eval_accumulation_steps=None,\n", | |
"eval_steps=None,\n", | |
"evaluation_strategy=IntervalStrategy.NO,\n", | |
"fp16=False,\n", | |
"fp16_backend=auto,\n", | |
"fp16_full_eval=False,\n", | |
"fp16_opt_level=O1,\n", | |
"gradient_accumulation_steps=1,\n", | |
"greater_is_better=None,\n", | |
"group_by_length=False,\n", | |
"ignore_data_skip=False,\n", | |
"label_names=None,\n", | |
"label_smoothing_factor=0.0,\n", | |
"learning_rate=5e-05,\n", | |
"length_column_name=length,\n", | |
"load_best_model_at_end=False,\n", | |
"local_rank=-1,\n", | |
"log_level=-1,\n", | |
"log_level_replica=-1,\n", | |
"log_on_each_node=True,\n", | |
"logging_dir=./test-mlm/runs/Aug16_05-39-17_cb60fe2c1a57,\n", | |
"logging_first_step=False,\n", | |
"logging_steps=500,\n", | |
"logging_strategy=IntervalStrategy.STEPS,\n", | |
"lr_scheduler_type=SchedulerType.LINEAR,\n", | |
"max_grad_norm=1.0,\n", | |
"max_steps=-1,\n", | |
"metric_for_best_model=None,\n", | |
"mp_parameters=,\n", | |
"no_cuda=False,\n", | |
"num_train_epochs=3.0,\n", | |
"output_dir=./test-mlm,\n", | |
"overwrite_output_dir=False,\n", | |
"past_index=-1,\n", | |
"per_device_eval_batch_size=8,\n", | |
"per_device_train_batch_size=8,\n", | |
"prediction_loss_only=False,\n", | |
"push_to_hub=False,\n", | |
"push_to_hub_model_id=test-mlm,\n", | |
"push_to_hub_organization=None,\n", | |
"push_to_hub_token=None,\n", | |
"remove_unused_columns=True,\n", | |
"report_to=['tensorboard'],\n", | |
"resume_from_checkpoint=None,\n", | |
"run_name=./test-mlm,\n", | |
"save_on_each_node=False,\n", | |
"save_steps=500,\n", | |
"save_strategy=IntervalStrategy.STEPS,\n", | |
"save_total_limit=None,\n", | |
"seed=42,\n", | |
"sharded_ddp=[],\n", | |
"skip_memory_metrics=True,\n", | |
"tpu_metrics_debug=False,\n", | |
"tpu_num_cores=None,\n", | |
"use_legacy_prediction_loop=False,\n", | |
"warmup_ratio=0.0,\n", | |
"warmup_steps=0,\n", | |
"weight_decay=0.0,\n", | |
")\n", | |
"08/16/2021 05:39:18 - WARNING - datasets.builder - Using custom data configuration default-dd91b8b7dab8cd99\n", | |
"08/16/2021 05:39:18 - INFO - datasets.builder - Generating dataset text (/root/.cache/huggingface/datasets/text/default-dd91b8b7dab8cd99/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)\n", | |
"Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-dd91b8b7dab8cd99/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...\n", | |
"100% 1/1 [00:00<00:00, 9020.01it/s]\n", | |
"08/16/2021 05:39:18 - INFO - datasets.utils.download_manager - Downloading took 0.0 min\n", | |
"08/16/2021 05:39:18 - INFO - datasets.utils.download_manager - Checksum Computation took 0.0 min\n", | |
"100% 1/1 [00:00<00:00, 1340.03it/s]\n", | |
"08/16/2021 05:39:18 - INFO - datasets.utils.info_utils - Unable to verify checksums.\n", | |
"08/16/2021 05:39:18 - INFO - datasets.builder - Generating split train\n", | |
"Traceback (most recent call last):\n", | |
" File \"run_mlm.py\", line 550, in <module>\n", | |
" main()\n", | |
" File \"run_mlm.py\", line 287, in main\n", | |
" raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)\n", | |
" File \"/usr/local/lib/python3.7/dist-packages/datasets/load.py\", line 852, in load_dataset\n", | |
" use_auth_token=use_auth_token,\n", | |
" File \"/usr/local/lib/python3.7/dist-packages/datasets/builder.py\", line 616, in download_and_prepare\n", | |
" dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs\n", | |
" File \"/usr/local/lib/python3.7/dist-packages/datasets/builder.py\", line 693, in _download_and_prepare\n", | |
" self._prepare_split(split_generator, **prepare_split_kwargs)\n", | |
" File \"/usr/local/lib/python3.7/dist-packages/datasets/builder.py\", line 1166, in _prepare_split\n", | |
" num_examples, num_bytes = writer.finalize()\n", | |
" File \"/usr/local/lib/python3.7/dist-packages/datasets/arrow_writer.py\", line 425, in finalize\n", | |
" raise ValueError(\"Please pass `features` or at least one example when writing data\")\n", | |
"ValueError: Please pass `features` or at least one example when writing data\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "81n_IaCRdP2w" | |
}, | |
"source": [ | |
"### 학습 완료 후 아래 에러는 무시하셔도 됩니다.\n", | |
"\n", | |
"- 학습 완료된 파일들은 `test-mlm` 폴더 내에 들어있습니다 :)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "wDXIj3uadNYh" | |
}, | |
"source": [ | |
"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fDq7EV6XYTho" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": 22, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment