Created
March 15, 2021 05:45
-
-
Save Beomi/cc87708b978a4f4ed87793762c019866 to your computer and use it in GitHub Desktop.
2021.03.15. KcBERT MLM Finetune with Petition Dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "2021.03.15. KcBERT MLM Finetune with Petition Dataset", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"machine_shape": "hm", | |
"authorship_tag": "ABX9TyPkJnRAn/rZZxXG+EirhdSe", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU", | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"9faa5adcf806489793a7d149b1b42b65": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_aa687d20633e4f4983e8050973f179d6", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_a8fd2c7b26f94d3b92f8907c2107ff55", | |
"IPY_MODEL_05b38759833a41d3bfb17475d3ad7f01", | |
"IPY_MODEL_2c9648eda92f477f83700bd0db7e473c" | |
] | |
} | |
}, | |
"aa687d20633e4f4983e8050973f179d6": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"a8fd2c7b26f94d3b92f8907c2107ff55": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_98872de6b08d42329de1d89dac042d33", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": "100%", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_77b77cb3df4a42cf9ef7e2226d800121" | |
} | |
}, | |
"05b38759833a41d3bfb17475d3ad7f01": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_0bf215884d134117acd1ab1bcf87d4b7", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 20, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 20, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_96830e13310f444bb887c797f95b0a6f" | |
} | |
}, | |
"2c9648eda92f477f83700bd0db7e473c": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_c12e47c0f60049c181b6eeac324e7c0d", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 20/20 [00:09<00:00, 2.55it/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_089f6882066b46eaa9cc92fd043db7d9" | |
} | |
}, | |
"98872de6b08d42329de1d89dac042d33": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"77b77cb3df4a42cf9ef7e2226d800121": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"0bf215884d134117acd1ab1bcf87d4b7": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"96830e13310f444bb887c797f95b0a6f": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"c12e47c0f60049c181b6eeac324e7c0d": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"089f6882066b46eaa9cc92fd043db7d9": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"7a0d1e56284b417c8daff6471e4dee81": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_e1df92b7c6eb468eb4a9c73aff0d828a", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_5796de99da2b473a8c73bfe696825dc7", | |
"IPY_MODEL_7f97c030c7534310a7d47dbf422f56bd", | |
"IPY_MODEL_930cd013a7bf45b2849b8963f8c7f7a4" | |
] | |
} | |
}, | |
"e1df92b7c6eb468eb4a9c73aff0d828a": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"5796de99da2b473a8c73bfe696825dc7": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_5d0dd0a218ef4d75926dd84812d7f7ee", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": "100%", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_3ebf901c3e5d4f49b7ac8e65bac8f3be" | |
} | |
}, | |
"7f97c030c7534310a7d47dbf422f56bd": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_98600149fa204fe8ae64cb288548f5c3", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 3704, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 3704, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_d89775056ac8482c9b52c638215036bb" | |
} | |
}, | |
"930cd013a7bf45b2849b8963f8c7f7a4": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_b851d679ea904e4f874794aee2b9c741", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 3704/3704 [01:24<00:00, 57.08it/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_b55df5f244f744389204e6446cd74e16" | |
} | |
}, | |
"5d0dd0a218ef4d75926dd84812d7f7ee": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"3ebf901c3e5d4f49b7ac8e65bac8f3be": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"98600149fa204fe8ae64cb288548f5c3": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"d89775056ac8482c9b52c638215036bb": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"b851d679ea904e4f874794aee2b9c741": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"b55df5f244f744389204e6446cd74e16": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
} | |
} | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Beomi/cc87708b978a4f4ed87793762c019866/2021-03-15-kcbert-mlm-finetune-with-petition-dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XxaDw3JIXrDf" | |
}, | |
"source": [ | |
"# 필요한 패키지 설치\n", | |
"\n", | |
"- Korpora: 데이터셋 다운로드\n", | |
"- emoji: 이모지코드\n", | |
"- soynlp: Preprocessing\n", | |
"- kss: 한국어 문장 분리기\n", | |
"- transformers: MLM 학습 및 데이터셋\n", | |
" - datasets\n", | |
" - protobuf\n", | |
" - sentencepiece" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "lqA2SU2sWqjR" | |
}, | |
"source": [ | |
"!pip install -q Korpora emoji soynlp kss transformers \"datasets >= 1.1.3\" \"sentencepiece != 0.1.92\" protobuf" | |
], | |
"execution_count": 1, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "0nAbFcVFXzV4" | |
}, | |
"source": [ | |
"# 예시용 데이터셋 다운로드\n", | |
"\n", | |
"- 여기서는 Korean petitions dataset(국민청원 데이터셋)을 사용\n", | |
"- 전체 중 동의 수가 1000건 초과인 본문만 사용" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "3Z7ji3QnW7JG" | |
}, | |
"source": [ | |
"from Korpora import Korpora" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "erTvWvZCW91J", | |
"outputId": "b673e35a-14f2-4acd-9a4a-f9c01bd22c42" | |
}, | |
"source": [ | |
"Korpora.fetch('korean_petitions', root_dir='./Korpora')" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-08\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-09\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-10\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-11\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2017-12\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-01\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-02\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-03\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-04\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-05\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-06\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-07\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-08\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-09\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-10\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-11\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2018-12\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-01\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-02\n", | |
"[Korpora] Corpus `korean_petitions` is already installed at /content/Korpora/korean_petitions/petitions_2019-03\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Gk4EXcX3W-LO" | |
}, | |
"source": [ | |
"from glob import glob" | |
], | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "XPQDC74FXAxb", | |
"outputId": "f87e9593-81c5-4104-eeed-37ad5917c290" | |
}, | |
"source": [ | |
"dataset = glob('./Korpora/korean_petitions/petitions*')\n", | |
"dataset" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['./Korpora/korean_petitions/petitions_2018-07',\n", | |
" './Korpora/korean_petitions/petitions_2019-01',\n", | |
" './Korpora/korean_petitions/petitions_2017-11',\n", | |
" './Korpora/korean_petitions/petitions_2018-01',\n", | |
" './Korpora/korean_petitions/petitions_2017-09',\n", | |
" './Korpora/korean_petitions/petitions_2018-06',\n", | |
" './Korpora/korean_petitions/petitions_2018-10',\n", | |
" './Korpora/korean_petitions/petitions_2018-11',\n", | |
" './Korpora/korean_petitions/petitions_2017-12',\n", | |
" './Korpora/korean_petitions/petitions_2019-02',\n", | |
" './Korpora/korean_petitions/petitions_2018-02',\n", | |
" './Korpora/korean_petitions/petitions_2018-08',\n", | |
" './Korpora/korean_petitions/petitions_2018-04',\n", | |
" './Korpora/korean_petitions/petitions_2019-03',\n", | |
" './Korpora/korean_petitions/petitions_2018-05',\n", | |
" './Korpora/korean_petitions/petitions_2018-09',\n", | |
" './Korpora/korean_petitions/petitions_2017-10',\n", | |
" './Korpora/korean_petitions/petitions_2017-08',\n", | |
" './Korpora/korean_petitions/petitions_2018-12',\n", | |
" './Korpora/korean_petitions/petitions_2018-03']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Ssdr7HBMX8Xo" | |
}, | |
"source": [ | |
"# 데이터 로딩\n", | |
"\n", | |
"- pandas로 `content` 부분만 읽어 파일로 만들기\n", | |
"- kss로 각 청원 게시글 내 문장 분리 \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eS_NOWeWXB-x" | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"from tqdm.auto import tqdm" | |
], | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 49, | |
"referenced_widgets": [ | |
"9faa5adcf806489793a7d149b1b42b65", | |
"aa687d20633e4f4983e8050973f179d6", | |
"a8fd2c7b26f94d3b92f8907c2107ff55", | |
"05b38759833a41d3bfb17475d3ad7f01", | |
"2c9648eda92f477f83700bd0db7e473c", | |
"98872de6b08d42329de1d89dac042d33", | |
"77b77cb3df4a42cf9ef7e2226d800121", | |
"0bf215884d134117acd1ab1bcf87d4b7", | |
"96830e13310f444bb887c797f95b0a6f", | |
"c12e47c0f60049c181b6eeac324e7c0d", | |
"089f6882066b46eaa9cc92fd043db7d9" | |
] | |
}, | |
"id": "oXnOcNi8XDC8", | |
"outputId": "369b48b0-5b1c-4378-d62c-99e2ba20c58f" | |
}, | |
"source": [ | |
"df = pd.concat([pd.read_json(i, lines=True) for i in tqdm(dataset)])" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "9faa5adcf806489793a7d149b1b42b65", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
" 0%| | 0/20 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 289 | |
}, | |
"id": "yH94XSQfXEHU", | |
"outputId": "fb7c84c2-7307-4544-adb6-49ce19066472" | |
}, | |
"source": [ | |
"df.head()" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>category</th>\n", | |
" <th>begin</th>\n", | |
" <th>end</th>\n", | |
" <th>content</th>\n", | |
" <th>num_agree</th>\n", | |
" <th>petition_idx</th>\n", | |
" <th>status</th>\n", | |
" <th>title</th>\n", | |
" <th>replies</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>일자리</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>2018-07-31</td>\n", | |
" <td>포괄임금제 및 52시간 근로제 취지와 다른 악의적 적용 사례를 들어 폐지 및 보완을...</td>\n", | |
" <td>406</td>\n", | |
" <td>291662</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>포괄임금제 폐지 요청 및 주52시간 관련</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>보건복지</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>2018-07-31</td>\n", | |
" <td>의료사고로 의사의오진으로 병명이 바뀌어서 생명에 지장이 왔다갔다한 경우, 옮긴병원에...</td>\n", | |
" <td>14</td>\n", | |
" <td>291663</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>억울한 의료 사고에 대한 내용</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>외교/통일/국방</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>2018-07-31</td>\n", | |
" <td>대한민국 세계유일의 분단국가이자 휴전중인 상황에서 대한민국의 젊은 청춘들은 국방의 ...</td>\n", | |
" <td>62</td>\n", | |
" <td>291664</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>형편없는 대한민국 국군 의료시스템 혁신적인 개선이 필요합니다.</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>육아/교육</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>2018-07-31</td>\n", | |
" <td>어제 교육부의 2022년 수능개편안을 보았습니다. 정말 어처구니 없는 내용이었습니다...</td>\n", | |
" <td>12</td>\n", | |
" <td>291666</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>수능 개편 반대합니다.</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>외교/통일/국방</td>\n", | |
" <td>2018-07-01</td>\n", | |
" <td>2018-07-31</td>\n", | |
" <td>국민의 4대의무중 국방의무가 있습니다 대한민국의 남자라면 대부분 국가의 부름을받습니...</td>\n", | |
" <td>18</td>\n", | |
" <td>291667</td>\n", | |
" <td>청원종료</td>\n", | |
" <td>국방의무를 다합시다. 대체복무 반대</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" category begin ... title replies\n", | |
"0 일자리 2018-07-01 ... 포괄임금제 폐지 요청 및 주52시간 관련 NaN\n", | |
"1 보건복지 2018-07-01 ... 억울한 의료 사고에 대한 내용 NaN\n", | |
"2 외교/통일/국방 2018-07-01 ... 형편없는 대한민국 국군 의료시스템 혁신적인 개선이 필요합니다. NaN\n", | |
"3 육아/교육 2018-07-01 ... 수능 개편 반대합니다. NaN\n", | |
"4 외교/통일/국방 2018-07-01 ... 국방의무를 다합시다. 대체복무 반대 NaN\n", | |
"\n", | |
"[5 rows x 9 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 8 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "_EmRjLbOXFHX", | |
"outputId": "8e9eea74-7311-4841-fa1a-abf978790d1c" | |
}, | |
"source": [ | |
"len(df)" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"433631" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 9 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PZAqRMwBXG1i" | |
}, | |
"source": [ | |
"agreed_df = df[df['num_agree'] > 1000]" | |
], | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "O5eeT1UZXUUn", | |
"outputId": "58736a25-c0a5-4e05-8622-73a0ceacf9ca" | |
}, | |
"source": [ | |
"len(agreed_df)" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"3704" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 11 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1T8OvUwDXVM3" | |
}, | |
"source": [ | |
"import re\n", | |
"import emoji\n", | |
"from soynlp.normalizer import repeat_normalize\n", | |
"\n", | |
"emojis = ''.join(emoji.UNICODE_EMOJI.keys())\n", | |
"pattern = re.compile(f'[^ .,?!/@$%~%·∼()\\x00-\\x7Fㄱ-ㅣ가-힣{emojis}]+')\n", | |
"url_pattern = re.compile(\n", | |
" r'https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\\+.~#?&//=]*)')\n", | |
"\n", | |
"def clean(x):\n", | |
" x = pattern.sub(' ', x)\n", | |
" x = url_pattern.sub('', x)\n", | |
" x = x.strip()\n", | |
" x = repeat_normalize(x, num_repeats=2)\n", | |
" return x" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "z59q9G4gXWVh" | |
}, | |
"source": [ | |
"contents = agreed_df['content'].map(clean).to_list()" | |
], | |
"execution_count": 13, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "V-VI74f3XXV-" | |
}, | |
"source": [ | |
"from kss import split_sentences" | |
], | |
"execution_count": 14, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fJtIEVBpY58Z" | |
}, | |
"source": [ | |
"import os" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 66, | |
"referenced_widgets": [ | |
"7a0d1e56284b417c8daff6471e4dee81", | |
"e1df92b7c6eb468eb4a9c73aff0d828a", | |
"5796de99da2b473a8c73bfe696825dc7", | |
"7f97c030c7534310a7d47dbf422f56bd", | |
"930cd013a7bf45b2849b8963f8c7f7a4", | |
"5d0dd0a218ef4d75926dd84812d7f7ee", | |
"3ebf901c3e5d4f49b7ac8e65bac8f3be", | |
"98600149fa204fe8ae64cb288548f5c3", | |
"d89775056ac8482c9b52c638215036bb", | |
"b851d679ea904e4f874794aee2b9c741", | |
"b55df5f244f744389204e6446cd74e16" | |
] | |
}, | |
"id": "i0RriVk2XY05", | |
"outputId": "fc1e1665-e4de-45d1-d1a8-31661bea2624" | |
}, | |
"source": [ | |
"if not os.path.exists('korean_petitions_safe.txt'):\n", | |
" with open('korean_petitions_safe.txt', 'w') as f:\n", | |
" for doc in tqdm(contents):\n", | |
" for line in split_sentences(doc, safe=True):\n", | |
" f.write(line+'\\n')\n", | |
" f.write('\\n')" | |
], | |
"execution_count": 15, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "7a0d1e56284b417c8daff6471e4dee81", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
" 0%| | 0/3704 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"end\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9KWyr7_LYE8q" | |
}, | |
"source": [ | |
"# KcBERT-base MLM Finetune 학습하기 (GPU)\n", | |
"\n", | |
"- Huggingface Transformers에서 제공하는 `run_mlm.py` 스크립트를 사용해, KcBERT의 weight와 vocab으로 MLM 학습을 진행합니다." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "q9Rta_FPXZ0L", | |
"outputId": "c457dd55-cadf-4b84-c9fc-881b80f5e61f" | |
}, | |
"source": [ | |
"!mkdir ./test-mlm" | |
], | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"mkdir: cannot create directory ‘./test-mlm’: File exists\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "bumVyJhPXcZe", | |
"outputId": "d3301679-1405-4055-d4b3-fae3d2535173" | |
}, | |
"source": [ | |
"!wget -nc https://raw.githubusercontent.com/huggingface/transformers/4c32f9f26e6a84f0d9843fec8757e6ce640bb44e/examples/language-modeling/run_mlm.py" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"File ‘run_mlm.py’ already there; not retrieving.\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Q5E3GLcOYs9y", | |
"outputId": "3601ca5d-b277-481f-92ee-078f636f3d79" | |
}, | |
"source": [ | |
"!nvidia-smi" | |
], | |
"execution_count": 18, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Mon Mar 15 05:38:33 2021 \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 460.56 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", | |
"|-------------------------------+----------------------+----------------------+\n", | |
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|===============================+======================+======================|\n", | |
"| 0 Tesla V100-SXM2... Off | 00000000:00:04.0 Off | 0 |\n", | |
"| N/A 33C P0 24W / 300W | 0MiB / 16160MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-------------------------------+----------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=============================================================================|\n", | |
"| No running processes found |\n", | |
"+-----------------------------------------------------------------------------+\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "ikFSD-VzXbVS", | |
"outputId": "c7ce54b2-68b0-4038-d430-9a6107145491" | |
}, | |
"source": [ | |
"!python run_mlm.py \\\n", | |
" --model_name_or_path beomi/kcbert-base \\\n", | |
" --train_file korean_petitions_safe.txt \\\n", | |
" --do_train \\\n", | |
" --output_dir ./test-mlm" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"2021-03-15 05:38:37.559396: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0\n", | |
"03/15/2021 05:38:38 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1distributed training: False, 16-bits training: False\n", | |
"03/15/2021 05:38:38 - INFO - __main__ - Training/evaluation parameters TrainingArguments(output_dir=./test-mlm, overwrite_output_dir=False, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=1, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Mar15_05-38-38_ae41adae085d, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=None, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./test-mlm, disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, sharded_ddp=False, deepspeed=None, label_smoothing_factor=0.0, adafactor=False, group_by_length=False, report_to=['tensorboard'], ddp_find_unused_parameters=None, dataloader_pin_memory=True, _n_gpu=1)\n", | |
"03/15/2021 05:38:39 - WARNING - datasets.builder - Using custom data configuration default-b170ee81aef401ea\n", | |
"Downloading and preparing dataset text/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/text/default-b170ee81aef401ea/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57...\n", | |
"Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-b170ee81aef401ea/0.0.0/293ecb642f9fca45b44ad1f90c8445c54b9d80b95ab3fca3cfa5e1e3d85d4a57. Subsequent calls will reuse this data.\n", | |
"[INFO|file_utils.py:1302] 2021-03-15 05:38:39,663 >> https://huggingface.co/beomi/kcbert-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpn3xpel08\n", | |
"Downloading: 100% 619/619 [00:00<00:00, 586kB/s]\n", | |
"[INFO|file_utils.py:1306] 2021-03-15 05:38:39,932 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n", | |
"[INFO|file_utils.py:1309] 2021-03-15 05:38:39,933 >> creating metadata file for /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n", | |
"[INFO|configuration_utils.py:449] 2021-03-15 05:38:39,933 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n", | |
"[INFO|configuration_utils.py:485] 2021-03-15 05:38:39,933 >> Model config BertConfig {\n", | |
" \"architectures\": [\n", | |
" \"BertForMaskedLM\"\n", | |
" ],\n", | |
" \"attention_probs_dropout_prob\": 0.1,\n", | |
" \"directionality\": \"bidi\",\n", | |
" \"gradient_checkpointing\": false,\n", | |
" \"hidden_act\": \"gelu\",\n", | |
" \"hidden_dropout_prob\": 0.1,\n", | |
" \"hidden_size\": 768,\n", | |
" \"initializer_range\": 0.02,\n", | |
" \"intermediate_size\": 3072,\n", | |
" \"layer_norm_eps\": 1e-12,\n", | |
" \"max_position_embeddings\": 300,\n", | |
" \"model_type\": \"bert\",\n", | |
" \"num_attention_heads\": 12,\n", | |
" \"num_hidden_layers\": 12,\n", | |
" \"pad_token_id\": 0,\n", | |
" \"pooler_fc_size\": 768,\n", | |
" \"pooler_num_attention_heads\": 12,\n", | |
" \"pooler_num_fc_layers\": 3,\n", | |
" \"pooler_size_per_head\": 128,\n", | |
" \"pooler_type\": \"first_token_transform\",\n", | |
" \"position_embedding_type\": \"absolute\",\n", | |
" \"transformers_version\": \"4.3.3\",\n", | |
" \"type_vocab_size\": 2,\n", | |
" \"use_cache\": true,\n", | |
" \"vocab_size\": 30000\n", | |
"}\n", | |
"\n", | |
"[INFO|configuration_utils.py:449] 2021-03-15 05:38:40,201 >> loading configuration file https://huggingface.co/beomi/kcbert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/10de039f2f91b0c6fbd30fad5bf8a7468a20701212ed12f9f5e610edb99c55d1.d8a72131e15fd1d856f1b39abf4eff31d458aeeca0a4192df898ca699ec7d779\n", | |
"[INFO|configuration_utils.py:485] 2021-03-15 05:38:40,202 >> Model config BertConfig {\n", | |
" \"architectures\": [\n", | |
" \"BertForMaskedLM\"\n", | |
" ],\n", | |
" \"attention_probs_dropout_prob\": 0.1,\n", | |
" \"directionality\": \"bidi\",\n", | |
" \"gradient_checkpointing\": false,\n", | |
" \"hidden_act\": \"gelu\",\n", | |
" \"hidden_dropout_prob\": 0.1,\n", | |
" \"hidden_size\": 768,\n", | |
" \"initializer_range\": 0.02,\n", | |
" \"intermediate_size\": 3072,\n", | |
" \"layer_norm_eps\": 1e-12,\n", | |
" \"max_position_embeddings\": 300,\n", | |
" \"model_type\": \"bert\",\n", | |
" \"num_attention_heads\": 12,\n", | |
" \"num_hidden_layers\": 12,\n", | |
" \"pad_token_id\": 0,\n", | |
" \"pooler_fc_size\": 768,\n", | |
" \"pooler_num_attention_heads\": 12,\n", | |
" \"pooler_num_fc_layers\": 3,\n", | |
" \"pooler_size_per_head\": 128,\n", | |
" \"pooler_type\": \"first_token_transform\",\n", | |
" \"position_embedding_type\": \"absolute\",\n", | |
" \"transformers_version\": \"4.3.3\",\n", | |
" \"type_vocab_size\": 2,\n", | |
" \"use_cache\": true,\n", | |
" \"vocab_size\": 30000\n", | |
"}\n", | |
"\n", | |
"[INFO|tokenization_utils_base.py:1688] 2021-03-15 05:38:40,202 >> Model name 'beomi/kcbert-base' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). Assuming 'beomi/kcbert-base' is a path, a model identifier, or url to a directory containing tokenizer files.\n", | |
"[INFO|file_utils.py:1302] 2021-03-15 05:38:40,472 >> https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpn5ot4e1n\n", | |
"Downloading: 100% 250k/250k [00:00<00:00, 736kB/s]\n", | |
"[INFO|file_utils.py:1306] 2021-03-15 05:38:41,081 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n", | |
"[INFO|file_utils.py:1309] 2021-03-15 05:38:41,081 >> creating metadata file for /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n", | |
"[INFO|file_utils.py:1302] 2021-03-15 05:38:42,152 >> https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpw_x2bo_z\n", | |
"Downloading: 100% 49.0/49.0 [00:00<00:00, 43.4kB/s]\n", | |
"[INFO|file_utils.py:1306] 2021-03-15 05:38:42,422 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n", | |
"[INFO|file_utils.py:1309] 2021-03-15 05:38:42,422 >> creating metadata file for /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n", | |
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/527aa95c387f7c7aa3bebe490a9ede81af16f407b169db730d22632d5822b640.1b39769be8fe13da6152a54d35d7973b687b1aa6067771885d39610963e29dbe\n", | |
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer.json from cache at None\n", | |
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/added_tokens.json from cache at None\n", | |
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,422 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/special_tokens_map.json from cache at None\n", | |
"[INFO|tokenization_utils_base.py:1786] 2021-03-15 05:38:42,423 >> loading file https://huggingface.co/beomi/kcbert-base/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/21078f0099ac15db7a5163f4fea7f742808c30cb0393f1ee56a43dc56d9eb082.cca45b9490565b45e1c62cf5a0529b670fc5ab0db2d4a4af99f6ac577b673eb1\n", | |
"[INFO|file_utils.py:1302] 2021-03-15 05:38:42,739 >> https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpv6awhvr7\n", | |
"Downloading: 100% 438M/438M [00:15<00:00, 28.2MB/s]\n", | |
"[INFO|file_utils.py:1306] 2021-03-15 05:38:58,916 >> storing https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n", | |
"[INFO|file_utils.py:1309] 2021-03-15 05:38:58,917 >> creating metadata file for /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n", | |
"[INFO|modeling_utils.py:1027] 2021-03-15 05:38:58,917 >> loading weights file https://huggingface.co/beomi/kcbert-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/1c204bf1f008ee734eeb5ce678b148d14fa298802ce16d879c92a22a52527a0e.6cdf570ee57a7f6a5c727c436a4c26d8e9601ddaa1377ebcb16b7285d76125cd\n", | |
"[WARNING|modeling_utils.py:1135] 2021-03-15 05:39:02,868 >> Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", | |
"- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", | |
"- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", | |
"[INFO|modeling_utils.py:1152] 2021-03-15 05:39:02,868 >> All the weights of BertForMaskedLM were initialized from the model checkpoint at beomi/kcbert-base.\n", | |
"If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.\n", | |
" 0% 0/71 [00:00<?, ?ba/s][WARNING|tokenization_utils_base.py:3213] 2021-03-15 05:39:02,958 >> Token indices sequence length is longer than the specified maximum sequence length for this model (329 > 300). Running this sequence through the model will result in indexing errors\n", | |
"100% 71/71 [00:03<00:00, 19.30ba/s]\n", | |
"100% 71/71 [00:20<00:00, 3.43ba/s]\n", | |
"[INFO|trainer.py:432] 2021-03-15 05:39:33,825 >> The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.\n", | |
"[INFO|trainer.py:837] 2021-03-15 05:39:33,828 >> ***** Running training *****\n", | |
"[INFO|trainer.py:838] 2021-03-15 05:39:33,828 >> Num examples = 7159\n", | |
"[INFO|trainer.py:839] 2021-03-15 05:39:33,828 >> Num Epochs = 3\n", | |
"[INFO|trainer.py:840] 2021-03-15 05:39:33,828 >> Instantaneous batch size per device = 8\n", | |
"[INFO|trainer.py:841] 2021-03-15 05:39:33,828 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n", | |
"[INFO|trainer.py:842] 2021-03-15 05:39:33,828 >> Gradient Accumulation steps = 1\n", | |
"[INFO|trainer.py:843] 2021-03-15 05:39:33,828 >> Total optimization steps = 2685\n", | |
"{'loss': 2.7071, 'learning_rate': 4.068901303538175e-05, 'epoch': 0.56}\n", | |
" 19% 500/2685 [01:39<07:14, 5.03it/s][INFO|trainer.py:1408] 2021-03-15 05:41:13,197 >> Saving model checkpoint to ./test-mlm/checkpoint-500\n", | |
"[INFO|configuration_utils.py:304] 2021-03-15 05:41:13,199 >> Configuration saved in ./test-mlm/checkpoint-500/config.json\n", | |
"[INFO|modeling_utils.py:817] 2021-03-15 05:41:14,457 >> Model weights saved in ./test-mlm/checkpoint-500/pytorch_model.bin\n", | |
"{'loss': 2.5856, 'learning_rate': 3.13780260707635e-05, 'epoch': 1.12}\n", | |
" 37% 1000/2685 [03:23<05:34, 5.03it/s][INFO|trainer.py:1408] 2021-03-15 05:42:57,398 >> Saving model checkpoint to ./test-mlm/checkpoint-1000\n", | |
"[INFO|configuration_utils.py:304] 2021-03-15 05:42:57,399 >> Configuration saved in ./test-mlm/checkpoint-1000/config.json\n", | |
"[INFO|modeling_utils.py:817] 2021-03-15 05:42:58,615 >> Model weights saved in ./test-mlm/checkpoint-1000/pytorch_model.bin\n", | |
"{'loss': 2.5064, 'learning_rate': 2.206703910614525e-05, 'epoch': 1.68}\n", | |
" 56% 1500/2685 [05:07<03:53, 5.07it/s][INFO|trainer.py:1408] 2021-03-15 05:44:41,722 >> Saving model checkpoint to ./test-mlm/checkpoint-1500\n", | |
"[INFO|configuration_utils.py:304] 2021-03-15 05:44:41,723 >> Configuration saved in ./test-mlm/checkpoint-1500/config.json\n", | |
"[INFO|modeling_utils.py:817] 2021-03-15 05:44:42,948 >> Model weights saved in ./test-mlm/checkpoint-1500/pytorch_model.bin\n", | |
" 61% 1634/2685 [05:39<03:29, 5.02it/s]" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fDq7EV6XYTho" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment