-
-
Save kittinan/143f13c5c9108710c5c56d6b7981e2ac to your computer and use it in GitHub Desktop.
fake_berthai.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "fake_berthai.ipynb", | |
"provenance": [], | |
"collapsed_sections": [ | |
"psuL88nuoanq" | |
], | |
"authorship_tag": "ABX9TyPua2WUeSK7Orlb0u/RqH4I", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"accelerator": "GPU", | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"5763d9c666104798b9142f7930feb082": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_bb5d731d4dd7459cb4cff26826981941", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_361718c1529e4752ba39be680d113ea5", | |
"IPY_MODEL_887b453cff9047a7a3a3bd2ab770b706" | |
] | |
} | |
}, | |
"bb5d731d4dd7459cb4cff26826981941": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"361718c1529e4752ba39be680d113ea5": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_5665589f88e34f84a77a13dffe4b1c6d", | |
"_dom_classes": [], | |
"description": "Downloading: 100%", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 443, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 443, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_c17c3e07f0224529a97fef31bbdc11ad" | |
} | |
}, | |
"887b453cff9047a7a3a3bd2ab770b706": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_b693309e4f7b43f18ab854f862cccdb4", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 443/443 [00:00<00:00, 756B/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_9c46eaeee1fa4f1fb5dde626cdd04aa5" | |
} | |
}, | |
"5665589f88e34f84a77a13dffe4b1c6d": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "initial", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"c17c3e07f0224529a97fef31bbdc11ad": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"b693309e4f7b43f18ab854f862cccdb4": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"9c46eaeee1fa4f1fb5dde626cdd04aa5": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"f9edfdc8fb514f23ba55610df90314d2": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_b8d20cc2a1dc419f91c17fc51cf3aa3d", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_76900b67fe0448bc96ab847f01494eb7", | |
"IPY_MODEL_02226a523d904b00b438a51c2167280b" | |
] | |
} | |
}, | |
"b8d20cc2a1dc419f91c17fc51cf3aa3d": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"76900b67fe0448bc96ab847f01494eb7": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_f8fd175a2e0c41a2a75b93b50d14eb5e", | |
"_dom_classes": [], | |
"description": "Downloading: 100%", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 231508, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 231508, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_7ed384326c894a859f4a29aef801baf5" | |
} | |
}, | |
"02226a523d904b00b438a51c2167280b": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_2a7cd115b9604a118fa61064ffca081f", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 232k/232k [00:00<00:00, 856kB/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_169db29a65624951ae84c016ca077689" | |
} | |
}, | |
"f8fd175a2e0c41a2a75b93b50d14eb5e": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "initial", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"7ed384326c894a859f4a29aef801baf5": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"2a7cd115b9604a118fa61064ffca081f": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"169db29a65624951ae84c016ca077689": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"3a85c7ca089048c481decb7f0fab19e8": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HBoxModel", | |
"state": { | |
"_view_name": "HBoxView", | |
"_dom_classes": [], | |
"_model_name": "HBoxModel", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"box_style": "", | |
"layout": "IPY_MODEL_e0892ebb37cf4972b5b304303bf3848b", | |
"_model_module": "@jupyter-widgets/controls", | |
"children": [ | |
"IPY_MODEL_47c174ee43094cea811a017228238278", | |
"IPY_MODEL_7d7fe817759a4034af8adafd25719010" | |
] | |
} | |
}, | |
"e0892ebb37cf4972b5b304303bf3848b": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"47c174ee43094cea811a017228238278": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "FloatProgressModel", | |
"state": { | |
"_view_name": "ProgressView", | |
"style": "IPY_MODEL_b1fd665f566f409a8c7e74e590034d7a", | |
"_dom_classes": [], | |
"description": "Downloading: 100%", | |
"_model_name": "FloatProgressModel", | |
"bar_style": "success", | |
"max": 1340675298, | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": 1340675298, | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"orientation": "horizontal", | |
"min": 0, | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_a94ce761446b47e5bd55ece5435efa18" | |
} | |
}, | |
"7d7fe817759a4034af8adafd25719010": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "HTMLModel", | |
"state": { | |
"_view_name": "HTMLView", | |
"style": "IPY_MODEL_01c732c5b9fb41d7b78a1b457711cd2d", | |
"_dom_classes": [], | |
"description": "", | |
"_model_name": "HTMLModel", | |
"placeholder": "", | |
"_view_module": "@jupyter-widgets/controls", | |
"_model_module_version": "1.5.0", | |
"value": " 1.34G/1.34G [00:28<00:00, 47.0MB/s]", | |
"_view_count": null, | |
"_view_module_version": "1.5.0", | |
"description_tooltip": null, | |
"_model_module": "@jupyter-widgets/controls", | |
"layout": "IPY_MODEL_75f56137297445939d8fddfcf2e2209a" | |
} | |
}, | |
"b1fd665f566f409a8c7e74e590034d7a": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "ProgressStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "ProgressStyleModel", | |
"description_width": "initial", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"bar_color": null, | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"a94ce761446b47e5bd55ece5435efa18": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
}, | |
"01c732c5b9fb41d7b78a1b457711cd2d": { | |
"model_module": "@jupyter-widgets/controls", | |
"model_name": "DescriptionStyleModel", | |
"state": { | |
"_view_name": "StyleView", | |
"_model_name": "DescriptionStyleModel", | |
"description_width": "", | |
"_view_module": "@jupyter-widgets/base", | |
"_model_module_version": "1.5.0", | |
"_view_count": null, | |
"_view_module_version": "1.2.0", | |
"_model_module": "@jupyter-widgets/controls" | |
} | |
}, | |
"75f56137297445939d8fddfcf2e2209a": { | |
"model_module": "@jupyter-widgets/base", | |
"model_name": "LayoutModel", | |
"state": { | |
"_view_name": "LayoutView", | |
"grid_template_rows": null, | |
"right": null, | |
"justify_content": null, | |
"_view_module": "@jupyter-widgets/base", | |
"overflow": null, | |
"_model_module_version": "1.2.0", | |
"_view_count": null, | |
"flex_flow": null, | |
"width": null, | |
"min_width": null, | |
"border": null, | |
"align_items": null, | |
"bottom": null, | |
"_model_module": "@jupyter-widgets/base", | |
"top": null, | |
"grid_column": null, | |
"overflow_y": null, | |
"overflow_x": null, | |
"grid_auto_flow": null, | |
"grid_area": null, | |
"grid_template_columns": null, | |
"flex": null, | |
"_model_name": "LayoutModel", | |
"justify_items": null, | |
"grid_row": null, | |
"max_height": null, | |
"align_content": null, | |
"visibility": null, | |
"align_self": null, | |
"height": null, | |
"min_height": null, | |
"padding": null, | |
"grid_auto_rows": null, | |
"grid_gap": null, | |
"max_width": null, | |
"order": null, | |
"_view_module_version": "1.2.0", | |
"grid_template_areas": null, | |
"object_position": null, | |
"object_fit": null, | |
"grid_auto_columns": null, | |
"margin": null, | |
"display": null, | |
"left": null | |
} | |
} | |
} | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/cstorm125/1db00f0540d206df687bdbc2f19c9c66/fake_berthai.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "BS1LFhUDn5Q8" | |
}, | |
"source": [ | |
"# Fake BERT in Thai with Machine Translation" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "iWUHdwhGHA6d" | |
}, | |
"source": [ | |
"RNN-based models have performed [reasonably well (sometimes better than transformers)](https://github.com/PyThaiNLP/classification-benchmarks) on simpler tasks such as sequence classification (sentiment analysis, review classification, intent classification). However, because there is no large pretrained BERT for Thai, more complex tasks such as question answering, summarization, and language generation (with \"smoother\" models like GPT-2) have not been tried in Thai. \n", | |
"\n", | |
"With the advent of the `scb-mt-en-th-2020` machine translation models based on `transformers-base` ([current state-of-the-art on IWSLT15](https://arxiv.org/abs/2007.03541)), we test whether we can replicate the performance of pretrained English models in Thai by translating Thai inputs into English before the models and translating the English outputs back into Thai after them." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "hox86w2UoBXF" | |
}, | |
"source": [ | |
"## Setup" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "qSIF-2WYYqMl" | |
}, | |
"source": [ | |
"Run this to set things up." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GKzs7fTqnzzH", | |
"outputId": "61124fe9-a745-4947-d8a6-412ab4969826", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 692 | |
} | |
}, | |
"source": [ | |
"#install libraries\n", | |
"!pip install -q torch pythainlp==2.1.4 sentencepiece transformers\n", | |
"!pip install git+https://github.com/pytorch/fairseq@6f6461b;" | |
], | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\u001b[K |████████████████████████████████| 11.1MB 223kB/s \n", | |
"\u001b[K |████████████████████████████████| 1.1MB 58.5MB/s \n", | |
"\u001b[K |████████████████████████████████| 1.1MB 58.6MB/s \n", | |
"\u001b[K |████████████████████████████████| 1.4MB 57.8MB/s \n", | |
"\u001b[K |████████████████████████████████| 3.0MB 47.6MB/s \n", | |
"\u001b[K |████████████████████████████████| 890kB 56.5MB/s \n", | |
"\u001b[?25h Building wheel for nltk (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
"Collecting git+https://github.com/pytorch/fairseq@6f6461b\n", | |
" Cloning https://github.com/pytorch/fairseq (to revision 6f6461b) to /tmp/pip-req-build-4nt9sfda\n", | |
" Running command git clone -q https://github.com/pytorch/fairseq /tmp/pip-req-build-4nt9sfda\n", | |
"\u001b[33m WARNING: Did not find branch or tag '6f6461b', assuming revision or ref.\u001b[0m\n", | |
" Running command git checkout -q 6f6461b\n", | |
" Running command git submodule update --init --recursive -q\n", | |
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", | |
"Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (1.6.0+cu101)\n", | |
"Collecting sacrebleu\n", | |
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/a3/c4/8e948f601a4f9609e8b2b58f31966cb13cf17b940b82aa3e767f01c42c52/sacrebleu-1.4.14-py3-none-any.whl (64kB)\n", | |
"\u001b[K |████████████████████████████████| 71kB 3.7MB/s \n", | |
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (1.18.5)\n", | |
"Requirement already satisfied: regex in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (2019.12.20)\n", | |
"Requirement already satisfied: cython in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (0.29.21)\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (4.41.1)\n", | |
"Requirement already satisfied: cffi in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (1.14.3)\n", | |
"Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->fairseq==0.9.0) (0.16.0)\n", | |
"Collecting portalocker\n", | |
" Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", | |
"Requirement already satisfied: pycparser in /usr/local/lib/python3.6/dist-packages (from cffi->fairseq==0.9.0) (2.20)\n", | |
"Building wheels for collected packages: fairseq\n", | |
" Building wheel for fairseq (PEP 517) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for fairseq: filename=fairseq-0.9.0-cp36-cp36m-linux_x86_64.whl size=2203320 sha256=75a902ab11ef6c8476b66254f2ea8d98b038450e7181c5e561a4cf5c793a74b8\n", | |
" Stored in directory: /tmp/pip-ephem-wheel-cache-3mdfo7xn/wheels/fe/76/a0/094fc6e2fbd71b397f081787ec56944f4e7abf58436110f5fa\n", | |
"Successfully built fairseq\n", | |
"Installing collected packages: portalocker, sacrebleu, fairseq\n", | |
"Successfully installed fairseq-0.9.0 portalocker-2.0.0 sacrebleu-1.4.14\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "PNXRDg9pndz7", | |
"outputId": "ded7667a-f86e-4f6f-f369-91acfa55d158", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 997 | |
} | |
}, | |
"source": [ | |
"#download weights\n", | |
"!wget https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz\n", | |
"!wget https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz\n", | |
"!mkdir -p ./mt\n", | |
"!tar -C ./mt -xvzf SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz\n", | |
"!tar -C ./mt -xvzf SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"--2020-10-14 15:17:11-- https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz\n", | |
"Resolving github.com (github.com)... 192.30.255.112\n", | |
"Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n", | |
"HTTP request sent, awaiting response... 302 Found\n", | |
"Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/14416180-b4b9-11ea-81ab-f85e212bf35b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151711Z&X-Amz-Expires=300&X-Amz-Signature=3be50f3f0cef6aa4893833a9f5904566e5f35d5a9fce09a216ff13e6bc1b6bff&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream [following]\n", | |
"--2020-10-14 15:17:11-- https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/14416180-b4b9-11ea-81ab-f85e212bf35b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151711Z&X-Amz-Expires=300&X-Amz-Signature=3be50f3f0cef6aa4893833a9f5904566e5f35d5a9fce09a216ff13e6bc1b6bff&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream\n", | |
"Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.97.219\n", | |
"Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.97.219|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 699200099 (667M) [application/octet-stream]\n", | |
"Saving to: ‘SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz’\n", | |
"\n", | |
"SCB_1M+TBASE_th-en_ 100%[===================>] 666.81M 38.5MB/s in 15s \n", | |
"\n", | |
"2020-10-14 15:17:27 (43.6 MB/s) - ‘SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz’ saved [699200099/699200099]\n", | |
"\n", | |
"--2020-10-14 15:17:27-- https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz\n", | |
"Resolving github.com (github.com)... 192.30.255.112\n", | |
"Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n", | |
"HTTP request sent, awaiting response... 302 Found\n", | |
"Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/c88cb900-b4b4-11ea-9b7f-294881c2ca03?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151727Z&X-Amz-Expires=300&X-Amz-Signature=5c3e61f98f6a21f7c4ab8136a3f16e300ae2c55e6cf6fa31edb5036a72e2fff6&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream [following]\n", | |
"--2020-10-14 15:17:27-- https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/c88cb900-b4b4-11ea-9b7f-294881c2ca03?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151727Z&X-Amz-Expires=300&X-Amz-Signature=5c3e61f98f6a21f7c4ab8136a3f16e300ae2c55e6cf6fa31edb5036a72e2fff6&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream\n", | |
"Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.109.115\n", | |
"Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.109.115|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 699080820 (667M) [application/octet-stream]\n", | |
"Saving to: ‘SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz’\n", | |
"\n", | |
"SCB_1M+TBASE_en-th_ 100%[===================>] 666.70M 47.0MB/s in 15s \n", | |
"\n", | |
"2020-10-14 15:17:42 (45.1 MB/s) - ‘SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz’ saved [699080820/699080820]\n", | |
"\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/models/\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/dict.th.txt\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/dict.en.txt\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/models/checkpoint.pt\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.vocab\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.en.model\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model\n", | |
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.en.vocab\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/models/\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/dict.th.txt\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/dict.en.txt\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/models/checkpoint.pt\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.th.vocab\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.en.model\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.th.model\n", | |
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.en.vocab\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RuaOjlUbnibp" | |
}, | |
"source": [ | |
"from __future__ import print_function\n", | |
"from ipywidgets import interact, interactive, fixed, interact_manual\n", | |
"import ipywidgets as widgets\n", | |
"import textwrap\n", | |
"\n", | |
"import torch\n", | |
"from fairseq.models.transformer import TransformerModel" | |
], | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "psuL88nuoanq" | |
}, | |
"source": [ | |
"## `scb-mt-en-th-2020` Machine Translation Models" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "RpKwC5JUYtGi" | |
}, | |
"source": [ | |
"Try out the machine translation models. These are not the best versions, but they will do reasonably well." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "rYdLExGdmjOn" | |
}, | |
"source": [ | |
"en2th_spm = TransformerModel.from_pretrained(\n", | |
" model_name_or_path='mt/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/models/',\n", | |
" checkpoint_file='checkpoint.pt',\n", | |
" data_name_or_path='mt/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/',\n", | |
" bpe='sentencepiece',\n", | |
" sentencepiece_vocab='mt/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.en.model'\n", | |
")\n", | |
"\n", | |
"th2en_spm = TransformerModel.from_pretrained(\n", | |
" model_name_or_path='mt/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/models/',\n", | |
" checkpoint_file='checkpoint.pt',\n", | |
" data_name_or_path='mt/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/',\n", | |
" bpe='sentencepiece',\n", | |
" sentencepiece_vocab='mt/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model'\n", | |
" )" | |
], | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ICPdoTM6zcvl", | |
"cellView": "form", | |
"outputId": "d7d327d7-3e16-47df-d5ae-e3c85d5f79f0", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"#@title English to Thai Translation\n", | |
"input_sentence = 'Covid-19 has infected more than 3 million people and killed at least 210,000 worldwide, according to Johns Hopkins University.' #@param {type:\"string\"}\n", | |
"\n", | |
"hypothesis = en2th_spm.translate(input_sentence)\n", | |
"\n", | |
"print('input_sentence:', input_sentence)\n", | |
"print('translation :', hypothesis)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"input_sentence: Covid-19 has infected more than 3 million people and killed at least 210,000 worldwide, according to Johns Hopkins University.\n", | |
"translation : โควิด-19 ได้ติดเชื้อมากกว่า 3 ล้านคนและสังหารอย่างน้อย 210,000 คนทั่วโลก จากการรายงานของมหาวิทยาลัยจอห์นส์ฮอปกินส์\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zAHi6NcQnT9h", | |
"cellView": "form", | |
"outputId": "f3a2b2e2-1f15-42f9-8d40-900882c1f3d1", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
} | |
}, | |
"source": [ | |
"#@title Thai to English Translation\n", | |
"input_sentence = 'ราคาเอทิลีนและโพรพิลีนในตลาดเอเชียไตรมาส 1 ปี 2560 เฉลี่ยอยู่ที่ 38.28 และ 32.81 บาท/กิโลกรัม ตามลำดับ' #@param {type:\"string\"}\n", | |
"\n", | |
"hypothesis = th2en_spm.translate(input_sentence)\n", | |
"\n", | |
"print('input_sentence:', input_sentence)\n", | |
"print('translation :', hypothesis)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"input_sentence: ราคาเอทิลีนและโพรพิลีนในตลาดเอเชียไตรมาส 1 ปี 2560 เฉลี่ยอยู่ที่ 38.28 และ 32.81 บาท/กิโลกรัม ตามลำดับ\n", | |
"translation : Ethylene and polypropylene prices in the first quarter of 2017 averaged 38.28 and 32.81 baht/kg, respectively.\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "HhpjXQIqppDQ" | |
}, | |
"source": [ | |
"## BERT Whole-Word Masking Question Answering" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "PxQVJqzSYz1L" | |
}, | |
"source": [ | |
"We first try question answering using BERT trained with whole-word masking. Put the context in `text_th` and then enter your questions in the cell below." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RyoNKYK34JF6", | |
"outputId": "0ec477e3-e02e-4822-f842-123210b91e3d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 166, | |
"referenced_widgets": [ | |
"5763d9c666104798b9142f7930feb082", | |
"bb5d731d4dd7459cb4cff26826981941", | |
"361718c1529e4752ba39be680d113ea5", | |
"887b453cff9047a7a3a3bd2ab770b706", | |
"5665589f88e34f84a77a13dffe4b1c6d", | |
"c17c3e07f0224529a97fef31bbdc11ad", | |
"b693309e4f7b43f18ab854f862cccdb4", | |
"9c46eaeee1fa4f1fb5dde626cdd04aa5", | |
"f9edfdc8fb514f23ba55610df90314d2", | |
"b8d20cc2a1dc419f91c17fc51cf3aa3d", | |
"76900b67fe0448bc96ab847f01494eb7", | |
"02226a523d904b00b438a51c2167280b", | |
"f8fd175a2e0c41a2a75b93b50d14eb5e", | |
"7ed384326c894a859f4a29aef801baf5", | |
"2a7cd115b9604a118fa61064ffca081f", | |
"169db29a65624951ae84c016ca077689", | |
"3a85c7ca089048c481decb7f0fab19e8", | |
"e0892ebb37cf4972b5b304303bf3848b", | |
"47c174ee43094cea811a017228238278", | |
"7d7fe817759a4034af8adafd25719010", | |
"b1fd665f566f409a8c7e74e590034d7a", | |
"a94ce761446b47e5bd55ece5435efa18", | |
"01c732c5b9fb41d7b78a1b457711cd2d", | |
"75f56137297445939d8fddfcf2e2209a" | |
] | |
} | |
}, | |
"source": [ | |
" from transformers import AutoTokenizer, AutoModelForQuestionAnswering\n", | |
"\n", | |
"tokenizer = AutoTokenizer.from_pretrained(\"bert-large-uncased-whole-word-masking-finetuned-squad\")\n", | |
"model = AutoModelForQuestionAnswering.from_pretrained(\"bert-large-uncased-whole-word-masking-finetuned-squad\")" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "5763d9c666104798b9142f7930feb082", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "f9edfdc8fb514f23ba55610df90314d2", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "3a85c7ca089048c481decb7f0fab19e8", | |
"version_minor": 0, | |
"version_major": 2 | |
}, | |
"text/plain": [ | |
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bl-LuZdRFr5s" | |
}, | |
"source": [ | |
"#context article\n", | |
"text_th = r\"\"\" \n", | |
"สโมสรฟุตบอลแมนเชสเตอร์ยูไนเต็ดเป็นสโมสรฟุตบอลตั้งอยู่ที่โอลด์แทรฟฟอร์ดในเกรเทอร์แมนเชสเตอร์ ประเทศอังกฤษ \n", | |
"ปัจจุบันแข่งขันในพรีเมียร์ลีกซึ่งเป็นลีกสูงสุดของฟุตบอลอังกฤษ สโมสรมีฉายา \"ปีศาจแดง\" ก่อตั้งในชื่อสโมสรฟุตบอลนิวตันฮีตแอลวายอาร์ใน ค.ศ. 1878 \n", | |
"ต่อมาเปลี่ยนชื่อเป็นแมนเชสเตอร์ยูไนเต็ดใน ค.ศ. 1902 และย้ายไปเล่นที่สนามเหย้าปัจจุบันอย่างโอลด์แทรฟฟอร์ดใน ค.ศ. 1910\n", | |
"\"\"\" \n", | |
"text_en = th2en_spm.translate(text_th)" | |
], | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "dI94wLAFnX65", | |
"cellView": "form", | |
"outputId": "a718ccfc-2b0d-4742-b1f0-9704feaf52b7", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 124 | |
} | |
}, | |
"source": [ | |
"#@title Question Answering\n", | |
"question_th_1 = \"\\u0E41\\u0E21\\u0E19\\u0E40\\u0E0A\\u0E2A\\u0E40\\u0E15\\u0E2D\\u0E23\\u0E4C\\u0E22\\u0E39\\u0E44\\u0E19\\u0E40\\u0E15\\u0E47\\u0E14\\u0E04\\u0E37\\u0E2D\\u0E2D\\u0E30\\u0E44\\u0E23\" #@param {type:\"string\"}\n", | |
"question_th_2 = \"\\u0E2A\\u0E19\\u0E32\\u0E21\\u0E40\\u0E2B\\u0E22\\u0E49\\u0E32\\u0E02\\u0E2D\\u0E07\\u0E41\\u0E21\\u0E19\\u0E22\\u0E39\\u0E2D\\u0E22\\u0E39\\u0E48\\u0E17\\u0E35\\u0E48\\u0E44\\u0E2B\\u0E19\" #@param {type:\"string\"}\n", | |
"question_th_3 = \"\\u0E01\\u0E48\\u0E2D\\u0E19\\u0E1B\\u0E35 1902 \\u0E41\\u0E21\\u0E19\\u0E40\\u0E0A\\u0E2A\\u0E40\\u0E15\\u0E2D\\u0E23\\u0E4C\\u0E22\\u0E39\\u0E44\\u0E19\\u0E40\\u0E15\\u0E47\\u0E14\\u0E0A\\u0E37\\u0E48\\u0E2D\\u0E2D\\u0E30\\u0E44\\u0E23\" #@param {type:\"string\"}\n", | |
"questions_th =[question_th_1,question_th_2,question_th_3]\n", | |
"questions_en = [th2en_spm.translate(i) for i in questions_th]\n", | |
"\n", | |
"for i in range(len(questions_en)):\n", | |
" question_en = questions_en[i]\n", | |
" question_th = questions_th[i]\n", | |
" inputs = tokenizer(question_en, text_en, add_special_tokens=True, return_tensors=\"pt\")\n", | |
" input_ids = inputs[\"input_ids\"].tolist()[0]\n", | |
" text_tokens = tokenizer.convert_ids_to_tokens(input_ids)\n", | |
" answer_start_scores, answer_end_scores = model(**inputs)\n", | |
" answer_start = torch.argmax(answer_start_scores) # Get the most likely beginning of answer with the argmax of the score\n", | |
" answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score\n", | |
" answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))\n", | |
" # answer = f'{answer}.' if answer[-1]!='.' else answer\n", | |
" print(f\"Question: {question_th}\")\n", | |
" print(f\"Answer: {en2th_spm.translate(answer)} (from {answer_start} to {answer_end})\")" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Question: แมนเชสเตอร์ยูไนเต็ดคืออะไร\n", | |
"Answer: สโมสรฟุตบอล (from 10 to 13)\n", | |
"Question: สนามเหย้าของแมนยูอยู่ที่ไหน\n", | |
"Answer: โอลด์ แทรฟฟอร์ดค่ะ (from 20 to 22)\n", | |
"Question: ก่อนปี 1902 แมนเชสเตอร์ยูไนเต็ดชื่ออะไร\n", | |
"Answer: เรดเดวิลค่ะ (from 47 to 50)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "D2ExOLGwtVRq" | |
}, | |
"source": [ | |
"## T5 Summarization" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "SGjITDJVZA9q" | |
}, | |
"source": [ | |
"You can also try summarization by entering the text to summarize in `article_th`. Tune the maximum length of the summary in the cell below." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "rvB4p84SHSoR" | |
}, | |
"source": [ | |
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", | |
"\n", | |
"model = AutoModelForSeq2SeqLM.from_pretrained(\"t5-base\")\n", | |
"tokenizer = AutoTokenizer.from_pretrained(\"t5-base\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BnZGJ6X4HekL" | |
}, | |
"source": [ | |
"#text to summarize\n", | |
"article_th = '''บริษัท สรรพสินค้าเซ็นทรัล จำกัด หรือ ห้างสรรพสินค้าเซ็นทรัล เป็นห้างสรรพสินค้าเก่าแก่แห่งหนึ่งของประเทศไทย \n", | |
"ก่อตั้งโดย เตียง จิราธิวัฒน์ เริ่มดำเนินการเมื่อปี พ.ศ. 2490 ด้วยการเป็นร้านขายของชำเล็กๆ \n", | |
"ก่อนที่สัมฤทธิ์ บุตรชาย จะเริ่มดำเนินการห้างสรรพสินค้าเซ็นทรัล สาขาแรกเมื่อ พ.ศ. 2499 ที่ย่านวังบูรพา \n", | |
"และกลายมาเป็นห้างสรรพสินค้าชั้นนำของไทยในปัจจุบัน\n", | |
"'''\n", | |
"article_en = th2en_spm.translate(article_th)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ZSS4mHKXtUTH", | |
"cellView": "form", | |
"outputId": "f1a56b4e-1198-4ce5-8de5-61e455920012", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 124 | |
} | |
}, | |
"source": [ | |
"max_length = 200 #@param {type:\"slider\", min:150, max:300, step:50}\n", | |
"\n", | |
"# T5 uses a max_length of 512 so we cut the article to 512 tokens.\n", | |
"inputs = tokenizer.encode(\"summarize: \" + article_en, return_tensors=\"pt\", \n", | |
" max_length=512,truncation=True)\n", | |
"outputs = model.generate(inputs, max_length=max_length, min_length=50, \n", | |
" length_penalty=2.0, num_beams=4, early_stopping=True)\n", | |
"summary_en = [tokenizer.decode(i) for i in outputs][0]\n", | |
"summary_th = en2th_spm.translate(summary_en)\n", | |
"\n", | |
"print('Summary:')\n", | |
"textwrap.wrap(summary_th)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Summary:\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['เซ็นทรัล โค้ท (Central Co. หรือ Central)',\n", | |
" 'เป็นหนึ่งในห้างสรรพสินค้าที่เก่าแก่ที่สุดของประเทศไทย',\n", | |
" 'ก่อตั้งโดยเตียงจิราตีวัตต์ (Jirathiwatt) ซึ่งเริ่มมีร้านค้าขนาดเล็ก',\n", | |
" 'ในปี พ.ศ. 2490 ลูกชายเริ่มดําเนินการขายห้างสรรพสินค้าแห่งแรกใน พ.ศ.',\n", | |
" '2499']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 82 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "HP6glRK3JrmU" | |
}, | |
"source": [ | |
"## GPT-2 Text Generation" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "p8UgFYX6ZKCE" | |
}, | |
"source": [ | |
"Enter a prompt and generate texts using GPT-2." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "q6hLinXZSmVv" | |
}, | |
"source": [ | |
"from transformers import AutoModelForCausalLM, AutoTokenizer\n", | |
"\n", | |
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", | |
"model = AutoModelForCausalLM.from_pretrained(\"gpt2\", pad_token_id=tokenizer.eos_token_id)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "l5Pg9bf6V--u", | |
"cellView": "form", | |
"outputId": "43c0045e-d529-4576-91c7-961b621d4548", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 319 | |
} | |
}, | |
"source": [ | |
"#@title Generate Texts\n", | |
"prompt_th = \"กาลครั้งหนึ่งนานมาแล้ว \" #@param {type:\"string\"}\n", | |
"max_length = 300 #@param {type:\"slider\", min:250, max:500, step:50}\n", | |
"prompt_en = th2en_spm.translate(prompt_th)\n", | |
"inputs = tokenizer.encode(prompt_en, \n", | |
" add_special_tokens=False, \n", | |
" return_tensors=\"pt\")\n", | |
"prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, \n", | |
" clean_up_tokenization_spaces=True))\n", | |
"outputs = model.generate(inputs, max_length=max_length, do_sample=True, \n", | |
" top_p=0.95, top_k=60)\n", | |
"generated = prompt_th + en2th_spm.translate(tokenizer.decode(outputs[0])[prompt_length:])\n", | |
"textwrap.wrap(generated)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"['กาลครั้งหนึ่งนานมาแล้ว เหตุผลที่ฉันคิดว่ามันสําคัญมากสําหรับบล็อกนี้ที',\n", | |
" '่จะสะท้อนให้เห็นถึงข้อเท็จจริงที่เรียบง่ายของชีวิตและสถานการณ์ชีวิตแทน',\n", | |
" 'ที่จะเป็นความคิดที่เป็นนามธรรมว่าชีวิตเป็นอย่างไรตอนนี้เป็นเพราะพวกเขา',\n", | |
" 'เป็นเรื่องราวของชีวิตอย่างที่มันทํา, เวลาที่สมองของเราทํางานเพื่อให้รู',\n", | |
" '้สึกถึงทุกสิ่งที่เราประสบและเพื่อนําทางเราในทางที่มีความหมายบางอย่าง',\n", | |
" 'ชีวิตไม่เพียง แต่แตกต่างกันเท่านั้น, มันซับซ้อน,',\n", | |
" 'และบางครั้งคุณใช้ชีวิตและกิจกรรมของสมองในชีวิตซึ่งอารมณ์อื่น ๆ ทั้งหมด',\n", | |
" '- อารมณ์ของความเจ็บปวด, ความทุกข์ทรมาน, ความหงุดหงิด, ความเครียด -',\n", | |
" 'เล่นบทบาทขนาดใหญ่. บางครั้งเรามีประสบการณ์ชีวิตเล็กน้อยเมื่อเราอาศัยอย',\n", | |
" 'ู่ในช่วงเวลาที่ยากลําบากที่สุดเพราะเหตุการณ์ที่น่าเศร้าบางอย่าง',\n", | |
" '(เช่นอุบัติเหตุรถชนหรือความตายแม้ว่าเรามีประสบการณ์เหมือนกัน)',\n", | |
" 'หรือแม้ว่าเราจะไม่มีแผนที่ถูกต้องในการจัดการกับสถานการณ์เหล่านั้น. หรื',\n", | |
" 'อบางทีเราอาจมีความเจ็บป่วยทางจิตที่ทําให้เราตกใจอย่างถาวรหรือแม้กระทั่',\n", | |
" 'งความเศร้าโศกและมันเป็นเหตุผลที่เรามีความพิการทางจิตหรือหนึ่งในความผิด',\n", | |
" 'ปกติที่เราเคยอยู่ในอารมณ์ที่ดี.',\n", | |
" 'เรามักจะพบว่ามันยากที่จะมุ่งเน้นที่ทั้งหมด,',\n", | |
" 'เพื่อให้คิดต่อไปว่าทั้งหมดนี้จะพาเรา']" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 77 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "uohWti2YW3iT" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment