Skip to content

Instantly share code, notes, and snippets.

@kittinan
Forked from cstorm125/fake_berthai.ipynb
Created October 14, 2020 16:21
Show Gist options
  • Save kittinan/143f13c5c9108710c5c56d6b7981e2ac to your computer and use it in GitHub Desktop.
Save kittinan/143f13c5c9108710c5c56d6b7981e2ac to your computer and use it in GitHub Desktop.
fake_berthai.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "fake_berthai.ipynb",
"provenance": [],
"collapsed_sections": [
"psuL88nuoanq"
],
"authorship_tag": "ABX9TyPua2WUeSK7Orlb0u/RqH4I",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"5763d9c666104798b9142f7930feb082": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_bb5d731d4dd7459cb4cff26826981941",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_361718c1529e4752ba39be680d113ea5",
"IPY_MODEL_887b453cff9047a7a3a3bd2ab770b706"
]
}
},
"bb5d731d4dd7459cb4cff26826981941": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"361718c1529e4752ba39be680d113ea5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_5665589f88e34f84a77a13dffe4b1c6d",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 443,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 443,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c17c3e07f0224529a97fef31bbdc11ad"
}
},
"887b453cff9047a7a3a3bd2ab770b706": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b693309e4f7b43f18ab854f862cccdb4",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 443/443 [00:00<00:00, 756B/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_9c46eaeee1fa4f1fb5dde626cdd04aa5"
}
},
"5665589f88e34f84a77a13dffe4b1c6d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"c17c3e07f0224529a97fef31bbdc11ad": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b693309e4f7b43f18ab854f862cccdb4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"9c46eaeee1fa4f1fb5dde626cdd04aa5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f9edfdc8fb514f23ba55610df90314d2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_b8d20cc2a1dc419f91c17fc51cf3aa3d",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_76900b67fe0448bc96ab847f01494eb7",
"IPY_MODEL_02226a523d904b00b438a51c2167280b"
]
}
},
"b8d20cc2a1dc419f91c17fc51cf3aa3d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"76900b67fe0448bc96ab847f01494eb7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_f8fd175a2e0c41a2a75b93b50d14eb5e",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 231508,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 231508,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7ed384326c894a859f4a29aef801baf5"
}
},
"02226a523d904b00b438a51c2167280b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_2a7cd115b9604a118fa61064ffca081f",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 232k/232k [00:00<00:00, 856kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_169db29a65624951ae84c016ca077689"
}
},
"f8fd175a2e0c41a2a75b93b50d14eb5e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"7ed384326c894a859f4a29aef801baf5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"2a7cd115b9604a118fa61064ffca081f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"169db29a65624951ae84c016ca077689": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"3a85c7ca089048c481decb7f0fab19e8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_e0892ebb37cf4972b5b304303bf3848b",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_47c174ee43094cea811a017228238278",
"IPY_MODEL_7d7fe817759a4034af8adafd25719010"
]
}
},
"e0892ebb37cf4972b5b304303bf3848b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"47c174ee43094cea811a017228238278": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_b1fd665f566f409a8c7e74e590034d7a",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 1340675298,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 1340675298,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_a94ce761446b47e5bd55ece5435efa18"
}
},
"7d7fe817759a4034af8adafd25719010": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_01c732c5b9fb41d7b78a1b457711cd2d",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 1.34G/1.34G [00:28<00:00, 47.0MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_75f56137297445939d8fddfcf2e2209a"
}
},
"b1fd665f566f409a8c7e74e590034d7a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"a94ce761446b47e5bd55ece5435efa18": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"01c732c5b9fb41d7b78a1b457711cd2d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"75f56137297445939d8fddfcf2e2209a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/cstorm125/1db00f0540d206df687bdbc2f19c9c66/fake_berthai.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BS1LFhUDn5Q8"
},
"source": [
"# Fake BERT in Thai with Machine Translation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iWUHdwhGHA6d"
},
"source": [
"RNN-based models have performed [reasonably well (sometimes better than transformers)](https://github.com/PyThaiNLP/classification-benchmarks) for simpler tasks like sequence classification (sentiment analysis, review classification, intent classification). However, with the lack of a large pretrained BERT in Thai, more complex tasks like question answering, summarization and language generation (with \"smoother\" models like GPT-2) have not been tried in Thai. \n",
"\n",
"With the advent of the `scb-mt-en-th-2020` machine translation models based on `transformers-base` ([current state-of-the-art on IWSLT15](https://arxiv.org/abs/2007.03541)), we test whether we can replicate the performance of pretrained English models in Thai by translating Thai inputs into English before the model and translating its English outputs back into Thai afterwards."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hox86w2UoBXF"
},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qSIF-2WYYqMl"
},
"source": [
"Run this to set things up."
]
},
{
"cell_type": "code",
"metadata": {
"id": "GKzs7fTqnzzH",
"outputId": "61124fe9-a745-4947-d8a6-412ab4969826",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 692
}
},
"source": [
"#install libraries\n",
"!pip install -q torch pythainlp==2.1.4 sentencepiece transformers\n",
"!pip install git+https://github.com/pytorch/fairseq@6f6461b;"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[K |████████████████████████████████| 11.1MB 223kB/s \n",
"\u001b[K |████████████████████████████████| 1.1MB 58.5MB/s \n",
"\u001b[K |████████████████████████████████| 1.1MB 58.6MB/s \n",
"\u001b[K |████████████████████████████████| 1.4MB 57.8MB/s \n",
"\u001b[K |████████████████████████████████| 3.0MB 47.6MB/s \n",
"\u001b[K |████████████████████████████████| 890kB 56.5MB/s \n",
"\u001b[?25h Building wheel for nltk (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting git+https://github.com/pytorch/fairseq@6f6461b\n",
" Cloning https://github.com/pytorch/fairseq (to revision 6f6461b) to /tmp/pip-req-build-4nt9sfda\n",
" Running command git clone -q https://github.com/pytorch/fairseq /tmp/pip-req-build-4nt9sfda\n",
"\u001b[33m WARNING: Did not find branch or tag '6f6461b', assuming revision or ref.\u001b[0m\n",
" Running command git checkout -q 6f6461b\n",
" Running command git submodule update --init --recursive -q\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (1.6.0+cu101)\n",
"Collecting sacrebleu\n",
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/a3/c4/8e948f601a4f9609e8b2b58f31966cb13cf17b940b82aa3e767f01c42c52/sacrebleu-1.4.14-py3-none-any.whl (64kB)\n",
"\u001b[K |████████████████████████████████| 71kB 3.7MB/s \n",
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (1.18.5)\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (2019.12.20)\n",
"Requirement already satisfied: cython in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (0.29.21)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (4.41.1)\n",
"Requirement already satisfied: cffi in /usr/local/lib/python3.6/dist-packages (from fairseq==0.9.0) (1.14.3)\n",
"Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch->fairseq==0.9.0) (0.16.0)\n",
"Collecting portalocker\n",
" Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.6/dist-packages (from cffi->fairseq==0.9.0) (2.20)\n",
"Building wheels for collected packages: fairseq\n",
" Building wheel for fairseq (PEP 517) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for fairseq: filename=fairseq-0.9.0-cp36-cp36m-linux_x86_64.whl size=2203320 sha256=75a902ab11ef6c8476b66254f2ea8d98b038450e7181c5e561a4cf5c793a74b8\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-3mdfo7xn/wheels/fe/76/a0/094fc6e2fbd71b397f081787ec56944f4e7abf58436110f5fa\n",
"Successfully built fairseq\n",
"Installing collected packages: portalocker, sacrebleu, fairseq\n",
"Successfully installed fairseq-0.9.0 portalocker-2.0.0 sacrebleu-1.4.14\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "PNXRDg9pndz7",
"outputId": "ded7667a-f86e-4f6f-f369-91acfa55d158",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 997
}
},
"source": [
"#download weights\n",
"!wget https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz\n",
"!wget https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz\n",
"!mkdir -p ./mt\n",
"!tar -C ./mt -xvzf SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz\n",
"!tar -C ./mt -xvzf SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-10-14 15:17:11-- https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz\n",
"Resolving github.com (github.com)... 192.30.255.112\n",
"Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/14416180-b4b9-11ea-81ab-f85e212bf35b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151711Z&X-Amz-Expires=300&X-Amz-Signature=3be50f3f0cef6aa4893833a9f5904566e5f35d5a9fce09a216ff13e6bc1b6bff&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream [following]\n",
"--2020-10-14 15:17:11-- https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/14416180-b4b9-11ea-81ab-f85e212bf35b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151711Z&X-Amz-Expires=300&X-Amz-Signature=3be50f3f0cef6aa4893833a9f5904566e5f35d5a9fce09a216ff13e6bc1b6bff&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream\n",
"Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.97.219\n",
"Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.97.219|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 699200099 (667M) [application/octet-stream]\n",
"Saving to: ‘SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz’\n",
"\n",
"SCB_1M+TBASE_th-en_ 100%[===================>] 666.81M 38.5MB/s in 15s \n",
"\n",
"2020-10-14 15:17:27 (43.6 MB/s) - ‘SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0.tar.gz’ saved [699200099/699200099]\n",
"\n",
"--2020-10-14 15:17:27-- https://github.com/vistec-AI/model-releases/releases/download/SCB_1M%2BTBASE_v1.0/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz\n",
"Resolving github.com (github.com)... 192.30.255.112\n",
"Connecting to github.com (github.com)|192.30.255.112|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/c88cb900-b4b4-11ea-9b7f-294881c2ca03?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151727Z&X-Amz-Expires=300&X-Amz-Signature=5c3e61f98f6a21f7c4ab8136a3f16e300ae2c55e6cf6fa31edb5036a72e2fff6&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream [following]\n",
"--2020-10-14 15:17:27-- https://github-production-release-asset-2e65be.s3.amazonaws.com/272403533/c88cb900-b4b4-11ea-9b7f-294881c2ca03?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20201014%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201014T151727Z&X-Amz-Expires=300&X-Amz-Signature=5c3e61f98f6a21f7c4ab8136a3f16e300ae2c55e6cf6fa31edb5036a72e2fff6&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=272403533&response-content-disposition=attachment%3B%20filename%3DSCB_1M%2BTBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz&response-content-type=application%2Foctet-stream\n",
"Resolving github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)... 52.216.109.115\n",
"Connecting to github-production-release-asset-2e65be.s3.amazonaws.com (github-production-release-asset-2e65be.s3.amazonaws.com)|52.216.109.115|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 699080820 (667M) [application/octet-stream]\n",
"Saving to: ‘SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz’\n",
"\n",
"SCB_1M+TBASE_en-th_ 100%[===================>] 666.70M 47.0MB/s in 15s \n",
"\n",
"2020-10-14 15:17:42 (45.1 MB/s) - ‘SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0.tar.gz’ saved [699080820/699080820]\n",
"\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/models/\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/dict.th.txt\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/dict.en.txt\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/models/checkpoint.pt\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.vocab\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.en.model\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model\n",
"SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.en.vocab\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/models/\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/dict.th.txt\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/dict.en.txt\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/models/checkpoint.pt\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.th.vocab\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.en.model\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.th.model\n",
"SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.en.vocab\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RuaOjlUbnibp"
},
"source": [
"from __future__ import print_function\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"import textwrap\n",
"\n",
"import torch\n",
"from fairseq.models.transformer import TransformerModel"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "psuL88nuoanq"
},
"source": [
"## `scb-mt-en-th-2020` Machine Translation Models"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RpKwC5JUYtGi"
},
"source": [
"Try out the machine translation models. These are not the best versions, but they do reasonably well."
]
},
{
"cell_type": "code",
"metadata": {
"id": "rYdLExGdmjOn"
},
"source": [
"en2th_spm = TransformerModel.from_pretrained(\n",
" model_name_or_path='mt/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/models/',\n",
" checkpoint_file='checkpoint.pt',\n",
" data_name_or_path='mt/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/vocab/',\n",
" bpe='sentencepiece',\n",
" sentencepiece_vocab='mt/SCB_1M+TBASE_en-th_spm-spm_32000-joined_v1.0/bpe/spm.en.model'\n",
")\n",
"\n",
"th2en_spm = TransformerModel.from_pretrained(\n",
" model_name_or_path='mt/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/models/',\n",
" checkpoint_file='checkpoint.pt',\n",
" data_name_or_path='mt/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/vocab/',\n",
" bpe='sentencepiece',\n",
" sentencepiece_vocab='mt/SCB_1M+TBASE_th-en_spm-spm_32000-joined_v1.0/bpe/spm.th.model'\n",
" )"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ICPdoTM6zcvl",
"cellView": "form",
"outputId": "d7d327d7-3e16-47df-d5ae-e3c85d5f79f0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
}
},
"source": [
"#@title English to Thai Translation\n",
"input_sentence = 'Covid-19 has infected more than 3 million people and killed at least 210,000 worldwide, according to Johns Hopkins University.' #@param {type:\"string\"}\n",
"\n",
"hypothesis = en2th_spm.translate(input_sentence)\n",
"\n",
"print('input_sentence:', input_sentence)\n",
"print('translation :', hypothesis)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"input_sentence: Covid-19 has infected more than 3 million people and killed at least 210,000 worldwide, according to Johns Hopkins University.\n",
"translation : โควิด-19 ได้ติดเชื้อมากกว่า 3 ล้านคนและสังหารอย่างน้อย 210,000 คนทั่วโลก จากการรายงานของมหาวิทยาลัยจอห์นส์ฮอปกินส์\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "zAHi6NcQnT9h",
"cellView": "form",
"outputId": "f3a2b2e2-1f15-42f9-8d40-900882c1f3d1",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
}
},
"source": [
"#@title Thai to English Translation\n",
"input_sentence = 'ราคาเอทิลีนและโพรพิลีนในตลาดเอเชียไตรมาส 1 ปี 2560 เฉลี่ยอยู่ที่ 38.28 และ 32.81 บาท/กิโลกรัม ตามลำดับ' #@param {type:\"string\"}\n",
"\n",
"hypothesis = th2en_spm.translate(input_sentence)\n",
"\n",
"print('input_sentence:', input_sentence)\n",
"print('translation :', hypothesis)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"input_sentence: ราคาเอทิลีนและโพรพิลีนในตลาดเอเชียไตรมาส 1 ปี 2560 เฉลี่ยอยู่ที่ 38.28 และ 32.81 บาท/กิโลกรัม ตามลำดับ\n",
"translation : Ethylene and polypropylene prices in the first quarter of 2017 averaged 38.28 and 32.81 baht/kg, respectively.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HhpjXQIqppDQ"
},
"source": [
"## BERT Whole-Word Masking Question Answering"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "PxQVJqzSYz1L"
},
"source": [
"We first try question answering using BERT trained with whole-word masking. Put the context in `text_th` and then enter your questions in the cell below."
]
},
{
"cell_type": "code",
"metadata": {
"id": "RyoNKYK34JF6",
"outputId": "0ec477e3-e02e-4822-f842-123210b91e3d",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 166,
"referenced_widgets": [
"5763d9c666104798b9142f7930feb082",
"bb5d731d4dd7459cb4cff26826981941",
"361718c1529e4752ba39be680d113ea5",
"887b453cff9047a7a3a3bd2ab770b706",
"5665589f88e34f84a77a13dffe4b1c6d",
"c17c3e07f0224529a97fef31bbdc11ad",
"b693309e4f7b43f18ab854f862cccdb4",
"9c46eaeee1fa4f1fb5dde626cdd04aa5",
"f9edfdc8fb514f23ba55610df90314d2",
"b8d20cc2a1dc419f91c17fc51cf3aa3d",
"76900b67fe0448bc96ab847f01494eb7",
"02226a523d904b00b438a51c2167280b",
"f8fd175a2e0c41a2a75b93b50d14eb5e",
"7ed384326c894a859f4a29aef801baf5",
"2a7cd115b9604a118fa61064ffca081f",
"169db29a65624951ae84c016ca077689",
"3a85c7ca089048c481decb7f0fab19e8",
"e0892ebb37cf4972b5b304303bf3848b",
"47c174ee43094cea811a017228238278",
"7d7fe817759a4034af8adafd25719010",
"b1fd665f566f409a8c7e74e590034d7a",
"a94ce761446b47e5bd55ece5435efa18",
"01c732c5b9fb41d7b78a1b457711cd2d",
"75f56137297445939d8fddfcf2e2209a"
]
}
},
"source": [
"from transformers import AutoTokenizer, AutoModelForQuestionAnswering\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"bert-large-uncased-whole-word-masking-finetuned-squad\")\n",
"model = AutoModelForQuestionAnswering.from_pretrained(\"bert-large-uncased-whole-word-masking-finetuned-squad\")"
],
"execution_count": 5,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5763d9c666104798b9142f7930feb082",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f9edfdc8fb514f23ba55610df90314d2",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3a85c7ca089048c481decb7f0fab19e8",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "bl-LuZdRFr5s"
},
"source": [
"#context article\n",
"text_th = r\"\"\" \n",
"สโมสรฟุตบอลแมนเชสเตอร์ยูไนเต็ดเป็นสโมสรฟุตบอลตั้งอยู่ที่โอลด์แทรฟฟอร์ดในเกรเทอร์แมนเชสเตอร์ ประเทศอังกฤษ \n",
"ปัจจุบันแข่งขันในพรีเมียร์ลีกซึ่งเป็นลีกสูงสุดของฟุตบอลอังกฤษ สโมสรมีฉายา \"ปีศาจแดง\" ก่อตั้งในชื่อสโมสรฟุตบอลนิวตันฮีตแอลวายอาร์ใน ค.ศ. 1878 \n",
"ต่อมาเปลี่ยนชื่อเป็นแมนเชสเตอร์ยูไนเต็ดใน ค.ศ. 1902 และย้ายไปเล่นที่สนามเหย้าปัจจุบันอย่างโอลด์แทรฟฟอร์ดใน ค.ศ. 1910\n",
"\"\"\" \n",
"text_en = th2en_spm.translate(text_th)"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dI94wLAFnX65",
"cellView": "form",
"outputId": "a718ccfc-2b0d-4742-b1f0-9704feaf52b7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 124
}
},
"source": [
"#@title Question Answering\n",
"question_th_1 = \"\\u0E41\\u0E21\\u0E19\\u0E40\\u0E0A\\u0E2A\\u0E40\\u0E15\\u0E2D\\u0E23\\u0E4C\\u0E22\\u0E39\\u0E44\\u0E19\\u0E40\\u0E15\\u0E47\\u0E14\\u0E04\\u0E37\\u0E2D\\u0E2D\\u0E30\\u0E44\\u0E23\" #@param {type:\"string\"}\n",
"question_th_2 = \"\\u0E2A\\u0E19\\u0E32\\u0E21\\u0E40\\u0E2B\\u0E22\\u0E49\\u0E32\\u0E02\\u0E2D\\u0E07\\u0E41\\u0E21\\u0E19\\u0E22\\u0E39\\u0E2D\\u0E22\\u0E39\\u0E48\\u0E17\\u0E35\\u0E48\\u0E44\\u0E2B\\u0E19\" #@param {type:\"string\"}\n",
"question_th_3 = \"\\u0E01\\u0E48\\u0E2D\\u0E19\\u0E1B\\u0E35 1902 \\u0E41\\u0E21\\u0E19\\u0E40\\u0E0A\\u0E2A\\u0E40\\u0E15\\u0E2D\\u0E23\\u0E4C\\u0E22\\u0E39\\u0E44\\u0E19\\u0E40\\u0E15\\u0E47\\u0E14\\u0E0A\\u0E37\\u0E48\\u0E2D\\u0E2D\\u0E30\\u0E44\\u0E23\" #@param {type:\"string\"}\n",
"questions_th =[question_th_1,question_th_2,question_th_3]\n",
"questions_en = [th2en_spm.translate(i) for i in questions_th]\n",
"\n",
"for i in range(len(questions_en)):\n",
" question_en = questions_en[i]\n",
" question_th = questions_th[i]\n",
" inputs = tokenizer(question_en, text_en, add_special_tokens=True, return_tensors=\"pt\")\n",
" input_ids = inputs[\"input_ids\"].tolist()[0]\n",
" text_tokens = tokenizer.convert_ids_to_tokens(input_ids)\n",
" answer_start_scores, answer_end_scores = model(**inputs)\n",
" answer_start = torch.argmax(answer_start_scores) # Get the most likely beginning of answer with the argmax of the score\n",
" answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score\n",
" answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))\n",
" # answer = f'{answer}.' if answer[-1]!='.' else answer\n",
" print(f\"Question: {question_th}\")\n",
" print(f\"Answer: {en2th_spm.translate(answer)} (from {answer_start} to {answer_end})\")"
],
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"text": [
"Question: แมนเชสเตอร์ยูไนเต็ดคืออะไร\n",
"Answer: สโมสรฟุตบอล (from 10 to 13)\n",
"Question: สนามเหย้าของแมนยูอยู่ที่ไหน\n",
"Answer: โอลด์ แทรฟฟอร์ดค่ะ (from 20 to 22)\n",
"Question: ก่อนปี 1902 แมนเชสเตอร์ยูไนเต็ดชื่ออะไร\n",
"Answer: เรดเดวิลค่ะ (from 47 to 50)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "D2ExOLGwtVRq"
},
"source": [
"## T5 Summarization"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SGjITDJVZA9q"
},
"source": [
"You can also try summarization by entering the text to summarize in `article_th`. Tune the maximum length of the summary in the cell below."
]
},
{
"cell_type": "code",
"metadata": {
"id": "rvB4p84SHSoR"
},
"source": [
"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
"\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(\"t5-base\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"t5-base\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "BnZGJ6X4HekL"
},
"source": [
"#text to summarize\n",
"article_th = '''บริษัท สรรพสินค้าเซ็นทรัล จำกัด หรือ ห้างสรรพสินค้าเซ็นทรัล เป็นห้างสรรพสินค้าเก่าแก่แห่งหนึ่งของประเทศไทย \n",
"ก่อตั้งโดย เตียง จิราธิวัฒน์ เริ่มดำเนินการเมื่อปี พ.ศ. 2490 ด้วยการเป็นร้านขายของชำเล็กๆ \n",
"ก่อนที่สัมฤทธิ์ บุตรชาย จะเริ่มดำเนินการห้างสรรพสินค้าเซ็นทรัล สาขาแรกเมื่อ พ.ศ. 2499 ที่ย่านวังบูรพา \n",
"และกลายมาเป็นห้างสรรพสินค้าชั้นนำของไทยในปัจจุบัน\n",
"'''\n",
"article_en = th2en_spm.translate(article_th)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ZSS4mHKXtUTH",
"cellView": "form",
"outputId": "f1a56b4e-1198-4ce5-8de5-61e455920012",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 124
}
},
"source": [
"max_length = 200 #@param {type:\"slider\", min:150, max:300, step:50}\n",
"\n",
"# T5 uses a max_length of 512 so we cut the article to 512 tokens.\n",
"inputs = tokenizer.encode(\"summarize: \" + article_en, return_tensors=\"pt\", \n",
" max_length=512,truncation=True)\n",
"outputs = model.generate(inputs, max_length=max_length, min_length=50, \n",
" length_penalty=2.0, num_beams=4, early_stopping=True)\n",
"summary_en = [tokenizer.decode(i) for i in outputs][0]\n",
"summary_th = en2th_spm.translate(summary_en)\n",
"\n",
"print('Summary:')\n",
"textwrap.wrap(summary_th)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Summary:\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['เซ็นทรัล โค้ท (Central Co. หรือ Central)',\n",
" 'เป็นหนึ่งในห้างสรรพสินค้าที่เก่าแก่ที่สุดของประเทศไทย',\n",
" 'ก่อตั้งโดยเตียงจิราตีวัตต์ (Jirathiwatt) ซึ่งเริ่มมีร้านค้าขนาดเล็ก',\n",
" 'ในปี พ.ศ. 2490 ลูกชายเริ่มดําเนินการขายห้างสรรพสินค้าแห่งแรกใน พ.ศ.',\n",
" '2499']"
]
},
"metadata": {
"tags": []
},
"execution_count": 82
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HP6glRK3JrmU"
},
"source": [
"## GPT-2 Text Generation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "p8UgFYX6ZKCE"
},
"source": [
"Enter a prompt and generate text using GPT-2."
]
},
{
"cell_type": "code",
"metadata": {
"id": "q6hLinXZSmVv"
},
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"gpt2\", pad_token_id=tokenizer.eos_token_id)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "l5Pg9bf6V--u",
"cellView": "form",
"outputId": "43c0045e-d529-4576-91c7-961b621d4548",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 319
}
},
"source": [
"#@title Generate Texts\n",
"prompt_th = \"กาลครั้งหนึ่งนานมาแล้ว \" #@param {type:\"string\"}\n",
"max_length = 300 #@param {type:\"slider\", min:250, max:500, step:50}\n",
"prompt_en = th2en_spm.translate(prompt_th)\n",
"inputs = tokenizer.encode(prompt_en, \n",
" add_special_tokens=False, \n",
" return_tensors=\"pt\")\n",
"prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, \n",
" clean_up_tokenization_spaces=True))\n",
"outputs = model.generate(inputs, max_length=max_length, do_sample=True, \n",
" top_p=0.95, top_k=60)\n",
"generated = prompt_th + en2th_spm.translate(tokenizer.decode(outputs[0])[prompt_length:])\n",
"textwrap.wrap(generated)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['กาลครั้งหนึ่งนานมาแล้ว เหตุผลที่ฉันคิดว่ามันสําคัญมากสําหรับบล็อกนี้ที',\n",
" '่จะสะท้อนให้เห็นถึงข้อเท็จจริงที่เรียบง่ายของชีวิตและสถานการณ์ชีวิตแทน',\n",
" 'ที่จะเป็นความคิดที่เป็นนามธรรมว่าชีวิตเป็นอย่างไรตอนนี้เป็นเพราะพวกเขา',\n",
" 'เป็นเรื่องราวของชีวิตอย่างที่มันทํา, เวลาที่สมองของเราทํางานเพื่อให้รู',\n",
" '้สึกถึงทุกสิ่งที่เราประสบและเพื่อนําทางเราในทางที่มีความหมายบางอย่าง',\n",
" 'ชีวิตไม่เพียง แต่แตกต่างกันเท่านั้น, มันซับซ้อน,',\n",
" 'และบางครั้งคุณใช้ชีวิตและกิจกรรมของสมองในชีวิตซึ่งอารมณ์อื่น ๆ ทั้งหมด',\n",
" '- อารมณ์ของความเจ็บปวด, ความทุกข์ทรมาน, ความหงุดหงิด, ความเครียด -',\n",
" 'เล่นบทบาทขนาดใหญ่. บางครั้งเรามีประสบการณ์ชีวิตเล็กน้อยเมื่อเราอาศัยอย',\n",
" 'ู่ในช่วงเวลาที่ยากลําบากที่สุดเพราะเหตุการณ์ที่น่าเศร้าบางอย่าง',\n",
" '(เช่นอุบัติเหตุรถชนหรือความตายแม้ว่าเรามีประสบการณ์เหมือนกัน)',\n",
" 'หรือแม้ว่าเราจะไม่มีแผนที่ถูกต้องในการจัดการกับสถานการณ์เหล่านั้น. หรื',\n",
" 'อบางทีเราอาจมีความเจ็บป่วยทางจิตที่ทําให้เราตกใจอย่างถาวรหรือแม้กระทั่',\n",
" 'งความเศร้าโศกและมันเป็นเหตุผลที่เรามีความพิการทางจิตหรือหนึ่งในความผิด',\n",
" 'ปกติที่เราเคยอยู่ในอารมณ์ที่ดี.',\n",
" 'เรามักจะพบว่ามันยากที่จะมุ่งเน้นที่ทั้งหมด,',\n",
" 'เพื่อให้คิดต่อไปว่าทั้งหมดนี้จะพาเรา']"
]
},
"metadata": {
"tags": []
},
"execution_count": 77
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uohWti2YW3iT"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment