neoyipeng2018 · September 6, 2020 01:31
diff --git a/finetuningwithtrainer.ipynb b/finetuningwithtrainer.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "FineTuningWith 🤗 Trainer.ipynb",
      "provenance": [],
      "collapsed_sections": [
        "3slpjqSp8zsZ",
        "4l-hhP-GaX_j"
      ],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.4"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "c6a73993a49c432ba584ec488247c016": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_2a083ff521a749e2b684c6bd3005bc13",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_bbcace5b2216435babe5d2e758894022",
              "IPY_MODEL_637a2c14acab4d84ad9fd9fcd2a110f8"
            ]
          }
        },
        "2a083ff521a749e2b684c6bd3005bc13": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "bbcace5b2216435babe5d2e758894022": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_1cd5520bc2164cb5aa4e9722ac664438",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 411,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 411,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_224b1f677b2c496bbf3037952287207d"
          }
        },
        "637a2c14acab4d84ad9fd9fcd2a110f8": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_da0e168573b74ae79336994546cee36d",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 411/411 [00:00&lt;00:00, 615B/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_9312c749c6cd419f90f15fd8fab2bdb2"
          }
        },
        "1cd5520bc2164cb5aa4e9722ac664438": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "224b1f677b2c496bbf3037952287207d": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "da0e168573b74ae79336994546cee36d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "9312c749c6cd419f90f15fd8fab2bdb2": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f7721aa245fd40fcbaecf1fbc6855a39": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_4754b43cca614df7af16afb508c99b8a",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_3d3e2d7609454dae98ec0a85f4e219ea",
              "IPY_MODEL_b0e7c0e8454e48698b9aff2271a59683"
            ]
          }
        },
        "4754b43cca614df7af16afb508c99b8a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "3d3e2d7609454dae98ec0a85f4e219ea": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_ab1af4531e154c5ab80e940e5c8e3746",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 213450,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 213450,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_70b83f9da5744ed0a91d5c9967f6cb7b"
          }
        },
        "b0e7c0e8454e48698b9aff2271a59683": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_5960e702f1724327942731b84f3ced8f",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 213k/213k [00:00&lt;00:00, 853kB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_2cc334cd9fc04419bcaeaec4f5a0f328"
          }
        },
        "ab1af4531e154c5ab80e940e5c8e3746": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "70b83f9da5744ed0a91d5c9967f6cb7b": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "5960e702f1724327942731b84f3ced8f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "2cc334cd9fc04419bcaeaec4f5a0f328": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "f3ab3e82cd7b4a85b530576a7e26cde2": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "state": {
            "_view_name": "HBoxView",
            "_dom_classes": [],
            "_model_name": "HBoxModel",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "box_style": "",
            "layout": "IPY_MODEL_a060bdc45a754f6aae21badd838475a5",
            "_model_module": "@jupyter-widgets/controls",
            "children": [
              "IPY_MODEL_532d0411a8a14b27b426663a4e875bf5",
              "IPY_MODEL_4aa71e4105804cc889cd99a88a478b3b"
            ]
          }
        },
        "a060bdc45a754f6aae21badd838475a5": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "532d0411a8a14b27b426663a4e875bf5": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "state": {
            "_view_name": "ProgressView",
            "style": "IPY_MODEL_3c212742117f4a27aaca29fcde46c709",
            "_dom_classes": [],
            "description": "Downloading: 100%",
            "_model_name": "FloatProgressModel",
            "bar_style": "success",
            "max": 263273408,
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": 263273408,
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "orientation": "horizontal",
            "min": 0,
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_8b3db1af7ec844079405c2a80c88f813"
          }
        },
        "4aa71e4105804cc889cd99a88a478b3b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "state": {
            "_view_name": "HTMLView",
            "style": "IPY_MODEL_45b546123fec4d64b6e3c55991f68a7f",
            "_dom_classes": [],
            "description": "",
            "_model_name": "HTMLModel",
            "placeholder": "",
            "_view_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "value": " 263M/263M [00:04&lt;00:00, 56.0MB/s]",
            "_view_count": null,
            "_view_module_version": "1.5.0",
            "description_tooltip": null,
            "_model_module": "@jupyter-widgets/controls",
            "layout": "IPY_MODEL_793c1c8f87ab4138a58d194172632fca"
          }
        },
        "3c212742117f4a27aaca29fcde46c709": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "ProgressStyleModel",
            "description_width": "initial",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "bar_color": null,
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "8b3db1af7ec844079405c2a80c88f813": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        },
        "45b546123fec4d64b6e3c55991f68a7f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_view_name": "StyleView",
            "_model_name": "DescriptionStyleModel",
            "description_width": "",
            "_view_module": "@jupyter-widgets/base",
            "_model_module_version": "1.5.0",
            "_view_count": null,
            "_view_module_version": "1.2.0",
            "_model_module": "@jupyter-widgets/controls"
          }
        },
        "793c1c8f87ab4138a58d194172632fca": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "state": {
            "_view_name": "LayoutView",
            "grid_template_rows": null,
            "right": null,
            "justify_content": null,
            "_view_module": "@jupyter-widgets/base",
            "overflow": null,
            "_model_module_version": "1.2.0",
            "_view_count": null,
            "flex_flow": null,
            "width": null,
            "min_width": null,
            "border": null,
            "align_items": null,
            "bottom": null,
            "_model_module": "@jupyter-widgets/base",
            "top": null,
            "grid_column": null,
            "overflow_y": null,
            "overflow_x": null,
            "grid_auto_flow": null,
            "grid_area": null,
            "grid_template_columns": null,
            "flex": null,
            "_model_name": "LayoutModel",
            "justify_items": null,
            "grid_row": null,
            "max_height": null,
            "align_content": null,
            "visibility": null,
            "align_self": null,
            "height": null,
            "min_height": null,
            "padding": null,
            "grid_auto_rows": null,
            "grid_gap": null,
            "max_width": null,
            "order": null,
            "_view_module_version": "1.2.0",
            "grid_template_areas": null,
            "object_position": null,
            "object_fit": null,
            "grid_auto_columns": null,
            "margin": null,
            "display": null,
            "left": null
          }
        }
      }
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/neoyipeng2018/cbe64b40ad683ff50b15fb67fa39fabd/finetuningwithtrainer.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BXmJ_IPxb8cn",
        "colab_type": "text"
      },
      "source": [
        "#Fine Tuning a Language Model using 🤗's Trainer\n",
        "In some recent NLP competitions I entered, I noticed the huge benefits of fine tuning a language model before starting to further fine tune for downstream tasks.\n",
        "\n",
        "Transformers library has a `Trainer` module which has an end to end train/evaluation loop to fine-tune a transformer model. There are some notebooks/guides but I was looking for a simple example that contains all the basic needs like setting up the dataset, evaluation metrics, tensorboards etc, but couldn't really find one, so I decided to create a one that contains everything I needed to start.\n",
        "\n",
        "**References**:\n",
        "1. https://zablo.net/blog/post/training-roberta-from-scratch-the-missing-guide-polish-language-model/\n",
        "1. https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=GlvP_A-THEEl\n",
        "1. https://skimai.com/roberta-language-model-for-spanish/"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-skY1JbAbEQ6",
        "colab_type": "text"
      },
      "source": [
        "## Installing stuffs"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3pPYShGjTAxW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install transformers\n",
        "!pip install tokenizers\n",
        "!pip install tensorboard"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bZO302pF7mig",
        "colab_type": "text"
      },
      "source": [
        "## Ingredients for Langugage Modelling\n",
        "1. Model\n",
        "1. Dataset\n",
        "1. Trainer"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3slpjqSp8zsZ",
        "colab_type": "text"
      },
      "source": [
        "### 1. Initialise Model and Tokenizer\n",
        "This is pretty straightforward thanks to 🤗. You essentially just need to choose a model name from https://huggingface.co/models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "OD71y3mYZ-tP",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 182,
          "referenced_widgets": [
            "c6a73993a49c432ba584ec488247c016",
            "2a083ff521a749e2b684c6bd3005bc13",
            "bbcace5b2216435babe5d2e758894022",
            "637a2c14acab4d84ad9fd9fcd2a110f8",
            "1cd5520bc2164cb5aa4e9722ac664438",
            "224b1f677b2c496bbf3037952287207d",
            "da0e168573b74ae79336994546cee36d",
            "9312c749c6cd419f90f15fd8fab2bdb2",
            "f7721aa245fd40fcbaecf1fbc6855a39",
            "4754b43cca614df7af16afb508c99b8a",
            "3d3e2d7609454dae98ec0a85f4e219ea",
            "b0e7c0e8454e48698b9aff2271a59683",
            "ab1af4531e154c5ab80e940e5c8e3746",
            "70b83f9da5744ed0a91d5c9967f6cb7b",
            "5960e702f1724327942731b84f3ced8f",
            "2cc334cd9fc04419bcaeaec4f5a0f328",
            "f3ab3e82cd7b4a85b530576a7e26cde2",
            "a060bdc45a754f6aae21badd838475a5",
            "532d0411a8a14b27b426663a4e875bf5",
            "4aa71e4105804cc889cd99a88a478b3b",
            "3c212742117f4a27aaca29fcde46c709",
            "8b3db1af7ec844079405c2a80c88f813",
            "45b546123fec4d64b6e3c55991f68a7f",
            "793c1c8f87ab4138a58d194172632fca"
          ]
        },
        "outputId": "b04d0c84-7a05-4ad5-ddc3-df92efe17a00"
      },
      "source": [
        "from transformers import AutoConfig,AutoTokenizer,AutoModelForPreTraining\n",
        "\n",
        "modelnm='distilbert-base-cased'\n",
        "tokenizer=AutoTokenizer.from_pretrained(modelnm)\n",
        "model=AutoModelForPreTraining.from_pretrained(modelnm)\n",
        "\n",
        "#test tokenizer\n",
        "tokenizer.tokenize('extravagant')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "c6a73993a49c432ba584ec488247c016",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "f7721aa245fd40fcbaecf1fbc6855a39",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "f3ab3e82cd7b4a85b530576a7e26cde2",
              "version_minor": 0,
              "version_major": 2
            },
            "text/plain": [
              "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…"
            ]
          },
          "metadata": {
            "tags": []
          }
        },
        {
          "output_type": "stream",
          "text": [
            "\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "['extra', '##va', '##gant']"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 2
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VOBqfWOn1zBQ",
        "colab_type": "text"
      },
      "source": [
        "### 2. Creating train and evaluation dataset\n",
        "Quite often, my datasets are in a .txt or .csv. There are 2 ways to create a dataset in huggingface from text files:\n",
        "\n",
        "1. **LineByLineTextDataset**: Assumes each line is a document, and tokenizer will only run once on each line, hence documents that are longer than the block size will be truncated.\n",
        "2. **TextDataset**: Assumes the documents are one big corpus, and splits using block size. There won't be any padding though.\n",
        "\n",
        "For this use case, I'll just grab a Amazon Review from https://nijianmo.github.io/amazon/index.html"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-CfqLPJVYEKS",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 208
        },
        "outputId": "5e2acf1e-2118-415e-9f10-dca1f04d5601"
      },
      "source": [
        "!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/AMAZON_FASHION_5.json.gz"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2020-09-03 14:41:51--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/AMAZON_FASHION_5.json.gz\n",
            "Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50\n",
            "Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 287013 (280K) [application/octet-stream]\n",
            "Saving to: ‘AMAZON_FASHION_5.json.gz’\n",
            "\n",
            "AMAZON_FASHION_5.js 100%[===================>] 280.29K  --.-KB/s    in 0.1s    \n",
            "\n",
            "2020-09-03 14:41:51 (2.22 MB/s) - ‘AMAZON_FASHION_5.json.gz’ saved [287013/287013]\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kyFpWUyvXMwq",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "cca1794f-bbe0-4b13-f7eb-18a9a2e7bdf7"
      },
      "source": [
        "import gzip, json, pandas as pd\n",
        "\n",
        "data = []\n",
        "with gzip.open('AMAZON_FASHION_5.json.gz') as f:\n",
        "    for l in f:\n",
        "        data.append(json.loads(l.strip()))\n",
        "df = pd.DataFrame.from_dict(data)\n",
        "\n",
        "print(len(df))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "3176\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "F_ZejltASXU-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "df[['reviewText']][:3000].to_csv('train.csv')\n",
        "df[['reviewText']][3000:].to_csv('val.csv')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "C3QVwSGWUc4z",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from transformers import LineByLineTextDataset,DataCollatorForLanguageModeling,TextDataset"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "087O5Kb6M7Bj",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "train_dataset = TextDataset(\n",
        "    tokenizer=tokenizer,\n",
        "    file_path='train.csv',\n",
        "    block_size=128\n",
        ")\n",
        "\n",
        "val_dataset = TextDataset(\n",
        "    tokenizer=tokenizer,\n",
        "    file_path='val.csv',\n",
        "    block_size=128\n",
        ")\n",
        "\n",
        "# LineBYLine\n",
        "# train_dataset = LineByLineTextDataset(\n",
        "#     tokenizer=tokenizer,\n",
        "#     file_path=\"train.csv\",\n",
        "#     block_size=512\n",
        "# )\n",
        "\n",
        "# val_dataset = LineByLineTextDataset(\n",
        "#     tokenizer=tokenizer,\n",
        "#     file_path=\"val.csv\",\n",
        "#     block_size=512\n",
        "# )\n",
        "# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=True,mlm_probability=0.15)\n",
        "\n",
        "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=True,mlm_probability=0.15)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Z7U1QxGZ77Zk",
        "colab_type": "text"
      },
      "source": [
        "### 3. Trainer Arguments and Trainer\n",
        "We initialise the training arguments and also Trainer"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_A8OQiZ1UBrG",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 364
        },
        "outputId": "d583e58a-6bd5-4307-a4bf-ef20f01f15c4"
      },
      "source": [
        "# Check that we have a GPU\n",
        "!nvidia-smi"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Thu Sep  3 14:43:04 2020       \n",
            "+-----------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |\n",
            "|-------------------------------+----------------------+----------------------+\n",
            "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
            "|                               |                      |               MIG M. |\n",
            "|===============================+======================+======================|\n",
            "|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |\n",
            "| N/A   42C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |\n",
            "|                               |                      |                 ERR! |\n",
            "+-------------------------------+----------------------+----------------------+\n",
            "                                                                               \n",
            "+-----------------------------------------------------------------------------+\n",
            "| Processes:                                                                  |\n",
            "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
            "|        ID   ID                                                   Usage      |\n",
            "|=============================================================================|\n",
            "|  No running processes found                                                 |\n",
            "+-----------------------------------------------------------------------------+\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "z424e05WUFrz",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "d913f282-b185-4f87-eeb8-29534e537988"
      },
      "source": [
        "import torch\n",
        "torch.cuda.is_available()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Gm-zq6m5T8zC",
        "colab_type": "text"
      },
      "source": [
        "### Using Trainer\n",
        "- For details on training arguments: https://huggingface.co/transformers/main_classes/trainer.html\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SvTNFlOVi5xA",
        "colab_type": "text"
      },
      "source": [
        "To get the Trainer to compute accuracy for our Masked Language Modelling, we create these 2 functions"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "e57i8G6AEXvb",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def accuracy(input, targs):\n",
        "    \"Computes accuracy with `targs` when `input` is bs * n_classes.\"\n",
        "    n = targs.shape[0]\n",
        "    #input = input.argmax(dim=-1).view(n,-1)\n",
        "    input = input.reshape(n,-1)\n",
        "    targs = targs.reshape(n,-1)\n",
        "    # return (input==targs).float().mean()\n",
        "    return (input==targs).mean()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GexxpXt3EX7e",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def compute_metrics(pred):\n",
        "    labels = pred.label_ids\n",
        "    preds = pred.predictions.argmax(-1)\n",
        "    acc = accuracy(labels, preds)\n",
        "    return {\n",
        "        'accuracy': acc\n",
        "    }"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "F33spPoZjCfn",
        "colab_type": "text"
      },
      "source": [
        "Start tensorboard writer"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HE4qoiWpOxMK",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from torch.utils.tensorboard import SummaryWriter\n",
        "tb = SummaryWriter()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wjeKhkrGjJhm",
        "colab_type": "text"
      },
      "source": [
        "Define training arguments and trainer"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "BPgFNONpEYAW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "from transformers import Trainer, TrainingArguments\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=\"./model\",\n",
        "    overwrite_output_dir=True,\n",
        "    num_train_epochs=3,\n",
        "    per_device_train_batch_size=8,\n",
        "    per_device_eval_batch_size=8,\n",
        "    save_steps=1000,\n",
        "    save_total_limit=1,\n",
        "    learning_rate=5e-5, #In BERT, fine tuning lrs were in the range of (2e-5 to 5e-5)\n",
        "    do_train=True,\n",
        "    evaluate_during_training=True,\n",
        "    # warmup_steps=1000 #In BERT, pre-training phase had 10k warmup\n",
        "    logging_steps=1,\n",
        "    eval_steps=1,\n",
        "    gradient_accumulation_steps=8, #reduce memory usage while allowing bigger overall batch size. Roberta used this technique to get a 8k batch size.s\n",
        ")\n",
        "\n",
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    data_collator=data_collator,\n",
        "    train_dataset=train_dataset,\n",
        "    eval_dataset=val_dataset,\n",
        "    compute_metrics=compute_metrics,\n",
        "    tb_writer=tb\n",
        ")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2R5zX616jN2r",
        "colab_type": "text"
      },
      "source": [
        "Train"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "JgBw1Ap8EYFF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "%%time\n",
        "trainer.train()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "SFy2fG0cJ51H",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 766
        },
        "outputId": "dc59a426-2c89-42c5-aa87-e879f34fa541"
      },
      "source": [
        "!tensorboard dev upload --logdir runs"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "2020-09-03 14:45:50.966465: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1\n",
            "\n",
            "***** TensorBoard Uploader *****\n",
            "\n",
            "This will upload your TensorBoard logs to https://tensorboard.dev/ from\n",
            "the following directory:\n",
            "\n",
            "runs\n",
            "\n",
            "This TensorBoard will be visible to everyone. Do not upload sensitive\n",
            "data.\n",
            "\n",
            "Your use of this service is subject to Google's Terms of Service\n",
            "<https://policies.google.com/terms> and Privacy Policy\n",
            "<https://policies.google.com/privacy>, and TensorBoard.dev's Terms of Service\n",
            "<https://tensorboard.dev/policy/terms/>.\n",
            "\n",
            "This notice will not be shown again while you are logged into the uploader.\n",
            "To log out, run `tensorboard dev auth revoke`.\n",
            "\n",
            "Continue? (yes/NO) yes\n",
            "\n",
            "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=373649185512-8v619h5kft38l4456nm2dj4ubeqsrvh6.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email&state=DySudJM7iN3tEjewfPCdKAL8frAksT&prompt=consent&access_type=offline\n",
            "Enter the authorization code: 4/3gE8vDdu5I96mSAvBzuMa6vE6QTItHBxh_2SIxwJISxTuadWuk4O3MU\n",
            "\n",
            "Data for the \"graphs\" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the \"--plugins\" command line argument.\n",
            "Data for the \"histograms\" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the \"--plugins\" command line argument.\n",
            "Data for the \"hparams\" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the \"--plugins\" command line argument.\n",
            "Upload started and will continue reading any new data as it's added\n",
            "to the logdir. To stop uploading, press Ctrl-C.\n",
            "\n",
            "View your TensorBoard live at: https://tensorboard.dev/experiment/DIl7t4EkQym5kGXEXO4qTw/\n",
            "\n",
            "\u001b[1m[2020-09-03T14:46:10]\u001b[0m Uploader started.\n",
            "\u001b[1m[2020-09-03T14:46:10]\u001b[0m Total uploaded: 216 scalars, 3 tensors (18 B), 0 binary objects\n",
            "\n",
            "Interrupted. View your TensorBoard at https://tensorboard.dev/experiment/DIl7t4EkQym5kGXEXO4qTw/\n",
            "Exception ignored in: <bound method Channel.__del__ of <grpc._channel.Channel object at 0x7fc9ed7cd3c8>>\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/local/lib/python3.6/dist-packages/grpc/_channel.py\", line 1446, in __del__\n",
            "    def __del__(self):\n",
            "KeyboardInterrupt\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_8EMioIuiECl",
        "colab_type": "text"
      },
      "source": [
        "Save"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2y4E9Y36jQyT",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "trainer.save_model(\"./model\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4l-hhP-GaX_j",
        "colab_type": "text"
      },
      "source": [
        "# Conclusion\n",
        "We've initialised a transformer model, loaded a dataset and fine-tuned a pre-trained language model for a few epochs and visualised the valiation loss and accuracy in tensorboard."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CpIunQmZYcg_",
        "colab_type": "text"
      },
      "source": [
        "Next: Explore hyperparameter search\n",
        "1. https://huggingface.co/transformers/master/main_classes/trainer.html#transformers.Trainer.hyperparameter_search\n",
        "1. https://discuss.huggingface.co/t/using-hyperparameter-search-in-trainer/785/10"
      ]
    }
  ]
 }