BioGPT_230310.ipynb
@raven44099, created March 10, 2023 06:29
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm",
"collapsed_sections": [
"rsEKEEhMAKxJ",
"kOoIgoLRcfce",
"1Ss8zW9jeNt7"
],
"mount_file_id": "1hVzHfVzRq14XUfWECEf_btm-g1x6HpfJ",
"authorship_tag": "ABX9TyPIVjv0390Z4e6YV9O6yPJC",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/raven44099/edd254c6f5dbcfe5faad7701d1df88cf/biogpt_230310.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"## Start\n",
"https://github.com/microsoft/BioGPT"
],
"metadata": {
"id": "HoJ61YPQ_IxB"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
},
"id": "em4vMY2N_BmE",
"outputId": "c307a117-1b15-4b38-93fb-91000b07aa1a"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'1.13.1+cu116'"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "string"
}
},
"metadata": {},
"execution_count": 1
}
],
"source": [
"PWD = '/content'\n",
"import torch\n",
"torch.__version__"
]
},
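{
"cell_type": "code",
"source": [
"# Added sanity check (not part of the original run): confirm a CUDA GPU is\n",
"# visible before building fairseq, since a Colab runtime can silently fall\n",
"# back to CPU.\n",
"print('CUDA available:', torch.cuda.is_available())\n",
"if torch.cuda.is_available():\n",
"    print('GPU:', torch.cuda.get_device_name(0))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},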
{
"cell_type": "code",
"source": [
"# !pip install fairseq\n",
"\n",
"!git clone https://github.com/pytorch/fairseq\n",
"%cd fairseq\n",
"!git checkout v0.12.0\n",
"!pip install .\n",
"!python setup.py build_ext --inplace\n",
"%cd .."
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lQ7hdoaD_UUI",
"outputId": "1f3a6fe3-821b-4e59-c1a7-d1470f614c5d"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'fairseq'...\n",
"remote: Enumerating objects: 34534, done.\u001b[K\n",
"remote: Total 34534 (delta 0), reused 0 (delta 0), pack-reused 34534\u001b[K\n",
"Receiving objects: 100% (34534/34534), 24.06 MiB | 15.70 MiB/s, done.\n",
"Resolving deltas: 100% (25109/25109), done.\n",
"/content/fairseq\n",
"Note: switching to 'v0.12.0'.\n",
"\n",
"You are in 'detached HEAD' state. You can look around, make experimental\n",
"changes and commit them, and you can discard any commits you make in this\n",
"state without impacting any branches by switching back to a branch.\n",
"\n",
"If you want to create a new branch to retain commits you create, you may\n",
"do so (now or later) by using -c with the switch command. Example:\n",
"\n",
" git switch -c <new-branch-name>\n",
"\n",
"Or undo this operation with:\n",
"\n",
" git switch -\n",
"\n",
"Turn off this advice by setting config variable advice.detachedHead to false\n",
"\n",
"HEAD is now at 6795311b 0.12.0 release\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Processing /content/fairseq\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: cffi in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (1.15.1)\n",
"Collecting sacrebleu>=1.4.12\n",
" Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m118.9/118.9 KB\u001b[0m \u001b[31m357.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: torchaudio>=0.8.0 in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (0.13.1+cu116)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (1.13.1+cu116)\n",
"Collecting hydra-core<1.1,>=1.0.7\n",
" Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m123.8/123.8 KB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting bitarray\n",
" Downloading bitarray-2.7.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (269 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m269.6/269.6 KB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting omegaconf<2.1\n",
" Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (1.22.4)\n",
"Requirement already satisfied: cython in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (0.29.33)\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (2022.6.2)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from fairseq==0.12.0) (4.65.0)\n",
"Collecting antlr4-python3-runtime==4.8\n",
" Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.4/112.4 KB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from omegaconf<2.1->fairseq==0.12.0) (4.5.0)\n",
"Requirement already satisfied: PyYAML>=5.1.* in /usr/local/lib/python3.9/dist-packages (from omegaconf<2.1->fairseq==0.12.0) (6.0)\n",
"Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.9/dist-packages (from sacrebleu>=1.4.12->fairseq==0.12.0) (0.8.10)\n",
"Collecting colorama\n",
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
"Collecting portalocker\n",
" Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)\n",
"Requirement already satisfied: lxml in /usr/local/lib/python3.9/dist-packages (from sacrebleu>=1.4.12->fairseq==0.12.0) (4.9.2)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.9/dist-packages (from cffi->fairseq==0.12.0) (2.21)\n",
"Building wheels for collected packages: fairseq, antlr4-python3-runtime\n",
" Building wheel for fairseq (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for fairseq: filename=fairseq-0.12.0-cp39-cp39-linux_x86_64.whl size=19060959 sha256=ab34712e50d6b38b93abf651dcb4d9c34d941e768c262f2d9cb981c296afce72\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-rcr1zhq8/wheels/52/da/57/31b8a8f767e4d044de3fbb1f204d0f1547e8c6e0b171e56bba\n",
" Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141231 sha256=0d30c0581444dd19a1f6a3de41bb068abe7b1af6610bdb431bb25fed63022a4f\n",
" Stored in directory: /root/.cache/pip/wheels/42/3c/ae/14db087e6018de74810afe32eb6ac890ef9c68ba19b00db97a\n",
"Successfully built fairseq antlr4-python3-runtime\n",
"Installing collected packages: bitarray, antlr4-python3-runtime, portalocker, omegaconf, colorama, sacrebleu, hydra-core, fairseq\n",
"Successfully installed antlr4-python3-runtime-4.8 bitarray-2.7.3 colorama-0.4.6 fairseq-0.12.0 hydra-core-1.0.7 omegaconf-2.0.6 portalocker-2.7.0 sacrebleu-2.3.1\n",
"running build_ext\n",
"/usr/local/lib/python3.9/dist-packages/torch/utils/cpp_extension.py:476: UserWarning: Attempted to use ninja as the BuildExtension backend but we could not find ninja.. Falling back to using the slow distutils backend.\n",
" warnings.warn(msg.format('we could not find ninja.'))\n",
"skipping 'fairseq/data/data_utils_fast.cpp' Cython extension (up-to-date)\n",
"skipping 'fairseq/data/token_block_utils_fast.cpp' Cython extension (up-to-date)\n",
"copying build/lib.linux-x86_64-3.9/fairseq/libbleu.cpython-39-x86_64-linux-gnu.so -> fairseq\n",
"copying build/lib.linux-x86_64-3.9/fairseq/data/data_utils_fast.cpython-39-x86_64-linux-gnu.so -> fairseq/data\n",
"copying build/lib.linux-x86_64-3.9/fairseq/data/token_block_utils_fast.cpython-39-x86_64-linux-gnu.so -> fairseq/data\n",
"copying build/lib.linux-x86_64-3.9/fairseq/libbase.cpython-39-x86_64-linux-gnu.so -> fairseq\n",
"copying build/lib.linux-x86_64-3.9/fairseq/libnat.cpython-39-x86_64-linux-gnu.so -> fairseq\n",
"copying build/lib.linux-x86_64-3.9/alignment_train_cpu_binding.cpython-39-x86_64-linux-gnu.so -> \n",
"/content\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd /content\n",
"!git clone https://github.com/microsoft/BioGPT.git"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fVThQat1DQIJ",
"outputId": "0e5872c6-7497-4ab8-a808-4599f5520519"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content\n",
"Cloning into 'BioGPT'...\n",
"remote: Enumerating objects: 341, done.\u001b[K\n",
"remote: Counting objects: 100% (70/70), done.\u001b[K\n",
"remote: Compressing objects: 100% (33/33), done.\u001b[K\n",
"remote: Total 341 (delta 56), reused 37 (delta 37), pack-reused 271\u001b[K\n",
"Receiving objects: 100% (341/341), 31.44 MiB | 19.18 MiB/s, done.\n",
"Resolving deltas: 100% (175/175), done.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#@title prepare package Moses\n",
"%cd /content/BioGPT\n",
"!git clone https://github.com/moses-smt/mosesdecoder.git\n",
"# NOTE: `!export` runs in a subshell and does not persist across cells; MOSES is set properly with %env later in the notebook.\n",
"!export MOSES=${PWD}/mosesdecoder"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Jy4XY-zwAesc",
"outputId": "6bc50854-7304-44b1-c138-9d1166fe86ca"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n",
"Cloning into 'mosesdecoder'...\n",
"remote: Enumerating objects: 148097, done.\u001b[K\n",
"remote: Counting objects: 100% (525/525), done.\u001b[K\n",
"remote: Compressing objects: 100% (229/229), done.\u001b[K\n",
"remote: Total 148097 (delta 323), reused 441 (delta 292), pack-reused 147572\u001b[K\n",
"Receiving objects: 100% (148097/148097), 129.88 MiB | 20.56 MiB/s, done.\n",
"Resolving deltas: 100% (114349/114349), done.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"#@title setup package fastBPE\n",
"%cd /content/BioGPT\n",
"!git clone https://github.com/glample/fastBPE.git\n",
"# NOTE: `!export` runs in a subshell and does not persist across cells; FASTBPE is set properly with %env later in the notebook.\n",
"!export FASTBPE=${PWD}/fastBPE\n",
"%cd fastBPE\n",
"!g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XEfxn6icAnYX",
"outputId": "c227a5c0-c026-4cb5-c5e4-d0dc55978ff7"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n",
"Cloning into 'fastBPE'...\n",
"remote: Enumerating objects: 59, done.\u001b[K\n",
"remote: Total 59 (delta 0), reused 0 (delta 0), pack-reused 59\u001b[K\n",
"Unpacking objects: 100% (59/59), 29.97 KiB | 2.00 MiB/s, done.\n",
"/content/BioGPT/fastBPE\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd ..\n",
"!pip install sacremoses\n",
"!pip install scikit-learn"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "whLrxejtBmWg",
"outputId": "6119237b-88b6-4e70-821f-7372ab30466a"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting sacremoses\n",
" Downloading sacremoses-0.0.53.tar.gz (880 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m880.6/880.6 KB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: regex in /usr/local/lib/python3.9/dist-packages (from sacremoses) (2022.6.2)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.9/dist-packages (from sacremoses) (1.15.0)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.9/dist-packages (from sacremoses) (8.1.3)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.9/dist-packages (from sacremoses) (1.2.0)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.9/dist-packages (from sacremoses) (4.65.0)\n",
"Building wheels for collected packages: sacremoses\n",
" Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=83487e24f44c77c1ec59df8d82e2ea3d92b2b4d37847cb4319a8eb844a64d7b8\n",
" Stored in directory: /root/.cache/pip/wheels/12/1c/3d/46cf06718d63a32ff798a89594b61e7f345ab6b36d909ce033\n",
"Successfully built sacremoses\n",
"Installing collected packages: sacremoses\n",
"Successfully installed sacremoses-0.0.53\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.9/dist-packages (1.2.1)\n",
"Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (1.22.4)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (3.1.0)\n",
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (1.2.0)\n",
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.9/dist-packages (from scikit-learn) (1.10.1)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import sacremoses\n",
"import sklearn"
],
"metadata": {
"id": "VmEZwSLyHANi"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## KD-DTI dataset stuff\n",
"This dataset needs more involved preprocessing; skip this section and use the \"_RE-DDI_\" section instead."
],
"metadata": {
"id": "rsEKEEhMAKxJ"
}
},
{
"cell_type": "code",
"source": [
"#@title Download the checkpoint archive and extract it into this project's checkpoints folder.\n",
"%cd /content/BioGPT\n",
"!mkdir checkpoints\n",
"%cd checkpoints\n",
"# !wget https://msramllasc.blob.core.windows.net/modelrelease/BioGPT/checkpoints/Pre-trained-BioGPT.tgz\n",
"# RE-DTI-BioGPT.tgz is 3.7 GB.\n",
"# !wget https://msramllasc.blob.core.windows.net/modelrelease/BioGPT/checkpoints/RE-DTI-BioGPT.tgz\n",
"!cp /content/drive/MyDrive/work/public/language_model/RE-DTI-BioGPT.tgz /content/BioGPT/checkpoints/ \n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RmaF_L8eB4eI",
"outputId": "d2356618-23f2-4bea-c793-b2e4d500d8ed"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n",
"/content/BioGPT/checkpoints\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# !cp /content/checkpoints/RE-DTI-BioGPT.tgz /content/drive/MyDrive/work/public/language_model"
],
"metadata": {
"id": "jYCjLdOhKn50"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import time\n",
"start_time = time.time()\n",
"# !tar -zxvf Pre-trained-BioGPT.tgz\n",
"!tar -zxvf /content/BioGPT/checkpoints/RE-DTI-BioGPT.tgz\n",
"\n",
"end_time = time.time()\n",
"print(f'time was: {end_time - start_time} s')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gx7qX2mAFT7s",
"outputId": "99c8340b-883b-4631-81de-9d3afc4ee09c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"RE-DTI-BioGPT/\n",
"RE-DTI-BioGPT/checkpoint_avg.pt\n",
"time was: 39.36225605010986 s\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# %cd /content/BioGPT/checkpoints\n",
"%cd /content/BioGPT"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_JGH8HlaDdYp",
"outputId": "d7548a80-f18f-4d7c-9814-842cd3daafa0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd /content\n",
"!git clone https://github.com/bert-nmt/BERT-DTI.git"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bl7ycuo76ina",
"outputId": "b14488de-3afb-4c74-e200-78a391ff5197"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content\n",
"Cloning into 'BERT-DTI'...\n",
"remote: Enumerating objects: 290, done.\u001b[K\n",
"remote: Counting objects: 100% (290/290), done.\u001b[K\n",
"remote: Compressing objects: 100% (229/229), done.\u001b[K\n",
"remote: Total 290 (delta 49), reused 269 (delta 38), pack-reused 0\u001b[K\n",
"Receiving objects: 100% (290/290), 2.55 MiB | 2.08 MiB/s, done.\n",
"Resolving deltas: 100% (49/49), done.\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"The dataset still needs to be prepared; for the other two RE datasets this is much easier.\n",
"If you continue with the KD-DTI dataset, follow the preparation steps at [https://github.com/bert-nmt/BERT-DTI](https://github.com/bert-nmt/BERT-DTI)."
],
"metadata": {
"id": "AGLYx3wM_owy"
}
},
{
"cell_type": "markdown",
"source": [
"### 2.2. KD-DTI dataset stuff"
],
"metadata": {
"id": "kOoIgoLRcfce"
}
},
{
"cell_type": "code",
"source": [
"%cd /content/BERT-DTI\n",
"!./utils/prepare_environment.sh"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gMVpYU3M6z5X",
"outputId": "ee619bb1-6d6b-4aeb-c22a-b19afde2eeb4"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BERT-DTI\n",
"sudo: ./utils/prepare_environment.sh: command not found\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# !wget -O news.csv https://www.dropbox.com/s/XXXXXXX/news.csv?dl=0\n",
"# %cd /content/BioGPT/data/KD-DTI\n",
"# !wget https://www.dropbox.com/sh/0e52w6p7wiek9ab/AADn5_q2xsggDiOymmWt52Mta/all_data_for_submit.zip?dl=0\n",
"!wget https://www.dropbox.com/sh/0e52w6p7wiek9ab/AADLr67YorV18sprmvlittpNa/meta_data_JSON_LD?dl=0"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iSRXN0VZ5lny",
"outputId": "4cc5b142-47a4-4590-c747-f61dcb1b7764"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT/data/KD-DTI\n",
"--2023-02-14 03:11:20-- https://www.dropbox.com/sh/0e52w6p7wiek9ab/AADn5_q2xsggDiOymmWt52Mta/all_data_for_submit.zip?dl=0\n",
"Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112\n",
"Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: /sh/raw/0e52w6p7wiek9ab/AADn5_q2xsggDiOymmWt52Mta/all_data_for_submit.zip [following]\n",
"--2023-02-14 03:11:21-- https://www.dropbox.com/sh/raw/0e52w6p7wiek9ab/AADn5_q2xsggDiOymmWt52Mta/all_data_for_submit.zip\n",
"Reusing existing connection to www.dropbox.com:443.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: https://uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com/cd/0/inline/B2fj_tSfowwK4zxpk8OXSVSUYaWhKWxzW9Fyi_qu893Rq5GVaD7FijKK_QGDlbS745d_yT3ELq3Mm_Q04u-V48BvfSnhY7a3YJ7sKB6e6FyXihVLdHjYXo6dB-m4Et1_n8wHag3tbce3JywbXmER63UwF8ZiK-4RcQ2JyHetdH_rTQ/file# [following]\n",
"--2023-02-14 03:11:21-- https://uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com/cd/0/inline/B2fj_tSfowwK4zxpk8OXSVSUYaWhKWxzW9Fyi_qu893Rq5GVaD7FijKK_QGDlbS745d_yT3ELq3Mm_Q04u-V48BvfSnhY7a3YJ7sKB6e6FyXihVLdHjYXo6dB-m4Et1_n8wHag3tbce3JywbXmER63UwF8ZiK-4RcQ2JyHetdH_rTQ/file\n",
"Resolving uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com (uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com)... 162.125.81.15, 2620:100:6017:15::a27d:20f\n",
"Connecting to uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com (uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com)|162.125.81.15|:443... connected.\n",
"HTTP request sent, awaiting response... 302 Found\n",
"Location: /cd/0/inline2/B2f1Y0e7S1J-7doXMaBCM_tnPYIpT4ajqr6OFH3smovUir_9Ab7VMWfdcoosG5VaBpcrF6dU_-IEqhQQMdHuL7w2kwYMBfGlGTkjpTNgka_Es8U2FkIvny-cDe7wcQZ4D0tTZJ4PPORL5A5OBREz5ddD38BbpXWo4B9iJ_IN3N1o5eIZQLn7Fy1wEUUTSJeNtN-hYztrArEpwx645YHdhnWwdnfhGvbMfihPdP8Ta5KMEGOcJulLZM_n56QSbkQrHa4vkCXuZaALUah5l5ytW2Cmwdpn9SvMWD2GYybFM6WOPrh8diMuaPksy_mXV3o9SEtd54RNO9OdG9Adew2M2swBeZt__I8SSoo5R-58Nz5quebhM15wvypzt8PePprYdwHwYaCKxximhK_CmOc-eDJhktUBFNt9DkafGL0KcMJRBg/file [following]\n",
"--2023-02-14 03:11:22-- https://uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com/cd/0/inline2/B2f1Y0e7S1J-7doXMaBCM_tnPYIpT4ajqr6OFH3smovUir_9Ab7VMWfdcoosG5VaBpcrF6dU_-IEqhQQMdHuL7w2kwYMBfGlGTkjpTNgka_Es8U2FkIvny-cDe7wcQZ4D0tTZJ4PPORL5A5OBREz5ddD38BbpXWo4B9iJ_IN3N1o5eIZQLn7Fy1wEUUTSJeNtN-hYztrArEpwx645YHdhnWwdnfhGvbMfihPdP8Ta5KMEGOcJulLZM_n56QSbkQrHa4vkCXuZaALUah5l5ytW2Cmwdpn9SvMWD2GYybFM6WOPrh8diMuaPksy_mXV3o9SEtd54RNO9OdG9Adew2M2swBeZt__I8SSoo5R-58Nz5quebhM15wvypzt8PePprYdwHwYaCKxximhK_CmOc-eDJhktUBFNt9DkafGL0KcMJRBg/file\n",
"Reusing existing connection to uc39f9f0a1fef0e70b954a219953.dl.dropboxusercontent.com:443.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 87635220 (84M) [application/zip]\n",
"Saving to: ‘all_data_for_submit.zip?dl=0’\n",
"\n",
"all_data_for_submit 100%[===================>] 83.58M 17.5MB/s in 5.1s \n",
"\n",
"2023-02-14 03:11:27 (16.4 MB/s) - ‘all_data_for_submit.zip?dl=0’ saved [87635220/87635220]\n",
"\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## RE-DDI\n",
"Easier to prepare than KD-DTI, but the checkpoint download takes a while (3.7 GB)."
],
"metadata": {
"id": "sFUrSJIfAdte"
}
},
{
"cell_type": "code",
"source": [
"#@title Download the checkpoint archive and extract it into this project's checkpoints folder.\n",
"%cd /content/BioGPT\n",
"!mkdir checkpoints\n",
"%cd checkpoints\n",
"# RE-DDI-BioGPT.tgz is 3.7 GB.\n",
"!wget https://msramllasc.blob.core.windows.net/modelrelease/BioGPT/checkpoints/RE-DDI-BioGPT.tgz"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "p1-gOKVR8UPV",
"outputId": "a455a9a1-b1ee-4ca9-f7b9-1b893fa4b603"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n",
"/content/BioGPT/checkpoints\n",
"--2023-03-10 05:57:52-- https://msramllasc.blob.core.windows.net/modelrelease/BioGPT/checkpoints/RE-DDI-BioGPT.tgz\n",
"Resolving msramllasc.blob.core.windows.net (msramllasc.blob.core.windows.net)... 20.209.34.164\n",
"Connecting to msramllasc.blob.core.windows.net (msramllasc.blob.core.windows.net)|20.209.34.164|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 3964772956 (3.7G) [application/octet-stream]\n",
"Saving to: ‘RE-DDI-BioGPT.tgz’\n",
"\n",
"RE-DDI-BioGPT.tgz 100%[===================>] 3.69G 6.54MB/s in 10m 29s \n",
"\n",
"2023-03-10 06:08:21 (6.01 MB/s) - ‘RE-DDI-BioGPT.tgz’ saved [3964772956/3964772956]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import time\n",
"start_time = time.time()\n",
"# !tar -zxvf Pre-trained-BioGPT.tgz\n",
"!tar -zxvf /content/BioGPT/checkpoints/RE-DDI-BioGPT.tgz\n",
"\n",
"end_time = time.time()\n",
"print(f'time was: {end_time - start_time} s')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rG0lnNkvAhmH",
"outputId": "7c6faddf-4443-4778-e180-45afb750ac65"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"RE-DDI-BioGPT/\n",
"RE-DDI-BioGPT/checkpoint_avg.pt\n",
"time was: 40.96954131126404 s\n"
]
}
]
},
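{
"cell_type": "code",
"source": [
"# Added check (not part of the original run): verify the checkpoint landed\n",
"# where later steps expect it. The path assumes the tar above was extracted\n",
"# from /content/BioGPT/checkpoints, matching the tar listing printed above.\n",
"import os\n",
"ckpt = '/content/BioGPT/checkpoints/RE-DDI-BioGPT/checkpoint_avg.pt'\n",
"print(ckpt, 'exists:', os.path.exists(ckpt))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},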
{
"cell_type": "code",
"source": [
"%env MOSES=/content/BioGPT/mosesdecoder\n",
"%env FASTBPE=/content/BioGPT/fastBPE\n",
"# %env fastBPE=/content/BioGPT/fastBPE"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5-TH9wb5dJwY",
"outputId": "2dc4da1d-0364-40f6-82bc-e42722caed2a"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"env: MOSES=/content/BioGPT/mosesdecoder\n",
"env: FASTBPE=/content/BioGPT/fastBPE\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!echo $MOSES\n",
"!echo $FASTBPE"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9JaS3poMfrld",
"outputId": "950c3708-f7ac-4bb7-ca4f-62789d8184a5"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT/mosesdecoder\n",
"/content/BioGPT/fastBPE\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd /content/BioGPT/examples/RE-DDI\n",
"!bash preprocess.sh"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b-WAQ-9uBxzw",
"outputId": "806ffbae-63c0-4b22-a5be-d3aaf65fff4b"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT/examples/RE-DDI\n",
"Following PMID in ../../data/DDI/raw/train.json has no extracted triples:\n",
"DDI-DrugBank.d519 DDI-MedLine.d18 DDI-DrugBank.d491 DDI-MedLine.d4 DDI-DrugBank.d134 DDI-DrugBank.d230 DDI-DrugBank.d259 DDI-DrugBank.d293 DDI-MedLine.d64 DDI-MedLine.d100 DDI-DrugBank.d295 DDI-DrugBank.d402 DDI-MedLine.d101 DDI-DrugBank.d190 DDI-MedLine.d140 DDI-MedLine.d112 DDI-MedLine.d9 DDI-DrugBank.d301 DDI-DrugBank.d128 DDI-DrugBank.d101 DDI-DrugBank.d28 DDI-DrugBank.d376 DDI-MedLine.d28 DDI-DrugBank.d93 DDI-MedLine.d88 DDI-DrugBank.d539 DDI-DrugBank.d525 DDI-DrugBank.d540 DDI-DrugBank.d461 DDI-MedLine.d132 DDI-DrugBank.d360 DDI-MedLine.d43 DDI-MedLine.d121 DDI-DrugBank.d262 DDI-DrugBank.d164 DDI-DrugBank.d534 DDI-DrugBank.d385 DDI-DrugBank.d408 DDI-MedLine.d96 DDI-DrugBank.d285 DDI-DrugBank.d473 DDI-MedLine.d57 DDI-DrugBank.d557 DDI-DrugBank.d161 DDI-DrugBank.d24 DDI-DrugBank.d67 DDI-DrugBank.d490 DDI-DrugBank.d421 DDI-MedLine.d65 DDI-DrugBank.d342 DDI-DrugBank.d264 DDI-MedLine.d10 DDI-DrugBank.d312 DDI-MedLine.d117 DDI-MedLine.d135 DDI-DrugBank.d255 DDI-DrugBank.d390 DDI-DrugBank.d68 DDI-MedLine.d11 DDI-MedLine.d14 DDI-MedLine.d75 DDI-DrugBank.d541 DDI-DrugBank.d118 DDI-MedLine.d50 DDI-DrugBank.d218 DDI-DrugBank.d370 DDI-DrugBank.d201 DDI-DrugBank.d244 DDI-MedLine.d138 DDI-MedLine.d33 DDI-DrugBank.d553 DDI-DrugBank.d125 DDI-DrugBank.d366 DDI-DrugBank.d147 DDI-MedLine.d71 DDI-DrugBank.d363 DDI-MedLine.d32 DDI-MedLine.d76 DDI-DrugBank.d290 DDI-MedLine.d38 DDI-MedLine.d77 DDI-DrugBank.d80 DDI-DrugBank.d27 DDI-MedLine.d120 DDI-DrugBank.d52 DDI-DrugBank.d302 DDI-DrugBank.d486 DDI-DrugBank.d472 DDI-MedLine.d6 DDI-MedLine.d123 DDI-DrugBank.d173 DDI-DrugBank.d570 DDI-DrugBank.d126 DDI-DrugBank.d156 DDI-MedLine.d13 DDI-MedLine.d91 DDI-DrugBank.d349 DDI-DrugBank.d436 DDI-DrugBank.d300 DDI-DrugBank.d432 DDI-MedLine.d52 DDI-DrugBank.d554 DDI-MedLine.d19 DDI-DrugBank.d109 DDI-DrugBank.d63 DDI-DrugBank.d168 DDI-DrugBank.d37 DDI-DrugBank.d50 DDI-DrugBank.d455 DDI-DrugBank.d70 DDI-MedLine.d48 DDI-DrugBank.d515 DDI-DrugBank.d406 DDI-MedLine.d127 DDI-MedLine.d22 
DDI-DrugBank.d418 DDI-MedLine.d78 DDI-MedLine.d80 DDI-MedLine.d129 DDI-DrugBank.d61 DDI-DrugBank.d524 DDI-DrugBank.d189 DDI-MedLine.d92 DDI-DrugBank.d6 DDI-DrugBank.d278 DDI-MedLine.d66 DDI-DrugBank.d383 DDI-MedLine.d15 DDI-MedLine.d60 DDI-MedLine.d31 DDI-MedLine.d58 DDI-MedLine.d137 DDI-DrugBank.d555 DDI-DrugBank.d58 DDI-DrugBank.d433 DDI-DrugBank.d375 DDI-DrugBank.d102 DDI-DrugBank.d268 DDI-DrugBank.d391 DDI-MedLine.d83 DDI-DrugBank.d243 DDI-DrugBank.d119 DDI-DrugBank.d49 DDI-MedLine.d139 DDI-DrugBank.d513 DDI-DrugBank.d451 DDI-DrugBank.d38 DDI-DrugBank.d182 DDI-MedLine.d118 DDI-DrugBank.d319 DDI-MedLine.d141 DDI-MedLine.d70 DDI-MedLine.d109 DDI-MedLine.d98 DDI-DrugBank.d214 DDI-DrugBank.d193 DDI-DrugBank.d152 DDI-MedLine.d40 DDI-DrugBank.d535 DDI-DrugBank.d167 DDI-MedLine.d108 DDI-DrugBank.d445 DDI-DrugBank.d235 DDI-DrugBank.d317 DDI-DrugBank.d251 DDI-DrugBank.d496 DDI-DrugBank.d117 DDI-DrugBank.d203 DDI-DrugBank.d532 DDI-DrugBank.d361 DDI-DrugBank.d294 DDI-MedLine.d37 DDI-MedLine.d72 DDI-MedLine.d95 DDI-DrugBank.d280 DDI-MedLine.d26 DDI-MedLine.d74 DDI-DrugBank.d407 DDI-DrugBank.d343 DDI-DrugBank.d209 DDI-DrugBank.d159 DDI-DrugBank.d239 DDI-DrugBank.d155 DDI-DrugBank.d474 DDI-DrugBank.d271 DDI-DrugBank.d403 DDI-DrugBank.d447 DDI-MedLine.d136 DDI-DrugBank.d90 DDI-DrugBank.d136 DDI-MedLine.d41 DDI-DrugBank.d292 DDI-DrugBank.d1 DDI-DrugBank.d92 DDI-DrugBank.d127 \n",
"664 samples in ../../data/DDI/raw/train.json has been processed with 195 samples has no triples extracted.\n",
"Following PMID in ../../data/DDI/raw/valid.json has no extracted triples:\n",
"DDI-DrugBank.d348 DDI-DrugBank.d520 DDI-DrugBank.d248 DDI-MedLine.d122 DDI-MedLine.d103 DDI-MedLine.d35 DDI-MedLine.d24 DDI-DrugBank.d169 DDI-DrugBank.d221 \n",
"50 samples in ../../data/DDI/raw/valid.json has been processed with 9 samples has no triples extracted.\n",
"191 samples in ../../data/DDI/raw/test.json has been processed with 0 samples has no triples extracted.\n",
"Preprocessing train\n",
"Tokenizer Version 1.1\n",
"Language: en\n",
"Number of threads: 8\n",
"Tokenizer Version 1.1\n",
"Language: en\n",
"Number of threads: 8\n",
"Loading codes from ../../data/DDI/raw/bpecodes ...\n",
"Read 40000 codes from the codes file.\n",
"Loading vocabulary from ../../data/DDI/raw/relis_train.tok.x ...\n",
"Read 116252 words (7707 unique) from text file.\n",
"Applying BPE to ../../data/DDI/raw/relis_train.tok.x ...\n",
"Modified 116252 words from text file.\n",
"Loading codes from ../../data/DDI/raw/bpecodes ...\n",
"Read 40000 codes from the codes file.\n",
"Loading vocabulary from ../../data/DDI/raw/relis_train.tok.y ...\n",
"Read 34391 words (1364 unique) from text file.\n",
"Applying BPE to ../../data/DDI/raw/relis_train.tok.y ...\n",
"Modified 34391 words from text file.\n",
"Preprocessing valid\n",
"Tokenizer Version 1.1\n",
"Language: en\n",
"Number of threads: 8\n",
"Tokenizer Version 1.1\n",
"Language: en\n",
"Number of threads: 8\n",
"Loading codes from ../../data/DDI/raw/bpecodes ...\n",
"Read 40000 codes from the codes file.\n",
"Loading vocabulary from ../../data/DDI/raw/relis_valid.tok.x ...\n",
"Read 10902 words (1974 unique) from text file.\n",
"Applying BPE to ../../data/DDI/raw/relis_valid.tok.x ...\n",
"Modified 10902 words from text file.\n",
"Loading codes from ../../data/DDI/raw/bpecodes ...\n",
"Read 40000 codes from the codes file.\n",
"Loading vocabulary from ../../data/DDI/raw/relis_valid.tok.y ...\n",
"Read 2976 words (266 unique) from text file.\n",
"Applying BPE to ../../data/DDI/raw/relis_valid.tok.y ...\n",
"Modified 2976 words from text file.\n",
"Preprocessing test\n",
"Tokenizer Version 1.1\n",
"Language: en\n",
"Number of threads: 8\n",
"Tokenizer Version 1.1\n",
"Language: en\n",
"Number of threads: 8\n",
"Loading codes from ../../data/DDI/raw/bpecodes ...\n",
"Read 40000 codes from the codes file.\n",
"Loading vocabulary from ../../data/DDI/raw/relis_test.tok.x ...\n",
"Read 30412 words (4124 unique) from text file.\n",
"Applying BPE to ../../data/DDI/raw/relis_test.tok.x ...\n",
"Modified 30412 words from text file.\n",
"Loading codes from ../../data/DDI/raw/bpecodes ...\n",
"Read 40000 codes from the codes file.\n",
"Loading vocabulary from ../../data/DDI/raw/relis_test.tok.y ...\n",
"Read 9094 words (703 unique) from text file.\n",
"Applying BPE to ../../data/DDI/raw/relis_test.tok.y ...\n",
"Modified 9094 words from text file.\n",
"2023-03-10 06:09:06 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n",
"2023-03-10 06:09:07 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None, optimizer=None, lr_scheduler='fixed', scoring='bleu', task='translation', source_lang='x', target_lang='y', trainpref='../../data/DDI/raw/relis_train.tok.bpe', validpref='../../data/DDI/raw/relis_valid.tok.bpe', testpref='../../data/DDI/raw/relis_test.tok.bpe', align_suffix=None, destdir='../../data/DDI/relis-bin', thresholdtgt=0, thresholdsrc=0, tgtdict=None, srcdict='../../data/DDI/raw/dict.txt', nwordstgt=-1, nwordssrc=-1, alignfile=None, dataset_impl='mmap', joined_dictionary=True, only_source=False, padding_factor=8, workers=8, dict_only=False)\n",
"2023-03-10 06:09:07 | INFO | fairseq_cli.preprocess | [x] Dictionary: 42384 types\n",
"2023-03-10 06:09:07 | INFO | fairseq_cli.preprocess | [x] ../../data/DDI/raw/relis_train.tok.bpe.x: 469 sents, 139695 tokens, 0.0% replaced (by <unk>)\n",
"2023-03-10 06:09:07 | INFO | fairseq_cli.preprocess | [x] Dictionary: 42384 types\n",
"2023-03-10 06:09:07 | INFO | fairseq_cli.preprocess | [x] ../../data/DDI/raw/relis_valid.tok.bpe.x: 41 sents, 12789 tokens, 0.0% replaced (by <unk>)\n",
"2023-03-10 06:09:07 | INFO | fairseq_cli.preprocess | [x] Dictionary: 42384 types\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [x] ../../data/DDI/raw/relis_test.tok.bpe.x: 191 sents, 36514 tokens, 0.0% replaced (by <unk>)\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [y] Dictionary: 42384 types\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [y] ../../data/DDI/raw/relis_train.tok.bpe.y: 469 sents, 41376 tokens, 0.0% replaced (by <unk>)\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [y] Dictionary: 42384 types\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [y] ../../data/DDI/raw/relis_valid.tok.bpe.y: 41 sents, 3472 tokens, 0.0% replaced (by <unk>)\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [y] Dictionary: 42384 types\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | [y] ../../data/DDI/raw/relis_test.tok.bpe.y: 191 sents, 11107 tokens, 0.0% replaced (by <unk>)\n",
"2023-03-10 06:09:08 | INFO | fairseq_cli.preprocess | Wrote preprocessed data to ../../data/DDI/relis-bin\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## RUN IT"
],
"metadata": {
"id": "o8bmraypARoG"
}
},
{
"cell_type": "code",
"source": [
"!mkdir /content/BioGPT/data/DDI/relis-bin "
],
"metadata": {
"id": "4l9a4xVDBHxR",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "6b28cba7-568d-4c56-9421-fb808d6df296"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"mkdir: cannot create directory ‘/content/BioGPT/data/DDI/relis-bin’: File exists\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd /content/BioGPT"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KERW2JFJBgfO",
"outputId": "d0eb5315-0a20-4f27-fc53-0639ae297025"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install fastBPE"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2TQsBmRMl3kO",
"outputId": "1cbec7ef-77b9-4014-bd16-55b53da130df"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting fastBPE\n",
" Downloading fastBPE-0.1.0.tar.gz (35 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Building wheels for collected packages: fastBPE\n",
" Building wheel for fastBPE (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for fastBPE: filename=fastBPE-0.1.0-cp39-cp39-linux_x86_64.whl size=762543 sha256=537ed941d5a2ec01964facec84ec62767a06b1dfdc23d249c9352caab33629fc\n",
" Stored in directory: /root/.cache/pip/wheels/e1/10/20/0691b69b472ff8530a7e608674d5bd1cbc772f4d6071c8accf\n",
"Successfully built fastBPE\n",
"Installing collected packages: fastBPE\n",
"Successfully installed fastBPE-0.1.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"class MyClass:\n",
" beam = 1\n",
"args = MyClass()\n",
"print(args.beam)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zqMcghHkJyuy",
"outputId": "74df0aa9-ad90-4552-ffcf-d3fdb09e6a79"
},
"execution_count": 29,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"text3 = 'The increased prevalence of obesity and diabetes, with the attendant increase in morbidity and mortality, pose a substantial therapeutic challenge. Genetic screens in lower organisms provide evidence that gain-of-function of the deacetylase Sir2 results in beneficial metabolic effects and lifespan extension. Sirtuin agonists increase metabolic efficiency in rodents through a mechanism bearing similarity with calorie restriction. However, the specificity of these compounds remains undefined.'"
],
"metadata": {
"id": "7MAy9Q8-D5T2"
},
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title DDI\n",
"!pwd\n",
"%cd /content/BioGPT\n",
"from src.transformer_lm_prompt import TransformerLanguageModelPrompt\n",
"\n",
"m = TransformerLanguageModelPrompt.from_pretrained(\n",
" \"checkpoints/RE-DDI-BioGPT\", \n",
" \"checkpoint_avg.pt\", \n",
" \"data/DDI/relis-bin\",\n",
" tokenizer='moses', \n",
" bpe='fastbpe', \n",
" bpe_codes=\"data/bpecodes\",\n",
" max_len_b=1024,\n",
" beam=1)\n",
"m.cuda()\n",
"src_text= text3 # input text, e.g., a PubMed abstract\n",
"src_tokens = m.encode(src_text)\n",
"generate = m.generate([src_tokens], beam=args.beam)[0]\n",
"output = m.decode(generate[0][\"tokens\"])\n",
"print(output)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "la8qhfvHBUHy",
"outputId": "d0655e16-581b-46ee-dc29-b15c09367d47"
},
"execution_count": 30,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/BioGPT\n",
"/content/BioGPT\n",
"The increased prevalence of obesity and diabetes, with the attendant increase in morbidity and mortality, pose a substantial therapeutic challenge. Genetic screens in lower organisms provide evidence that gain-of-function of the deacetylase Sir2 results in beneficial metabolic effects and lifespan extension. Sirtuin agonists increase metabolic efficiency in rodents through a mechanism bearing similarity with calorie restriction. However, the specificity of these compounds remains undefined. learned1 learned2 learned3 learned4 learned5 learned6 learned7 learned8 learned9 the interaction between Sir2 and calorie restriction is effect; the interaction between Sir2 and antidiabetic agents is effect.\n"
]
}
]
},
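{
"cell_type": "markdown",
"source": [
"A minimal sketch (untested; it assumes the fixed relation template `the interaction between A and B is R` seen in the generated output above) of turning the model's output string into triples:\n",
"\n",
"```python\n",
"import re\n",
"\n",
"# Pull (drug_a, drug_b, relation_type) triples out of the generated text.\n",
"triples = re.findall(r'the interaction between (.+?) and (.+?) is (\\w+)', output)\n",
"print(triples)\n",
"```"
],
"metadata": {
"id": "ddi-triples-sketch"
}
},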
{
"cell_type": "markdown",
"source": [
"Some output: \"The increased prevalence of obesity and diabetes, with the attendant increase in morbidity and mortality, pose a substantial therapeutic challenge. Genetic screens in lower organisms provide evidence that gain-of-function of the deacetylase Sir2 results in beneficial metabolic effects and lifespan extension. Sirtuin agonists increase metabolic efficiency in rodents through a mechanism bearing similarity with calorie restriction. However, the specificity of these compounds remains undefined. learned1 learned2 learned3 learned4 learned5 learned6 learned7 learned8 learned9 the interaction between Sir2 and calorie restriction is effect; the interaction between Sir2 and antidiabetic agents is effect; the interaction between Sir2 and cns is effect"
],
"metadata": {
"id": "RxgGOY5YLfWf"
}
},
{
"cell_type": "markdown",
"source": [
"## clear system RAM\n",
"(probably not needed)"
],
"metadata": {
"id": "1Ss8zW9jeNt7"
}
},
{
"cell_type": "code",
"source": [
"# !nvidia-smi\n",
"# !kill process_id\n",
"# Garbage Collector - use it like gc.collect()\n",
"import gc\n",
"import tensorflow as tf\n",
"# Custom Callback To Include in Callbacks List At Training Time\n",
"class GarbageCollectorCallback(tf.keras.callbacks.Callback):\n",
" def on_epoch_end(self, epoch, logs=None):\n",
" gc.collect()"
],
"metadata": {
"id": "nYQ865TkJUd8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"gc.collect()\n",
"# Custom Callback To Include in Callbacks List At Training Time\n",
"class GarbageCollectorCallback(tf.keras.callbacks.Callback):\n",
" def on_epoch_end(self, epoch, logs=None):\n",
" gc.collect()"
],
"metadata": {
"id": "5tTBe23JLBTq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from tensorflow.keras import backend as K\n",
"K.clear_session()"
],
"metadata": {
"id": "Yngzfvo3LbyI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# # tf.reset_default_graph()\n",
"# !nvidia-smi\n",
"# torch.cuda.empty_cache()\n",
"!pip install numba\n",
"\n",
"from numba import cuda \n",
"device = cuda.get_current_device()\n",
"device.reset()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "k4_Vi2xQLixZ",
"outputId": "1ae6bc2e-12e1-425a-b454-2010f6529c0d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: numba in /usr/local/lib/python3.8/dist-packages (0.56.4)\n",
"Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.8/dist-packages (from numba) (0.39.1)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.8/dist-packages (from numba) (6.0.0)\n",
"Requirement already satisfied: numpy<1.24,>=1.18 in /usr/local/lib/python3.8/dist-packages (from numba) (1.21.6)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from numba) (57.4.0)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.8/dist-packages (from importlib-metadata->numba) (3.12.0)\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## 3. Huggingface-version\n",
"\n",
"I didn't implement this yet. But here some resources.\n",
"\n",
".\n",
"\n",
".\n",
"\n",
"https://huggingface.co/docs/transformers/model_doc/biogpt\n",
"https://github.com/huggingface/transformers/blob/main/src/transformers/models/biogpt/tokenization_biogpt.py\n",
"\n",
"https://huggingface.co/docs/transformers/notebooks\n",
"\n",
"https://github.com/huggingface/notebooks/tree/main/transformers_doc/en\n"
],
"metadata": {
"id": "XpvMW7LVUteY"
}
}
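,
{
"cell_type": "markdown",
"source": [
"A minimal sketch of the Huggingface route (untested here; assumes `transformers` and `sacremoses` are installed; the `text-generation` pipeline with model id `microsoft/biogpt` is taken from the model doc linked above):\n",
"\n",
"```python\n",
"from transformers import pipeline\n",
"\n",
"# Short continuation, just to sanity-check that the model loads and generates.\n",
"generator = pipeline('text-generation', model='microsoft/biogpt')\n",
"print(generator('COVID-19 is', max_length=20, num_return_sequences=1))\n",
"```"
],
"metadata": {
"id": "hf-biogpt-sketch"
}
}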
]
}