Last active
February 17, 2023 08:12
-
-
Save raven44099/ffa83daeb4aedc925edf65e039e2d6fd to your computer and use it in GitHub Desktop.
biogpt_preprocess_sh.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"machine_shape": "hm", | |
"authorship_tag": "ABX9TyMT0+URsVy0AyReHhmLBxP6", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU", | |
"gpuClass": "standard" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/raven44099/ffa83daeb4aedc925edf65e039e2d6fd/biogpt_preprocess_sh.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"## start\n", | |
"https://github.com/microsoft/BioGPT" | |
], | |
"metadata": { | |
"id": "HoJ61YPQ_IxB" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 36 | |
}, | |
"id": "em4vMY2N_BmE", | |
"outputId": "a85e3cfc-be94-4f4e-a38e-0b0dd2b6b6f2" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'1.13.1+cu116'" | |
], | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
} | |
}, | |
"metadata": {}, | |
"execution_count": 1 | |
} | |
], | |
"source": [ | |
"import torch\n", | |
"torch.__version__" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# !pip install fairseq\n", | |
"\n", | |
"!git clone https://github.com/pytorch/fairseq\n", | |
"%cd fairseq\n", | |
"!git checkout v0.12.0\n", | |
"!pip install .\n", | |
"!python setup.py build_ext --inplace\n", | |
"%cd .." | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "lQ7hdoaD_UUI", | |
"outputId": "d6a88761-d1e5-4d2b-9b56-909ac9ae6f16" | |
}, | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cloning into 'fairseq'...\n", | |
"remote: Enumerating objects: 34477, done.\u001b[K\n", | |
"remote: Counting objects: 100% (86/86), done.\u001b[K\n", | |
"remote: Compressing objects: 100% (76/76), done.\u001b[K\n", | |
"remote: Total 34477 (delta 22), reused 39 (delta 8), pack-reused 34391\u001b[K\n", | |
"Receiving objects: 100% (34477/34477), 24.03 MiB | 15.62 MiB/s, done.\n", | |
"Resolving deltas: 100% (25034/25034), done.\n", | |
"/content/fairseq\n", | |
"Note: switching to 'v0.12.0'.\n", | |
"\n", | |
"You are in 'detached HEAD' state. You can look around, make experimental\n", | |
"changes and commit them, and you can discard any commits you make in this\n", | |
"state without impacting any branches by switching back to a branch.\n", | |
"\n", | |
"If you want to create a new branch to retain commits you create, you may\n", | |
"do so (now or later) by using -c with the switch command. Example:\n", | |
"\n", | |
" git switch -c <new-branch-name>\n", | |
"\n", | |
"Or undo this operation with:\n", | |
"\n", | |
" git switch -\n", | |
"\n", | |
"Turn off this advice by setting config variable advice.detachedHead to false\n", | |
"\n", | |
"HEAD is now at 6795311b 0.12.0 release\n", | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Processing /content/fairseq\n", | |
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
"Collecting hydra-core<1.1,>=1.0.7\n", | |
" Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m123.8/123.8 KB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (1.21.6)\n", | |
"Requirement already satisfied: torch in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (1.13.1+cu116)\n", | |
"Requirement already satisfied: cffi in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (1.15.1)\n", | |
"Requirement already satisfied: cython in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (0.29.33)\n", | |
"Requirement already satisfied: torchaudio>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (0.13.1+cu116)\n", | |
"Requirement already satisfied: regex in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (2022.6.2)\n", | |
"Collecting sacrebleu>=1.4.12\n", | |
" Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m118.9/118.9 KB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from fairseq==0.12.0) (4.64.1)\n", | |
"Collecting bitarray\n", | |
" Downloading bitarray-2.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (269 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m269.5/269.5 KB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25hCollecting omegaconf<2.1\n", | |
" Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)\n", | |
"Collecting antlr4-python3-runtime==4.8\n", | |
" Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.4/112.4 KB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
"Requirement already satisfied: importlib-resources in /usr/local/lib/python3.8/dist-packages (from hydra-core<1.1,>=1.0.7->fairseq==0.12.0) (5.10.2)\n", | |
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.8/dist-packages (from omegaconf<2.1->fairseq==0.12.0) (4.4.0)\n", | |
"Requirement already satisfied: PyYAML>=5.1.* in /usr/local/lib/python3.8/dist-packages (from omegaconf<2.1->fairseq==0.12.0) (6.0)\n", | |
"Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.8/dist-packages (from sacrebleu>=1.4.12->fairseq==0.12.0) (0.8.10)\n", | |
"Collecting colorama\n", | |
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", | |
"Collecting portalocker\n", | |
" Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)\n", | |
"Requirement already satisfied: lxml in /usr/local/lib/python3.8/dist-packages (from sacrebleu>=1.4.12->fairseq==0.12.0) (4.9.2)\n", | |
"Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi->fairseq==0.12.0) (2.21)\n", | |
"Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.8/dist-packages (from importlib-resources->hydra-core<1.1,>=1.0.7->fairseq==0.12.0) (3.12.1)\n", | |
"Building wheels for collected packages: fairseq, antlr4-python3-runtime\n", | |
" Building wheel for fairseq (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for fairseq: filename=fairseq-0.12.0-cp38-cp38-linux_x86_64.whl size=19090779 sha256=49f6b9dd0f62e68b052837bf62688b83566c383536ae15664af74abd54771643\n", | |
" Stored in directory: /tmp/pip-ephem-wheel-cache-bi4c2zky/wheels/45/ac/c1/5c3c02c0e0520a71d95d020995fe3cecb9b9185ac4a3832ef6\n", | |
" Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141231 sha256=3992c80cfdf10f234e9cc4624e01ac7a64df041cdb7f28144ce3832c37b69cd6\n", | |
" Stored in directory: /root/.cache/pip/wheels/c8/d0/ab/d43c02eaddc5b9004db86950802442ad9a26f279c619e28da0\n", | |
"Successfully built fairseq antlr4-python3-runtime\n", | |
"Installing collected packages: bitarray, antlr4-python3-runtime, portalocker, omegaconf, colorama, sacrebleu, hydra-core, fairseq\n", | |
"Successfully installed antlr4-python3-runtime-4.8 bitarray-2.7.2 colorama-0.4.6 fairseq-0.12.0 hydra-core-1.0.7 omegaconf-2.0.6 portalocker-2.7.0 sacrebleu-2.3.1\n", | |
"running build_ext\n", | |
"/usr/local/lib/python3.8/dist-packages/torch/utils/cpp_extension.py:476: UserWarning: Attempted to use ninja as the BuildExtension backend but we could not find ninja.. Falling back to using the slow distutils backend.\n", | |
" warnings.warn(msg.format('we could not find ninja.'))\n", | |
"skipping 'fairseq/data/data_utils_fast.cpp' Cython extension (up-to-date)\n", | |
"skipping 'fairseq/data/token_block_utils_fast.cpp' Cython extension (up-to-date)\n", | |
"copying build/lib.linux-x86_64-3.8/fairseq/libbleu.cpython-38-x86_64-linux-gnu.so -> fairseq\n", | |
"copying build/lib.linux-x86_64-3.8/fairseq/data/data_utils_fast.cpython-38-x86_64-linux-gnu.so -> fairseq/data\n", | |
"copying build/lib.linux-x86_64-3.8/fairseq/data/token_block_utils_fast.cpython-38-x86_64-linux-gnu.so -> fairseq/data\n", | |
"copying build/lib.linux-x86_64-3.8/fairseq/libbase.cpython-38-x86_64-linux-gnu.so -> fairseq\n", | |
"copying build/lib.linux-x86_64-3.8/fairseq/libnat.cpython-38-x86_64-linux-gnu.so -> fairseq\n", | |
"copying build/lib.linux-x86_64-3.8/alignment_train_cpu_binding.cpython-38-x86_64-linux-gnu.so -> \n", | |
"/content\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"PWD = '/content'" | |
], | |
"metadata": { | |
"id": "eXUhlZVAFV6Y" | |
}, | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%cd /content\n", | |
"!git clone https://github.com/microsoft/BioGPT.git" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "RY-qzvjkGZEM", | |
"outputId": "95400d22-a655-4cee-bc03-bea155559014" | |
}, | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"/content\n", | |
"Cloning into 'BioGPT'...\n", | |
"remote: Enumerating objects: 341, done.\u001b[K\n", | |
"remote: Counting objects: 100% (70/70), done.\u001b[K\n", | |
"remote: Compressing objects: 100% (33/33), done.\u001b[K\n", | |
"remote: Total 341 (delta 56), reused 37 (delta 37), pack-reused 271\u001b[K\n", | |
"Receiving objects: 100% (341/341), 31.44 MiB | 33.78 MiB/s, done.\n", | |
"Resolving deltas: 100% (175/175), done.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title prepare package Moses\n", | |
"%cd /content/BioGPT\n", | |
"!git clone https://github.com/moses-smt/mosesdecoder.git\n", | |
"# `!export` runs in a throwaway subshell, so the variable would be lost as soon as\n", | |
"# that line finishes. %env sets it on the kernel process instead, and every later\n", | |
"# `!` command (including preprocess.sh) inherits it.\n", | |
"%env MOSES=/content/BioGPT/mosesdecoder" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Jy4XY-zwAesc", | |
"outputId": "53ffd9d8-85a2-4a06-fecd-53c9a4d75b15" | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cloning into 'mosesdecoder'...\n", | |
"remote: Enumerating objects: 148097, done.\u001b[K\n", | |
"remote: Counting objects: 100% (525/525), done.\u001b[K\n", | |
"remote: Compressing objects: 100% (229/229), done.\u001b[K\n", | |
"remote: Total 148097 (delta 323), reused 441 (delta 292), pack-reused 147572\u001b[K\n", | |
"Receiving objects: 100% (148097/148097), 129.88 MiB | 24.27 MiB/s, done.\n", | |
"Resolving deltas: 100% (114349/114349), done.\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"#@title setup package fastBPE\n", | |
"%cd /content/BioGPT\n", | |
"!git clone https://github.com/glample/fastBPE.git\n", | |
"# `!export` only affects the short-lived subshell spawned for that one line, so it\n", | |
"# cannot publish FASTBPE to later cells. %env sets it on the kernel process, which\n", | |
"# all subsequent `!` commands inherit.\n", | |
"%env FASTBPE=/content/BioGPT/fastBPE\n", | |
"%cd fastBPE\n", | |
"# Build the `fast` binary in the repository root.\n", | |
"!g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "XEfxn6icAnYX", | |
"outputId": "50a8f458-97a6-4ea5-c960-b85583afc240" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cloning into 'fastBPE'...\n", | |
"remote: Enumerating objects: 59, done.\u001b[K\n", | |
"Unpacking objects: 1% (1/59)\rUnpacking objects: 3% (2/59)\rUnpacking objects: 5% (3/59)\rUnpacking objects: 6% (4/59)\rUnpacking objects: 8% (5/59)\rUnpacking objects: 10% (6/59)\rUnpacking objects: 11% (7/59)\rUnpacking objects: 13% (8/59)\rUnpacking objects: 15% (9/59)\rUnpacking objects: 16% (10/59)\rUnpacking objects: 18% (11/59)\rUnpacking objects: 20% (12/59)\rUnpacking objects: 22% (13/59)\rUnpacking objects: 23% (14/59)\rUnpacking objects: 25% (15/59)\rUnpacking objects: 27% (16/59)\rUnpacking objects: 28% (17/59)\rUnpacking objects: 30% (18/59)\rUnpacking objects: 32% (19/59)\rUnpacking objects: 33% (20/59)\rUnpacking objects: 35% (21/59)\rUnpacking objects: 37% (22/59)\rremote: Total 59 (delta 0), reused 0 (delta 0), pack-reused 59\u001b[K\n", | |
"Unpacking objects: 38% (23/59)\rUnpacking objects: 40% (24/59)\rUnpacking objects: 42% (25/59)\rUnpacking objects: 44% (26/59)\rUnpacking objects: 45% (27/59)\rUnpacking objects: 47% (28/59)\rUnpacking objects: 49% (29/59)\rUnpacking objects: 50% (30/59)\rUnpacking objects: 52% (31/59)\rUnpacking objects: 54% (32/59)\rUnpacking objects: 55% (33/59)\rUnpacking objects: 57% (34/59)\rUnpacking objects: 59% (35/59)\rUnpacking objects: 61% (36/59)\rUnpacking objects: 62% (37/59)\rUnpacking objects: 64% (38/59)\rUnpacking objects: 66% (39/59)\rUnpacking objects: 67% (40/59)\rUnpacking objects: 69% (41/59)\rUnpacking objects: 71% (42/59)\rUnpacking objects: 72% (43/59)\rUnpacking objects: 74% (44/59)\rUnpacking objects: 76% (45/59)\rUnpacking objects: 77% (46/59)\rUnpacking objects: 79% (47/59)\rUnpacking objects: 81% (48/59)\rUnpacking objects: 83% (49/59)\rUnpacking objects: 84% (50/59)\rUnpacking objects: 86% (51/59)\rUnpacking objects: 88% (52/59)\rUnpacking objects: 89% (53/59)\rUnpacking objects: 91% (54/59)\rUnpacking objects: 93% (55/59)\rUnpacking objects: 94% (56/59)\rUnpacking objects: 96% (57/59)\rUnpacking objects: 98% (58/59)\rUnpacking objects: 100% (59/59)\rUnpacking objects: 100% (59/59), 29.97 KiB | 1.36 MiB/s, done.\n", | |
"/content/fastBPE\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%cd ..\n", | |
"!pip install sacremoses\n", | |
"!pip install scikit-learn" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "whLrxejtBmWg", | |
"outputId": "bec42aec-3e56-411c-83af-c336fe813ebb" | |
}, | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"/content\n", | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Collecting sacremoses\n", | |
" Downloading sacremoses-0.0.53.tar.gz (880 kB)\n", | |
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m880.6/880.6 KB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
"Requirement already satisfied: regex in /usr/local/lib/python3.8/dist-packages (from sacremoses) (2022.6.2)\n", | |
"Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from sacremoses) (1.15.0)\n", | |
"Requirement already satisfied: click in /usr/local/lib/python3.8/dist-packages (from sacremoses) (7.1.2)\n", | |
"Requirement already satisfied: joblib in /usr/local/lib/python3.8/dist-packages (from sacremoses) (1.2.0)\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from sacremoses) (4.64.1)\n", | |
"Building wheels for collected packages: sacremoses\n", | |
" Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
" Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=95920892762075fb73c0e643b17b2e78c4fd077c4bcb2d889cdef48e7ffc80a7\n", | |
" Stored in directory: /root/.cache/pip/wheels/82/ab/9b/c15899bf659ba74f623ac776e861cf2eb8608c1825ddec66a4\n", | |
"Successfully built sacremoses\n", | |
"Installing collected packages: sacremoses\n", | |
"Successfully installed sacremoses-0.0.53\n", | |
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", | |
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.8/dist-packages (1.0.2)\n", | |
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (3.1.0)\n", | |
"Requirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.7.3)\n", | |
"Requirement already satisfied: numpy>=1.14.6 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.21.6)\n", | |
"Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from scikit-learn) (1.2.0)\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# import sacremoses\n", | |
"# import sklearn" | |
], | |
"metadata": { | |
"id": "48HkJKjVGV_H" | |
}, | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%env MOSES=/content/BioGPT/mosesdecoder\n", | |
"%env FASTBPE=/content/BioGPT/fastBPE" | |
], | |
"metadata": { | |
"id": "57Nk1RREZ6S9", | |
"outputId": "bc704d32-54b9-4418-e8e2-ebf8365e3952", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"env: MOSES=/content/BioGPT/mosesdecoder\n", | |
"env: FASTBPE=/content/BioGPT/fastBPE\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!echo $MOSES\n", | |
"!echo $FASTBPE" | |
], | |
"metadata": { | |
"id": "H60IBPQhWVoK", | |
"outputId": "bde68687-efe2-4329-d0c5-858f90d9a23d", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"/content/BioGPT/mosesdecoder\n", | |
"/content/BioGPT/fastBPE\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%cd /content/BioGPT/examples/RE-DDI\n", | |
"!ls\n", | |
"!bash preprocess.sh" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "pyZIqn8RGjX5", | |
"outputId": "ee4eafb3-9fcd-4852-9ff1-22728a5adbe6" | |
}, | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"/content/BioGPT/examples/RE-DDI\n", | |
"hard_match_evaluation.py postprocess.py README.md\t train.sh\n", | |
"infer.sh\t\t preprocess.sh rebuild_data.py\n", | |
"Following PMID in ../../data/DDI/raw/train.json has no extracted triples:\n", | |
"DDI-DrugBank.d519 DDI-MedLine.d18 DDI-DrugBank.d491 DDI-MedLine.d4 DDI-DrugBank.d134 DDI-DrugBank.d230 DDI-DrugBank.d259 DDI-DrugBank.d293 DDI-MedLine.d64 DDI-MedLine.d100 DDI-DrugBank.d295 DDI-DrugBank.d402 DDI-MedLine.d101 DDI-DrugBank.d190 DDI-MedLine.d140 DDI-MedLine.d112 DDI-MedLine.d9 DDI-DrugBank.d301 DDI-DrugBank.d128 DDI-DrugBank.d101 DDI-DrugBank.d28 DDI-DrugBank.d376 DDI-MedLine.d28 DDI-DrugBank.d93 DDI-MedLine.d88 DDI-DrugBank.d539 DDI-DrugBank.d525 DDI-DrugBank.d540 DDI-DrugBank.d461 DDI-MedLine.d132 DDI-DrugBank.d360 DDI-MedLine.d43 DDI-MedLine.d121 DDI-DrugBank.d262 DDI-DrugBank.d164 DDI-DrugBank.d534 DDI-DrugBank.d385 DDI-DrugBank.d408 DDI-MedLine.d96 DDI-DrugBank.d285 DDI-DrugBank.d473 DDI-MedLine.d57 DDI-DrugBank.d557 DDI-DrugBank.d161 DDI-DrugBank.d24 DDI-DrugBank.d67 DDI-DrugBank.d490 DDI-DrugBank.d421 DDI-MedLine.d65 DDI-DrugBank.d342 DDI-DrugBank.d264 DDI-MedLine.d10 DDI-DrugBank.d312 DDI-MedLine.d117 DDI-MedLine.d135 DDI-DrugBank.d255 DDI-DrugBank.d390 DDI-DrugBank.d68 DDI-MedLine.d11 DDI-MedLine.d14 DDI-MedLine.d75 DDI-DrugBank.d541 DDI-DrugBank.d118 DDI-MedLine.d50 DDI-DrugBank.d218 DDI-DrugBank.d370 DDI-DrugBank.d201 DDI-DrugBank.d244 DDI-MedLine.d138 DDI-MedLine.d33 DDI-DrugBank.d553 DDI-DrugBank.d125 DDI-DrugBank.d366 DDI-DrugBank.d147 DDI-MedLine.d71 DDI-DrugBank.d363 DDI-MedLine.d32 DDI-MedLine.d76 DDI-DrugBank.d290 DDI-MedLine.d38 DDI-MedLine.d77 DDI-DrugBank.d80 DDI-DrugBank.d27 DDI-MedLine.d120 DDI-DrugBank.d52 DDI-DrugBank.d302 DDI-DrugBank.d486 DDI-DrugBank.d472 DDI-MedLine.d6 DDI-MedLine.d123 DDI-DrugBank.d173 DDI-DrugBank.d570 DDI-DrugBank.d126 DDI-DrugBank.d156 DDI-MedLine.d13 DDI-MedLine.d91 DDI-DrugBank.d349 DDI-DrugBank.d436 DDI-DrugBank.d300 DDI-DrugBank.d432 DDI-MedLine.d52 DDI-DrugBank.d554 DDI-MedLine.d19 DDI-DrugBank.d109 DDI-DrugBank.d63 DDI-DrugBank.d168 DDI-DrugBank.d37 DDI-DrugBank.d50 DDI-DrugBank.d455 DDI-DrugBank.d70 DDI-MedLine.d48 DDI-DrugBank.d515 DDI-DrugBank.d406 DDI-MedLine.d127 DDI-MedLine.d22 
DDI-DrugBank.d418 DDI-MedLine.d78 DDI-MedLine.d80 DDI-MedLine.d129 DDI-DrugBank.d61 DDI-DrugBank.d524 DDI-DrugBank.d189 DDI-MedLine.d92 DDI-DrugBank.d6 DDI-DrugBank.d278 DDI-MedLine.d66 DDI-DrugBank.d383 DDI-MedLine.d15 DDI-MedLine.d60 DDI-MedLine.d31 DDI-MedLine.d58 DDI-MedLine.d137 DDI-DrugBank.d555 DDI-DrugBank.d58 DDI-DrugBank.d433 DDI-DrugBank.d375 DDI-DrugBank.d102 DDI-DrugBank.d268 DDI-DrugBank.d391 DDI-MedLine.d83 DDI-DrugBank.d243 DDI-DrugBank.d119 DDI-DrugBank.d49 DDI-MedLine.d139 DDI-DrugBank.d513 DDI-DrugBank.d451 DDI-DrugBank.d38 DDI-DrugBank.d182 DDI-MedLine.d118 DDI-DrugBank.d319 DDI-MedLine.d141 DDI-MedLine.d70 DDI-MedLine.d109 DDI-MedLine.d98 DDI-DrugBank.d214 DDI-DrugBank.d193 DDI-DrugBank.d152 DDI-MedLine.d40 DDI-DrugBank.d535 DDI-DrugBank.d167 DDI-MedLine.d108 DDI-DrugBank.d445 DDI-DrugBank.d235 DDI-DrugBank.d317 DDI-DrugBank.d251 DDI-DrugBank.d496 DDI-DrugBank.d117 DDI-DrugBank.d203 DDI-DrugBank.d532 DDI-DrugBank.d361 DDI-DrugBank.d294 DDI-MedLine.d37 DDI-MedLine.d72 DDI-MedLine.d95 DDI-DrugBank.d280 DDI-MedLine.d26 DDI-MedLine.d74 DDI-DrugBank.d407 DDI-DrugBank.d343 DDI-DrugBank.d209 DDI-DrugBank.d159 DDI-DrugBank.d239 DDI-DrugBank.d155 DDI-DrugBank.d474 DDI-DrugBank.d271 DDI-DrugBank.d403 DDI-DrugBank.d447 DDI-MedLine.d136 DDI-DrugBank.d90 DDI-DrugBank.d136 DDI-MedLine.d41 DDI-DrugBank.d292 DDI-DrugBank.d1 DDI-DrugBank.d92 DDI-DrugBank.d127 \n", | |
"664 samples in ../../data/DDI/raw/train.json has been processed with 195 samples has no triples extracted.\n", | |
"Following PMID in ../../data/DDI/raw/valid.json has no extracted triples:\n", | |
"DDI-DrugBank.d348 DDI-DrugBank.d520 DDI-DrugBank.d248 DDI-MedLine.d122 DDI-MedLine.d103 DDI-MedLine.d35 DDI-MedLine.d24 DDI-DrugBank.d169 DDI-DrugBank.d221 \n", | |
"50 samples in ../../data/DDI/raw/valid.json has been processed with 9 samples has no triples extracted.\n", | |
"191 samples in ../../data/DDI/raw/test.json has been processed with 0 samples has no triples extracted.\n", | |
"Preprocessing train\n", | |
"Tokenizer Version 1.1\n", | |
"Language: en\n", | |
"Number of threads: 8\n", | |
"Tokenizer Version 1.1\n", | |
"Language: en\n", | |
"Number of threads: 8\n", | |
"Loading codes from ../../data/DDI/raw/bpecodes ...\n", | |
"Read 40000 codes from the codes file.\n", | |
"Loading vocabulary from ../../data/DDI/raw/relis_train.tok.x ...\n", | |
"Read 116252 words (7707 unique) from text file.\n", | |
"Applying BPE to ../../data/DDI/raw/relis_train.tok.x ...\n", | |
"Modified 116252 words from text file.\n", | |
"Loading codes from ../../data/DDI/raw/bpecodes ...\n", | |
"Read 40000 codes from the codes file.\n", | |
"Loading vocabulary from ../../data/DDI/raw/relis_train.tok.y ...\n", | |
"Read 34391 words (1364 unique) from text file.\n", | |
"Applying BPE to ../../data/DDI/raw/relis_train.tok.y ...\n", | |
"Modified 34391 words from text file.\n", | |
"Preprocessing valid\n", | |
"Tokenizer Version 1.1\n", | |
"Language: en\n", | |
"Number of threads: 8\n", | |
"Tokenizer Version 1.1\n", | |
"Language: en\n", | |
"Number of threads: 8\n", | |
"Loading codes from ../../data/DDI/raw/bpecodes ...\n", | |
"Read 40000 codes from the codes file.\n", | |
"Loading vocabulary from ../../data/DDI/raw/relis_valid.tok.x ...\n", | |
"Read 10902 words (1974 unique) from text file.\n", | |
"Applying BPE to ../../data/DDI/raw/relis_valid.tok.x ...\n", | |
"Modified 10902 words from text file.\n", | |
"Loading codes from ../../data/DDI/raw/bpecodes ...\n", | |
"Read 40000 codes from the codes file.\n", | |
"Loading vocabulary from ../../data/DDI/raw/relis_valid.tok.y ...\n", | |
"Read 2976 words (266 unique) from text file.\n", | |
"Applying BPE to ../../data/DDI/raw/relis_valid.tok.y ...\n", | |
"Modified 2976 words from text file.\n", | |
"Preprocessing test\n", | |
"Tokenizer Version 1.1\n", | |
"Language: en\n", | |
"Number of threads: 8\n", | |
"Tokenizer Version 1.1\n", | |
"Language: en\n", | |
"Number of threads: 8\n", | |
"Loading codes from ../../data/DDI/raw/bpecodes ...\n", | |
"Read 40000 codes from the codes file.\n", | |
"Loading vocabulary from ../../data/DDI/raw/relis_test.tok.x ...\n", | |
"Read 30412 words (4124 unique) from text file.\n", | |
"Applying BPE to ../../data/DDI/raw/relis_test.tok.x ...\n", | |
"Modified 30412 words from text file.\n", | |
"Loading codes from ../../data/DDI/raw/bpecodes ...\n", | |
"Read 40000 codes from the codes file.\n", | |
"Loading vocabulary from ../../data/DDI/raw/relis_test.tok.y ...\n", | |
"Read 9094 words (703 unique) from text file.\n", | |
"Applying BPE to ../../data/DDI/raw/relis_test.tok.y ...\n", | |
"Modified 9094 words from text file.\n", | |
"2023-02-17 08:08:05 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX\n", | |
"2023-02-17 08:08:05 | INFO | fairseq_cli.preprocess | Namespace(aim_repo=None, aim_run_hash=None, align_suffix=None, alignfile=None, all_gather_list_size=16384, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, azureml_logging=False, bf16=False, bpe=None, cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='../../data/DDI/relis-bin', dict_only=False, empty_cache_freq=0, fp16=False, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=True, log_file=None, log_format=None, log_interval=100, lr_scheduler='fixed', memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, model_parallel_size=1, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, on_cpu_convert_precision=False, only_source=False, optimizer=None, padding_factor=8, plasma_path='/tmp/plasma', profile=False, quantization_config_path=None, reset_logging=False, scoring='bleu', seed=1, source_lang='x', srcdict='../../data/DDI/raw/dict.txt', suppress_crashes=False, target_lang='y', task='translation', tensorboard_logdir=None, testpref='../../data/DDI/raw/relis_test.tok.bpe', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, tpu=False, trainpref='../../data/DDI/raw/relis_train.tok.bpe', use_plasma_view=False, user_dir=None, validpref='../../data/DDI/raw/relis_valid.tok.bpe', wandb_project=None, workers=8)\n", | |
"2023-02-17 08:08:05 | INFO | fairseq_cli.preprocess | [x] Dictionary: 42384 types\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [x] ../../data/DDI/raw/relis_train.tok.bpe.x: 469 sents, 139695 tokens, 0.0% replaced (by <unk>)\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [x] Dictionary: 42384 types\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [x] ../../data/DDI/raw/relis_valid.tok.bpe.x: 41 sents, 12789 tokens, 0.0% replaced (by <unk>)\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [x] Dictionary: 42384 types\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [x] ../../data/DDI/raw/relis_test.tok.bpe.x: 191 sents, 36514 tokens, 0.0% replaced (by <unk>)\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [y] Dictionary: 42384 types\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [y] ../../data/DDI/raw/relis_train.tok.bpe.y: 469 sents, 41376 tokens, 0.0% replaced (by <unk>)\n", | |
"2023-02-17 08:08:06 | INFO | fairseq_cli.preprocess | [y] Dictionary: 42384 types\n", | |
"2023-02-17 08:08:07 | INFO | fairseq_cli.preprocess | [y] ../../data/DDI/raw/relis_valid.tok.bpe.y: 41 sents, 3472 tokens, 0.0% replaced (by <unk>)\n", | |
"2023-02-17 08:08:07 | INFO | fairseq_cli.preprocess | [y] Dictionary: 42384 types\n", | |
"2023-02-17 08:08:07 | INFO | fairseq_cli.preprocess | [y] ../../data/DDI/raw/relis_test.tok.bpe.y: 191 sents, 11107 tokens, 0.0% replaced (by <unk>)\n", | |
"2023-02-17 08:08:07 | INFO | fairseq_cli.preprocess | Wrote preprocessed data to ../../data/DDI/relis-bin\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment