110CodingP · October 16, 2024 15:12
diff --git a/dataset1_knn_gaussian.ipynb b/dataset1_knn_gaussian.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyPAAz1LnYVeWXT+tSQXLxy6",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/110CodingP/fb791830a025e1f366424e0df217ea51/dataset1_knn_gaussian.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "RQ4kD8nee0Nw"
      },
      "outputs": [],
      "source": [
        "# The Kaggle CLI is used further down to download the dataset.\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install -q kaggle"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import files\n",
        "\n",
        "# Upload kaggle.json (the Kaggle API token).\n",
        "# The result is bound to a name on purpose: leaving files.upload() as the last bare\n",
        "# expression makes the notebook echo the uploaded bytes, i.e. the API key, into the\n",
        "# saved cell output.\n",
        "uploaded = files.upload()"
      ],
      "metadata": {
        "id": "0BFTikjYf-lY",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 92
        },
        "outputId": "9b4ead35-9a67-4fe2-ecb4-ee670805ceb8"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "     <input type=\"file\" id=\"files-7765420d-275f-4a85-aa3f-ba7227ecc9a2\" name=\"files[]\" multiple disabled\n",
              "        style=\"border:none\" />\n",
              "     <output id=\"result-7765420d-275f-4a85-aa3f-ba7227ecc9a2\">\n",
              "      Upload widget is only available when the cell has been executed in the\n",
              "      current browser session. Please rerun this cell to enable.\n",
              "      </output>\n",
              "      <script>// Copyright 2017 Google LLC\n",
              "//\n",
              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
              "// you may not use this file except in compliance with the License.\n",
              "// You may obtain a copy of the License at\n",
              "//\n",
              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
              "//\n",
              "// Unless required by applicable law or agreed to in writing, software\n",
              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
              "// See the License for the specific language governing permissions and\n",
              "// limitations under the License.\n",
              "\n",
              "/**\n",
              " * @fileoverview Helpers for google.colab Python module.\n",
              " */\n",
              "(function(scope) {\n",
              "function span(text, styleAttributes = {}) {\n",
              "  const element = document.createElement('span');\n",
              "  element.textContent = text;\n",
              "  for (const key of Object.keys(styleAttributes)) {\n",
              "    element.style[key] = styleAttributes[key];\n",
              "  }\n",
              "  return element;\n",
              "}\n",
              "\n",
              "// Max number of bytes which will be uploaded at a time.\n",
              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
              "\n",
              "function _uploadFiles(inputId, outputId) {\n",
              "  const steps = uploadFilesStep(inputId, outputId);\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  // Cache steps on the outputElement to make it available for the next call\n",
              "  // to uploadFilesContinue from Python.\n",
              "  outputElement.steps = steps;\n",
              "\n",
              "  return _uploadFilesContinue(outputId);\n",
              "}\n",
              "\n",
              "// This is roughly an async generator (not supported in the browser yet),\n",
              "// where there are multiple asynchronous steps and the Python side is going\n",
              "// to poll for completion of each step.\n",
              "// This uses a Promise to block the python side on completion of each step,\n",
              "// then passes the result of the previous step as the input to the next step.\n",
              "function _uploadFilesContinue(outputId) {\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  const steps = outputElement.steps;\n",
              "\n",
              "  const next = steps.next(outputElement.lastPromiseValue);\n",
              "  return Promise.resolve(next.value.promise).then((value) => {\n",
              "    // Cache the last promise value to make it available to the next\n",
              "    // step of the generator.\n",
              "    outputElement.lastPromiseValue = value;\n",
              "    return next.value.response;\n",
              "  });\n",
              "}\n",
              "\n",
              "/**\n",
              " * Generator function which is called between each async step of the upload\n",
              " * process.\n",
              " * @param {string} inputId Element ID of the input file picker element.\n",
              " * @param {string} outputId Element ID of the output display.\n",
              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
              " */\n",
              "function* uploadFilesStep(inputId, outputId) {\n",
              "  const inputElement = document.getElementById(inputId);\n",
              "  inputElement.disabled = false;\n",
              "\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  outputElement.innerHTML = '';\n",
              "\n",
              "  const pickedPromise = new Promise((resolve) => {\n",
              "    inputElement.addEventListener('change', (e) => {\n",
              "      resolve(e.target.files);\n",
              "    });\n",
              "  });\n",
              "\n",
              "  const cancel = document.createElement('button');\n",
              "  inputElement.parentElement.appendChild(cancel);\n",
              "  cancel.textContent = 'Cancel upload';\n",
              "  const cancelPromise = new Promise((resolve) => {\n",
              "    cancel.onclick = () => {\n",
              "      resolve(null);\n",
              "    };\n",
              "  });\n",
              "\n",
              "  // Wait for the user to pick the files.\n",
              "  const files = yield {\n",
              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
              "    response: {\n",
              "      action: 'starting',\n",
              "    }\n",
              "  };\n",
              "\n",
              "  cancel.remove();\n",
              "\n",
              "  // Disable the input element since further picks are not allowed.\n",
              "  inputElement.disabled = true;\n",
              "\n",
              "  if (!files) {\n",
              "    return {\n",
              "      response: {\n",
              "        action: 'complete',\n",
              "      }\n",
              "    };\n",
              "  }\n",
              "\n",
              "  for (const file of files) {\n",
              "    const li = document.createElement('li');\n",
              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
              "    li.append(span(\n",
              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
              "        `last modified: ${\n",
              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
              "                                    'n/a'} - `));\n",
              "    const percent = span('0% done');\n",
              "    li.appendChild(percent);\n",
              "\n",
              "    outputElement.appendChild(li);\n",
              "\n",
              "    const fileDataPromise = new Promise((resolve) => {\n",
              "      const reader = new FileReader();\n",
              "      reader.onload = (e) => {\n",
              "        resolve(e.target.result);\n",
              "      };\n",
              "      reader.readAsArrayBuffer(file);\n",
              "    });\n",
              "    // Wait for the data to be ready.\n",
              "    let fileData = yield {\n",
              "      promise: fileDataPromise,\n",
              "      response: {\n",
              "        action: 'continue',\n",
              "      }\n",
              "    };\n",
              "\n",
              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
              "    let position = 0;\n",
              "    do {\n",
              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
              "      const chunk = new Uint8Array(fileData, position, length);\n",
              "      position += length;\n",
              "\n",
              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
              "      yield {\n",
              "        response: {\n",
              "          action: 'append',\n",
              "          file: file.name,\n",
              "          data: base64,\n",
              "        },\n",
              "      };\n",
              "\n",
              "      let percentDone = fileData.byteLength === 0 ?\n",
              "          100 :\n",
              "          Math.round((position / fileData.byteLength) * 100);\n",
              "      percent.textContent = `${percentDone}% done`;\n",
              "\n",
              "    } while (position < fileData.byteLength);\n",
              "  }\n",
              "\n",
              "  // All done.\n",
              "  yield {\n",
              "    response: {\n",
              "      action: 'complete',\n",
              "    }\n",
              "  };\n",
              "}\n",
              "\n",
              "scope.google = scope.google || {};\n",
              "scope.google.colab = scope.google.colab || {};\n",
              "scope.google.colab._files = {\n",
              "  _uploadFiles,\n",
              "  _uploadFilesContinue,\n",
              "};\n",
              "})(self);\n",
              "</script> "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saving kaggle.json to kaggle.json\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'kaggle.json': b'{\"username\":\"codingp110\",\"key\":\"<REDACTED>\"}'}"
            ]
          },
          "metadata": {},
          "execution_count": 2
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# -p keeps this cell re-runnable: no error when ~/.kaggle already exists.\n",
        "! mkdir -p ~/.kaggle\n",
        "\n",
        "# Copy the uploaded token into that directory.\n",
        "! cp kaggle.json ~/.kaggle/"
      ],
      "metadata": {
        "id": "yMXfjphbgGxV"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Mode 600 = read/write for the owner only; kaggle.json holds an API credential.\n",
        "! chmod 600 ~/.kaggle/kaggle.json"
      ],
      "metadata": {
        "id": "l9PCFxiYgJBB"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Download the codingp110/emoticons dataset archive (emoticons.zip) with the Kaggle CLI.\n",
        "! kaggle datasets download codingp110/emoticons"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qzCwlx1wgLn6",
        "outputId": "5e63512a-29b6-469e-e313-494cc18dc7b0"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Dataset URL: https://www.kaggle.com/datasets/codingp110/emoticons\n",
            "License(s): unknown\n",
            "Downloading emoticons.zip to /content\n",
            "100% 133k/133k [00:00<00:00, 415kB/s]\n",
            "100% 133k/133k [00:00<00:00, 415kB/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# -o overwrites already-extracted files without asking, so re-running the cell does\n",
        "# not stop at unzip's interactive \"replace?\" prompt.\n",
        "! unzip -o /content/emoticons.zip"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "g0T-GLsLgu5b",
        "outputId": "818d8859-a103-42dd-d7a1-df65e21150fd"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Archive:  /content/emoticons.zip\n",
            "  inflating: test_emoticon.csv       \n",
            "  inflating: train_emoticon.csv      \n",
            "  inflating: valid_emoticon.csv      \n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Pinned to the version this notebook was last run with, so a re-run installs the same\n",
        "# package; -q keeps the resolver log out of the saved output.\n",
        "%pip install -q catboost==1.2.7"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Uz4ds1crg0Ln",
        "outputId": "03830e62-474a-4879-a252-88bacece6f63"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting catboost\n",
            "  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)\n",
            "Requirement already satisfied: graphviz in /usr/local/lib/python3.10/dist-packages (from catboost) (0.20.3)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from catboost) (3.7.1)\n",
            "Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from catboost) (1.26.4)\n",
            "Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.10/dist-packages (from catboost) (2.2.2)\n",
            "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from catboost) (1.13.1)\n",
            "Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (from catboost) (5.24.1)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from catboost) (1.16.0)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2.8.2)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)\n",
            "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)\n",
            "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.3.0)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (0.12.1)\n",
            "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (4.54.1)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.4.7)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (24.1)\n",
            "Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (10.4.0)\n",
            "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (3.1.4)\n",
            "Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly->catboost) (9.0.0)\n",
            "Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 MB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: catboost\n",
            "Successfully installed catboost-1.2.7\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.model_selection import train_test_split, GridSearchCV\n",
        "from sklearn.preprocessing import (StandardScaler, OneHotEncoder)\n",
        "from sklearn.metrics import accuracy_score\n",
        "import xgboost as xgb\n",
        "from catboost import CatBoostClassifier\n",
        "from lightgbm import LGBMClassifier\n",
        "from sklearn.tree import DecisionTreeClassifier\n",
        "from sklearn.ensemble import (ExtraTreesClassifier,\n",
        "                              GradientBoostingClassifier, AdaBoostClassifier)\n",
        "from sklearn.neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier)\n",
        "from sklearn.mixture   import GaussianMixture\n",
        "from sklearn.decomposition import PCA"
      ],
      "metadata": {
        "id": "701sqYtkg7cP"
      },
      "execution_count": 40,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Read the three CSV splits from the working directory.\n",
        "train_emoticon_df = pd.read_csv('train_emoticon.csv')\n",
        "valid_emoticon_df = pd.read_csv('valid_emoticon.csv')\n",
        "test_emoticon_df = pd.read_csv('test_emoticon.csv')\n",
        "\n",
        "# Targets: the 'label' column of the training and validation splits.\n",
        "train_Y, valid_Y = train_emoticon_df['label'], valid_emoticon_df['label']"
      ],
      "metadata": {
        "id": "kXzBBu0fg-5K"
      },
      "execution_count": 46,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def preprocess_emoticons(emoticons):\n",
        "    \"\"\"Split every emoticon string into a list of its individual characters.\n",
        "\n",
        "    Args:\n",
        "        emoticons: iterable of strings (e.g. the 'input_emoticon' column).\n",
        "\n",
        "    Returns:\n",
        "        A list with one inner list per input string; each inner list holds that\n",
        "        string's characters in order (Python iterates a str by Unicode code point).\n",
        "    \"\"\"\n",
        "    return [list(emoticon) for emoticon in emoticons]"
      ],
      "metadata": {
        "id": "xRA-Fu2AhCqZ"
      },
      "execution_count": 36,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Run both splits through the same character splitter; each character position\n",
        "# becomes one DataFrame column.\n",
        "train_emoticon_X, valid_emoticon_X = (\n",
        "    pd.DataFrame(preprocess_emoticons(split_df['input_emoticon']))\n",
        "    for split_df in (train_emoticon_df, valid_emoticon_df)\n",
        ")"
      ],
      "metadata": {
        "id": "XQgfEwbulr-r"
      },
      "execution_count": 38,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Learn the categories on the training split only. With handle_unknown='ignore',\n",
        "# characters first seen in the validation split do not raise at transform time.\n",
        "enc = OneHotEncoder(handle_unknown='ignore').fit(train_emoticon_X)\n",
        "train_X, valid_X = (\n",
        "    enc.transform(frame) for frame in (train_emoticon_X, valid_emoticon_X)\n",
        ")"
      ],
      "metadata": {
        "id": "DrSxtWkqshjP"
      },
      "execution_count": 42,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Sanity check: (rows, one-hot columns) of the encoded training matrix.\n",
        "train_X.shape"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "CFM-QRr5t67t",
        "outputId": "5e6a9ec7-dc26-49e9-a0e0-a1601cba72b7"
      },
      "execution_count": 44,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(7080, 2159)"
            ]
          },
          "metadata": {},
          "execution_count": 44
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def tune_and_evaluate(model, param_grid, X_train=None, y_train=None,\n",
        "                      X_valid=None, y_valid=None, cv=5):\n",
        "    \"\"\"Grid-search `model` with cross-validation, then score the winner on a hold-out set.\n",
        "\n",
        "    Args:\n",
        "        model: unfitted scikit-learn estimator.\n",
        "        param_grid: parameter grid forwarded to GridSearchCV.\n",
        "        X_train, y_train: training features / labels. When omitted, the notebook-level\n",
        "            train_X / train_Y are used (the original behaviour).\n",
        "        X_valid, y_valid: hold-out features / labels. When omitted, the notebook-level\n",
        "            valid_X / valid_Y are used (the original behaviour).\n",
        "        cv: cross-validation setting forwarded to GridSearchCV (default 5).\n",
        "\n",
        "    Returns:\n",
        "        Tuple (best_params, valid_acc, best_model): the best parameter dict found by the\n",
        "        search, the accuracy of the best estimator on the hold-out set, and that estimator.\n",
        "    \"\"\"\n",
        "    # Explicit `is None` checks instead of truthiness: OneHotEncoder returns a sparse\n",
        "    # matrix by default, and the truth value of such a matrix is ambiguous.\n",
        "    X_train = train_X if X_train is None else X_train\n",
        "    y_train = train_Y if y_train is None else y_train\n",
        "    X_valid = valid_X if X_valid is None else X_valid\n",
        "    y_valid = valid_Y if y_valid is None else y_valid\n",
        "\n",
        "    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)\n",
        "    grid_search.fit(X_train, y_train)\n",
        "    best_model = grid_search.best_estimator_\n",
        "    valid_acc = accuracy_score(y_valid, best_model.predict(X_valid))\n",
        "    return grid_search.best_params_, valid_acc, best_model"
      ],
      "metadata": {
        "id": "6j8xhcXdhMtc"
      },
      "execution_count": 47,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Search space: inverse regularisation strength and optimiser.\n",
        "log_reg_param_grid = {\n",
        "    'C': [0.01, 0.1, 1, 10, 100],\n",
        "    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga']\n",
        "}\n",
        "# random_state pins the data shuffling done by the 'liblinear' and 'saga' solvers, so\n",
        "# the search gives the same result on every Restart & Run All.\n",
        "log_reg = LogisticRegression(max_iter=5000, random_state=42)\n",
        "log_reg_params, log_reg_acc, best_log_reg = tune_and_evaluate(log_reg, log_reg_param_grid)"
      ],
      "metadata": {
        "id": "VwsHXfXRuN8y"
      },
      "execution_count": 48,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Validation-set accuracy of the tuned logistic regression (from tune_and_evaluate).\n",
        "log_reg_acc"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fo-IvhgbuXms",
        "outputId": "6c5904f6-71b6-48cf-dea2-dcbe1f430610"
      },
      "execution_count": 49,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.9161554192229039"
            ]
          },
          "metadata": {},
          "execution_count": 49
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# NOTE(review): this import would normally live in the notebook's import cell.\n",
        "from sklearn.svm import SVC\n",
        "\n",
        "# Estimator first, then the grid handed to the shared tuning helper.\n",
        "svm_model = SVC()\n",
        "svm_param_grid = {\n",
        "    'C': [0.1, 1, 10],\n",
        "    'kernel': ['linear', 'rbf', 'poly'],\n",
        "}\n",
        "svm_params, svm_acc, best_svm_model = tune_and_evaluate(\n",
        "    svm_model,\n",
        "    svm_param_grid,\n",
        ")"
      ],
      "metadata": {
        "id": "It_znLNAvFVB"
      },
      "execution_count": 52,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Validation-set accuracy of the tuned SVM (from tune_and_evaluate).\n",
        "svm_acc"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "LOEf4Xuz2xPU",
        "outputId": "6f309260-aafe-4dc3-d94e-2aed3639ad23"
      },
      "execution_count": 53,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.8916155419222904"
            ]
          },
          "metadata": {},
          "execution_count": 53
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Estimator first, then the grid: neighbourhood size and vote weighting.\n",
        "knn = KNeighborsClassifier()\n",
        "knn_param_grid = {\n",
        "    'n_neighbors': [1, 5, 10, 15],\n",
        "    'weights': ['uniform', 'distance'],\n",
        "}\n",
        "knn_params, knn_acc, knn_model = tune_and_evaluate(\n",
        "    knn,\n",
        "    knn_param_grid,\n",
        ")"
      ],
      "metadata": {
        "id": "NvYdxC65hqS7"
      },
      "execution_count": 54,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Validation accuracy on the first line, winning hyper-parameters on the second.\n",
        "print(knn_acc, knn_params, sep='\\n')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8QpSfVjuicCT",
        "outputId": "6f137773-ac06-4c88-d03e-13c75d1cf670"
      },
      "execution_count": 55,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0.5398773006134969\n",
            "{'n_neighbors': 15, 'weights': 'uniform'}\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Search space: neighbourhood radius and vote weighting.\n",
        "enn_param_grid = {\n",
        "    'radius': [1.0, 5.0, 10.0, 15.0],\n",
        "    'weights': ['uniform', 'distance']\n",
        "}\n",
        "# outlier_label='most_frequent': with the default (None), predict() raises as soon as\n",
        "# one sample has no neighbour inside the radius, and GridSearchCV then records NaN for\n",
        "# that candidate (this happened for radius=1.0). Such samples now get the most\n",
        "# frequent training label instead, so every candidate receives a finite score.\n",
        "enn = RadiusNeighborsClassifier(outlier_label='most_frequent')\n",
        "enn_params, enn_acc, enn_model = tune_and_evaluate(enn, enn_param_grid)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GGH4si_Dj0PQ",
        "outputId": "1403e9e3-ef55-4bbe-fd96-1d4ddc22d87b"
      },
      "execution_count": 56,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:1103: UserWarning: One or more of the test scores are non-finite: [       nan        nan 0.5079096  0.51016949 0.50508475 0.50508475\n",
            " 0.50508475 0.50508475]\n",
            "  warnings.warn(\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Validation accuracy on the first line, winning hyper-parameters on the second.\n",
        "print(enn_acc, enn_params, sep='\\n')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jGwom8z8j0dL",
        "outputId": "66e497aa-4c72-440f-c54d-c4b599e3fd42"
      },
      "execution_count": 57,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0.5194274028629857\n",
            "{'radius': 5.0, 'weights': 'distance'}\n"
          ]
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"authorship_tag": "ABX9TyPAAz1LnYVeWXT+tSQXLxy6",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/110CodingP/fb791830a025e1f366424e0df217ea51/dataset1_knn_gaussian.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"id": "RQ4kD8nee0Nw"
	},
	"outputs": [],
	"source": [
	"# The Kaggle CLI is used further down to download the dataset.\n",
	"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
	"%pip install -q kaggle"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"from google.colab import files\n",
	"\n",
	"# Upload kaggle.json (the Kaggle API token).\n",
	"# The result is bound to a name on purpose: leaving files.upload() as the last bare\n",
	"# expression makes the notebook echo the uploaded bytes, i.e. the API key, into the\n",
	"# saved cell output.\n",
	"uploaded = files.upload()"
	],
	"metadata": {
	"id": "0BFTikjYf-lY",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 92
	},
	"outputId": "9b4ead35-9a67-4fe2-ecb4-ee670805ceb8"
	},
	"execution_count": 2,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"<IPython.core.display.HTML object>"
	],
	"text/html": [
	"\n",
	" <input type=\"file\" id=\"files-7765420d-275f-4a85-aa3f-ba7227ecc9a2\" name=\"files[]\" multiple disabled\n",
	" style=\"border:none\" />\n",
	" <output id=\"result-7765420d-275f-4a85-aa3f-ba7227ecc9a2\">\n",
	" Upload widget is only available when the cell has been executed in the\n",
	" current browser session. Please rerun this cell to enable.\n",
	" </output>\n",
	" <script>// Copyright 2017 Google LLC\n",
	"//\n",
	"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
	"// you may not use this file except in compliance with the License.\n",
	"// You may obtain a copy of the License at\n",
	"//\n",
	"// http://www.apache.org/licenses/LICENSE-2.0\n",
	"//\n",
	"// Unless required by applicable law or agreed to in writing, software\n",
	"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
	"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
	"// See the License for the specific language governing permissions and\n",
	"// limitations under the License.\n",
	"\n",
	"/**\n",
	" * @fileoverview Helpers for google.colab Python module.\n",
	" */\n",
	"(function(scope) {\n",
	"function span(text, styleAttributes = {}) {\n",
	" const element = document.createElement('span');\n",
	" element.textContent = text;\n",
	" for (const key of Object.keys(styleAttributes)) {\n",
	" element.style[key] = styleAttributes[key];\n",
	" }\n",
	" return element;\n",
	"}\n",
	"\n",
	"// Max number of bytes which will be uploaded at a time.\n",
	"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
	"\n",
	"function _uploadFiles(inputId, outputId) {\n",
	" const steps = uploadFilesStep(inputId, outputId);\n",
	" const outputElement = document.getElementById(outputId);\n",
	" // Cache steps on the outputElement to make it available for the next call\n",
	" // to uploadFilesContinue from Python.\n",
	" outputElement.steps = steps;\n",
	"\n",
	" return _uploadFilesContinue(outputId);\n",
	"}\n",
	"\n",
	"// This is roughly an async generator (not supported in the browser yet),\n",
	"// where there are multiple asynchronous steps and the Python side is going\n",
	"// to poll for completion of each step.\n",
	"// This uses a Promise to block the python side on completion of each step,\n",
	"// then passes the result of the previous step as the input to the next step.\n",
	"function _uploadFilesContinue(outputId) {\n",
	" const outputElement = document.getElementById(outputId);\n",
	" const steps = outputElement.steps;\n",
	"\n",
	" const next = steps.next(outputElement.lastPromiseValue);\n",
	" return Promise.resolve(next.value.promise).then((value) => {\n",
	" // Cache the last promise value to make it available to the next\n",
	" // step of the generator.\n",
	" outputElement.lastPromiseValue = value;\n",
	" return next.value.response;\n",
	" });\n",
	"}\n",
	"\n",
	"/**\n",
	" * Generator function which is called between each async step of the upload\n",
	" * process.\n",
	" * @param {string} inputId Element ID of the input file picker element.\n",
	" * @param {string} outputId Element ID of the output display.\n",
	" * @return {!Iterable<!Object>} Iterable of next steps.\n",
	" */\n",
	"function* uploadFilesStep(inputId, outputId) {\n",
	" const inputElement = document.getElementById(inputId);\n",
	" inputElement.disabled = false;\n",
	"\n",
	" const outputElement = document.getElementById(outputId);\n",
	" outputElement.innerHTML = '';\n",
	"\n",
	" const pickedPromise = new Promise((resolve) => {\n",
	" inputElement.addEventListener('change', (e) => {\n",
	" resolve(e.target.files);\n",
	" });\n",
	" });\n",
	"\n",
	" const cancel = document.createElement('button');\n",
	" inputElement.parentElement.appendChild(cancel);\n",
	" cancel.textContent = 'Cancel upload';\n",
	" const cancelPromise = new Promise((resolve) => {\n",
	" cancel.onclick = () => {\n",
	" resolve(null);\n",
	" };\n",
	" });\n",
	"\n",
	" // Wait for the user to pick the files.\n",
	" const files = yield {\n",
	" promise: Promise.race([pickedPromise, cancelPromise]),\n",
	" response: {\n",
	" action: 'starting',\n",
	" }\n",
	" };\n",
	"\n",
	" cancel.remove();\n",
	"\n",
	" // Disable the input element since further picks are not allowed.\n",
	" inputElement.disabled = true;\n",
	"\n",
	" if (!files) {\n",
	" return {\n",
	" response: {\n",
	" action: 'complete',\n",
	" }\n",
	" };\n",
	" }\n",
	"\n",
	" for (const file of files) {\n",
	" const li = document.createElement('li');\n",
	" li.append(span(file.name, {fontWeight: 'bold'}));\n",
	" li.append(span(\n",
	" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
	" `last modified: ${\n",
	" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
	" 'n/a'} - `));\n",
	" const percent = span('0% done');\n",
	" li.appendChild(percent);\n",
	"\n",
	" outputElement.appendChild(li);\n",
	"\n",
	" const fileDataPromise = new Promise((resolve) => {\n",
	" const reader = new FileReader();\n",
	" reader.onload = (e) => {\n",
	" resolve(e.target.result);\n",
	" };\n",
	" reader.readAsArrayBuffer(file);\n",
	" });\n",
	" // Wait for the data to be ready.\n",
	" let fileData = yield {\n",
	" promise: fileDataPromise,\n",
	" response: {\n",
	" action: 'continue',\n",
	" }\n",
	" };\n",
	"\n",
	" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
	" let position = 0;\n",
	" do {\n",
	" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
	" const chunk = new Uint8Array(fileData, position, length);\n",
	" position += length;\n",
	"\n",
	" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
	" yield {\n",
	" response: {\n",
	" action: 'append',\n",
	" file: file.name,\n",
	" data: base64,\n",
	" },\n",
	" };\n",
	"\n",
	" let percentDone = fileData.byteLength === 0 ?\n",
	" 100 :\n",
	" Math.round((position / fileData.byteLength) * 100);\n",
	" percent.textContent = `${percentDone}% done`;\n",
	"\n",
	" } while (position < fileData.byteLength);\n",
	" }\n",
	"\n",
	" // All done.\n",
	" yield {\n",
	" response: {\n",
	" action: 'complete',\n",
	" }\n",
	" };\n",
	"}\n",
	"\n",
	"scope.google = scope.google \|\| {};\n",
	"scope.google.colab = scope.google.colab \|\| {};\n",
	"scope.google.colab._files = {\n",
	" _uploadFiles,\n",
	" _uploadFilesContinue,\n",
	"};\n",
	"})(self);\n",
	"</script> "
	]
	},
	"metadata": {}
	},
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Saving kaggle.json to kaggle.json\n"
	]
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"{'kaggle.json': b'{\"username\":\"codingp110\",\"key\":\"<REDACTED>\"}'}"
	]
	},
	"metadata": {},
	"execution_count": 2
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# -p keeps this cell re-runnable: a plain mkdir errors once ~/.kaggle exists.\n",
	"! mkdir -p ~/.kaggle\n",
	"\n",
	"# Place the uploaded credentials file inside ~/.kaggle.\n",
	"! cp kaggle.json ~/.kaggle/"
	],
	"metadata": {
	"id": "yMXfjphbgGxV"
	},
	"execution_count": 3,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Restrict the credentials file to owner read/write only (mode 600).\n",
	"! chmod 600 ~/.kaggle/kaggle.json"
	],
	"metadata": {
	"id": "l9PCFxiYgJBB"
	},
	"execution_count": 4,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Fetch the codingp110/emoticons dataset archive with the Kaggle CLI.\n",
	"! kaggle datasets download codingp110/emoticons"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "qzCwlx1wgLn6",
	"outputId": "5e63512a-29b6-469e-e313-494cc18dc7b0"
	},
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Dataset URL: https://www.kaggle.com/datasets/codingp110/emoticons\n",
	"License(s): unknown\n",
	"Downloading emoticons.zip to /content\n",
	"100% 133k/133k [00:00<00:00, 415kB/s]\n",
	"100% 133k/133k [00:00<00:00, 415kB/s]\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# The archive is downloaded into the working directory, so refer to it\n",
	"# relatively instead of via the Colab-specific /content path.\n",
	"# -o overwrites earlier extractions so a re-run does not stop at unzip's prompt.\n",
	"! unzip -o emoticons.zip"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "g0T-GLsLgu5b",
	"outputId": "818d8859-a103-42dd-d7a1-df65e21150fd"
	},
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Archive: /content/emoticons.zip\n",
	" inflating: test_emoticon.csv \n",
	" inflating: train_emoticon.csv \n",
	" inflating: valid_emoticon.csv \n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# %pip installs into the running kernel's environment; the version is pinned\n",
	"# to the release this notebook was last executed with.\n",
	"%pip install -q catboost==1.2.7"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "Uz4ds1crg0Ln",
	"outputId": "03830e62-474a-4879-a252-88bacece6f63"
	},
	"execution_count": 7,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Collecting catboost\n",
	" Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)\n",
	"Requirement already satisfied: graphviz in /usr/local/lib/python3.10/dist-packages (from catboost) (0.20.3)\n",
	"Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from catboost) (3.7.1)\n",
	"Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from catboost) (1.26.4)\n",
	"Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.10/dist-packages (from catboost) (2.2.2)\n",
	"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from catboost) (1.13.1)\n",
	"Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (from catboost) (5.24.1)\n",
	"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from catboost) (1.16.0)\n",
	"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2.8.2)\n",
	"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)\n",
	"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)\n",
	"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.3.0)\n",
	"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (0.12.1)\n",
	"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (4.54.1)\n",
	"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.4.7)\n",
	"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (24.1)\n",
	"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (10.4.0)\n",
	"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (3.1.4)\n",
	"Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly->catboost) (9.0.0)\n",
	"Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 MB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25hInstalling collected packages: catboost\n",
	"Successfully installed catboost-1.2.7\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import matplotlib.pyplot as plt\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.ensemble import RandomForestClassifier\n",
	"from sklearn.model_selection import train_test_split, GridSearchCV\n",
	"from sklearn.preprocessing import (StandardScaler, OneHotEncoder)\n",
	"from sklearn.metrics import accuracy_score\n",
	"import xgboost as xgb\n",
	"from catboost import CatBoostClassifier\n",
	"from lightgbm import LGBMClassifier\n",
	"from sklearn.tree import DecisionTreeClassifier\n",
	"from sklearn.ensemble import (ExtraTreesClassifier,\n",
	" GradientBoostingClassifier, AdaBoostClassifier)\n",
	"from sklearn.neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier)\n",
	"from sklearn.mixture import GaussianMixture\n",
	"from sklearn.decomposition import PCA"
	],
	"metadata": {
	"id": "701sqYtkg7cP"
	},
	"execution_count": 40,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Read the train / validation / test CSVs extracted from the archive.\n",
	"train_emoticon_df, valid_emoticon_df, test_emoticon_df = map(\n",
	"    pd.read_csv,\n",
	"    ('train_emoticon.csv', 'valid_emoticon.csv', 'test_emoticon.csv'),\n",
	")\n",
	"\n",
	"# Targets of the two splits whose 'label' column is used below.\n",
	"train_Y = train_emoticon_df['label']\n",
	"valid_Y = valid_emoticon_df['label']"
	],
	"metadata": {
	"id": "kXzBBu0fg-5K"
	},
	"execution_count": 46,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"def preprocess_emoticons(emoticons):\n",
	"    \"\"\"Expand every item of `emoticons` into a list of its elements.\n",
	"\n",
	"    For string items this yields one single-character entry per position.\n",
	"    Returns a list holding one inner list per input item, in input order.\n",
	"    \"\"\"\n",
	"    return list(map(list, emoticons))"
	],
	"metadata": {
	"id": "xRA-Fu2AhCqZ"
	},
	"execution_count": 36,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# One column per position of the 'input_emoticon' sequence, for both labelled splits.\n",
	"train_emoticon_X, valid_emoticon_X = (\n",
	"    pd.DataFrame(preprocess_emoticons(frame['input_emoticon']))\n",
	"    for frame in (train_emoticon_df, valid_emoticon_df)\n",
	")"
	],
	"metadata": {
	"id": "XQgfEwbulr-r"
	},
	"execution_count": 38,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Fit the encoder on the training split only; categories that appear only in\n",
	"# validation are ignored (encoded as all zeros) rather than raising.\n",
	"enc = OneHotEncoder(handle_unknown='ignore').fit(train_emoticon_X)\n",
	"train_X, valid_X = (\n",
	"    enc.transform(split) for split in (train_emoticon_X, valid_emoticon_X)\n",
	")"
	],
	"metadata": {
	"id": "DrSxtWkqshjP"
	},
	"execution_count": 42,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# (rows, one-hot columns) of the encoded training matrix.\n",
	"train_X.shape"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "CFM-QRr5t67t",
	"outputId": "5e6a9ec7-dc26-49e9-a0e0-a1601cba72b7"
	},
	"execution_count": 44,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"(7080, 2159)"
	]
	},
	"metadata": {},
	"execution_count": 44
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"def tune_and_evaluate(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1):\n",
	"    \"\"\"Grid-search `model` on the training split and score the winner on validation.\n",
	"\n",
	"    Reads the notebook-level `train_X`, `train_Y`, `valid_X` and `valid_Y`.\n",
	"\n",
	"    Args:\n",
	"        model: estimator handed to GridSearchCV.\n",
	"        param_grid: hyper-parameter grid to search.\n",
	"        cv: cross-validation setting forwarded to GridSearchCV (default 5).\n",
	"        scoring: metric GridSearchCV uses to rank candidates (default 'accuracy').\n",
	"        n_jobs: parallel jobs for the search (default -1).\n",
	"\n",
	"    Returns:\n",
	"        Tuple (best_params, valid_acc, best_model); `valid_acc` is always the\n",
	"        accuracy of the refitted best estimator on the validation split,\n",
	"        independent of `scoring`.\n",
	"    \"\"\"\n",
	"    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)\n",
	"    grid_search.fit(train_X, train_Y)\n",
	"    best_model = grid_search.best_estimator_\n",
	"    valid_acc = accuracy_score(valid_Y, best_model.predict(valid_X))\n",
	"    return grid_search.best_params_, valid_acc, best_model"
	],
	"metadata": {
	"id": "6j8xhcXdhMtc"
	},
	"execution_count": 47,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Logistic regression: sweep regularisation strength and solver.\n",
	"log_reg_param_grid = dict(\n",
	"    C=[0.01, 0.1, 1, 10, 100],\n",
	"    solver=['newton-cg', 'lbfgs', 'liblinear', 'saga'],\n",
	")\n",
	"log_reg = LogisticRegression(max_iter=5000)\n",
	"log_reg_params, log_reg_acc, best_log_reg = tune_and_evaluate(\n",
	"    log_reg, log_reg_param_grid\n",
	")"
	],
	"metadata": {
	"id": "VwsHXfXRuN8y"
	},
	"execution_count": 48,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Validation accuracy of the best logistic-regression candidate.\n",
	"log_reg_acc"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "fo-IvhgbuXms",
	"outputId": "6c5904f6-71b6-48cf-dea2-dcbe1f430610"
	},
	"execution_count": 49,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"0.9161554192229039"
	]
	},
	"metadata": {},
	"execution_count": 49
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"from sklearn.svm import SVC\n",
	"\n",
	"# Support-vector classifier: sweep the penalty C across three kernels.\n",
	"svm_param_grid = dict(\n",
	"    C=[0.1, 1, 10],\n",
	"    kernel=['linear', 'rbf', 'poly'],\n",
	")\n",
	"svm_model = SVC()\n",
	"svm_params, svm_acc, best_svm_model = tune_and_evaluate(\n",
	"    svm_model, svm_param_grid\n",
	")"
	],
	"metadata": {
	"id": "It_znLNAvFVB"
	},
	"execution_count": 52,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Validation accuracy of the best SVM candidate.\n",
	"svm_acc"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "LOEf4Xuz2xPU",
	"outputId": "6f309260-aafe-4dc3-d94e-2aed3639ad23"
	},
	"execution_count": 53,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"0.8916155419222904"
	]
	},
	"metadata": {},
	"execution_count": 53
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# k-nearest neighbours: sweep neighbourhood size and vote weighting.\n",
	"knn_param_grid = dict(\n",
	"    n_neighbors=[1, 5, 10, 15],\n",
	"    weights=['uniform', 'distance'],\n",
	")\n",
	"knn = KNeighborsClassifier()\n",
	"knn_params, knn_acc, knn_model = tune_and_evaluate(knn, knn_param_grid)"
	],
	"metadata": {
	"id": "NvYdxC65hqS7"
	},
	"execution_count": 54,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Validation accuracy first, then the selected hyper-parameters.\n",
	"for knn_result in (knn_acc, knn_params):\n",
	"    print(knn_result)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "8QpSfVjuicCT",
	"outputId": "6f137773-ac06-4c88-d03e-13c75d1cf670"
	},
	"execution_count": 55,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"0.5398773006134969\n",
	"{'n_neighbors': 15, 'weights': 'uniform'}\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Radius-based neighbours: sweep the radius and vote weighting.\n",
	"enn_param_grid = {\n",
	"    'radius': [1.0, 5.0, 10.0, 15.0],\n",
	"    'weights': ['uniform', 'distance']\n",
	"}\n",
	"# Without an outlier label, predicting a sample that has no training point\n",
	"# inside `radius` raises, and GridSearchCV records that candidate's score as\n",
	"# NaN (most likely what happened to the radius=1.0 candidates). Falling back to\n",
	"# the most frequent class keeps every candidate comparable.\n",
	"enn = RadiusNeighborsClassifier(outlier_label='most_frequent')\n",
	"enn_params, enn_acc, enn_model = tune_and_evaluate(enn, enn_param_grid)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "GGH4si_Dj0PQ",
	"outputId": "1403e9e3-ef55-4bbe-fd96-1d4ddc22d87b"
	},
	"execution_count": 56,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stderr",
	"text": [
	"/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan 0.5079096 0.51016949 0.50508475 0.50508475\n",
	" 0.50508475 0.50508475]\n",
	" warnings.warn(\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# Validation accuracy first, then the selected hyper-parameters.\n",
	"for enn_result in (enn_acc, enn_params):\n",
	"    print(enn_result)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "jGwom8z8j0dL",
	"outputId": "66e497aa-4c72-440f-c54d-c4b599e3fd42"
	},
	"execution_count": 57,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"0.5194274028629857\n",
	"{'radius': 5.0, 'weights': 'distance'}\n"
	]
	}
	]
	}
	]
	}