Skip to content

Instantly share code, notes, and snippets.

@110CodingP
Created October 16, 2024 15:12
Show Gist options
  • Save 110CodingP/fb791830a025e1f366424e0df217ea51 to your computer and use it in GitHub Desktop.
Save 110CodingP/fb791830a025e1f366424e0df217ea51 to your computer and use it in GitHub Desktop.
dataset1_knn_gaussian.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyPAAz1LnYVeWXT+tSQXLxy6",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/110CodingP/fb791830a025e1f366424e0df217ea51/dataset1_knn_gaussian.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "RQ4kD8nee0Nw"
},
"outputs": [],
"source": [
"! pip install -q kaggle"
]
},
{
"cell_type": "code",
"source": [
"from google.colab import files\n",
"\n",
"files.upload()"
],
"metadata": {
"id": "0BFTikjYf-lY",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 92
},
"outputId": "9b4ead35-9a67-4fe2-ecb4-ee670805ceb8"
},
"execution_count": 2,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"\n",
" <input type=\"file\" id=\"files-7765420d-275f-4a85-aa3f-ba7227ecc9a2\" name=\"files[]\" multiple disabled\n",
" style=\"border:none\" />\n",
" <output id=\"result-7765420d-275f-4a85-aa3f-ba7227ecc9a2\">\n",
" Upload widget is only available when the cell has been executed in the\n",
" current browser session. Please rerun this cell to enable.\n",
" </output>\n",
" <script>// Copyright 2017 Google LLC\n",
"//\n",
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"// you may not use this file except in compliance with the License.\n",
"// You may obtain a copy of the License at\n",
"//\n",
"// http://www.apache.org/licenses/LICENSE-2.0\n",
"//\n",
"// Unless required by applicable law or agreed to in writing, software\n",
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"// See the License for the specific language governing permissions and\n",
"// limitations under the License.\n",
"\n",
"/**\n",
" * @fileoverview Helpers for google.colab Python module.\n",
" */\n",
"(function(scope) {\n",
"function span(text, styleAttributes = {}) {\n",
" const element = document.createElement('span');\n",
" element.textContent = text;\n",
" for (const key of Object.keys(styleAttributes)) {\n",
" element.style[key] = styleAttributes[key];\n",
" }\n",
" return element;\n",
"}\n",
"\n",
"// Max number of bytes which will be uploaded at a time.\n",
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
"\n",
"function _uploadFiles(inputId, outputId) {\n",
" const steps = uploadFilesStep(inputId, outputId);\n",
" const outputElement = document.getElementById(outputId);\n",
" // Cache steps on the outputElement to make it available for the next call\n",
" // to uploadFilesContinue from Python.\n",
" outputElement.steps = steps;\n",
"\n",
" return _uploadFilesContinue(outputId);\n",
"}\n",
"\n",
"// This is roughly an async generator (not supported in the browser yet),\n",
"// where there are multiple asynchronous steps and the Python side is going\n",
"// to poll for completion of each step.\n",
"// This uses a Promise to block the python side on completion of each step,\n",
"// then passes the result of the previous step as the input to the next step.\n",
"function _uploadFilesContinue(outputId) {\n",
" const outputElement = document.getElementById(outputId);\n",
" const steps = outputElement.steps;\n",
"\n",
" const next = steps.next(outputElement.lastPromiseValue);\n",
" return Promise.resolve(next.value.promise).then((value) => {\n",
" // Cache the last promise value to make it available to the next\n",
" // step of the generator.\n",
" outputElement.lastPromiseValue = value;\n",
" return next.value.response;\n",
" });\n",
"}\n",
"\n",
"/**\n",
" * Generator function which is called between each async step of the upload\n",
" * process.\n",
" * @param {string} inputId Element ID of the input file picker element.\n",
" * @param {string} outputId Element ID of the output display.\n",
" * @return {!Iterable<!Object>} Iterable of next steps.\n",
" */\n",
"function* uploadFilesStep(inputId, outputId) {\n",
" const inputElement = document.getElementById(inputId);\n",
" inputElement.disabled = false;\n",
"\n",
" const outputElement = document.getElementById(outputId);\n",
" outputElement.innerHTML = '';\n",
"\n",
" const pickedPromise = new Promise((resolve) => {\n",
" inputElement.addEventListener('change', (e) => {\n",
" resolve(e.target.files);\n",
" });\n",
" });\n",
"\n",
" const cancel = document.createElement('button');\n",
" inputElement.parentElement.appendChild(cancel);\n",
" cancel.textContent = 'Cancel upload';\n",
" const cancelPromise = new Promise((resolve) => {\n",
" cancel.onclick = () => {\n",
" resolve(null);\n",
" };\n",
" });\n",
"\n",
" // Wait for the user to pick the files.\n",
" const files = yield {\n",
" promise: Promise.race([pickedPromise, cancelPromise]),\n",
" response: {\n",
" action: 'starting',\n",
" }\n",
" };\n",
"\n",
" cancel.remove();\n",
"\n",
" // Disable the input element since further picks are not allowed.\n",
" inputElement.disabled = true;\n",
"\n",
" if (!files) {\n",
" return {\n",
" response: {\n",
" action: 'complete',\n",
" }\n",
" };\n",
" }\n",
"\n",
" for (const file of files) {\n",
" const li = document.createElement('li');\n",
" li.append(span(file.name, {fontWeight: 'bold'}));\n",
" li.append(span(\n",
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
" `last modified: ${\n",
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
" 'n/a'} - `));\n",
" const percent = span('0% done');\n",
" li.appendChild(percent);\n",
"\n",
" outputElement.appendChild(li);\n",
"\n",
" const fileDataPromise = new Promise((resolve) => {\n",
" const reader = new FileReader();\n",
" reader.onload = (e) => {\n",
" resolve(e.target.result);\n",
" };\n",
" reader.readAsArrayBuffer(file);\n",
" });\n",
" // Wait for the data to be ready.\n",
" let fileData = yield {\n",
" promise: fileDataPromise,\n",
" response: {\n",
" action: 'continue',\n",
" }\n",
" };\n",
"\n",
" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
" let position = 0;\n",
" do {\n",
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
" const chunk = new Uint8Array(fileData, position, length);\n",
" position += length;\n",
"\n",
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
" yield {\n",
" response: {\n",
" action: 'append',\n",
" file: file.name,\n",
" data: base64,\n",
" },\n",
" };\n",
"\n",
" let percentDone = fileData.byteLength === 0 ?\n",
" 100 :\n",
" Math.round((position / fileData.byteLength) * 100);\n",
" percent.textContent = `${percentDone}% done`;\n",
"\n",
" } while (position < fileData.byteLength);\n",
" }\n",
"\n",
" // All done.\n",
" yield {\n",
" response: {\n",
" action: 'complete',\n",
" }\n",
" };\n",
"}\n",
"\n",
"scope.google = scope.google || {};\n",
"scope.google.colab = scope.google.colab || {};\n",
"scope.google.colab._files = {\n",
" _uploadFiles,\n",
" _uploadFilesContinue,\n",
"};\n",
"})(self);\n",
"</script> "
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving kaggle.json to kaggle.json\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'kaggle.json': b'{\"username\":\"codingp110\",\"key\":\"<REDACTED>\"}'}"
]
},
"metadata": {},
"execution_count": 2
}
]
},
{
"cell_type": "code",
"source": [
"! mkdir ~/.kaggle\n",
"\n",
"! cp kaggle.json ~/.kaggle/"
],
"metadata": {
"id": "yMXfjphbgGxV"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"! chmod 600 ~/.kaggle/kaggle.json"
],
"metadata": {
"id": "l9PCFxiYgJBB"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"source": [
"! kaggle datasets download codingp110/emoticons"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qzCwlx1wgLn6",
"outputId": "5e63512a-29b6-469e-e313-494cc18dc7b0"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Dataset URL: https://www.kaggle.com/datasets/codingp110/emoticons\n",
"License(s): unknown\n",
"Downloading emoticons.zip to /content\n",
"100% 133k/133k [00:00<00:00, 415kB/s]\n",
"100% 133k/133k [00:00<00:00, 415kB/s]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! unzip /content/emoticons.zip"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "g0T-GLsLgu5b",
"outputId": "818d8859-a103-42dd-d7a1-df65e21150fd"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Archive: /content/emoticons.zip\n",
" inflating: test_emoticon.csv \n",
" inflating: train_emoticon.csv \n",
" inflating: valid_emoticon.csv \n"
]
}
]
},
{
"cell_type": "code",
"source": [
"! pip install catboost"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Uz4ds1crg0Ln",
"outputId": "03830e62-474a-4879-a252-88bacece6f63"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting catboost\n",
" Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)\n",
"Requirement already satisfied: graphviz in /usr/local/lib/python3.10/dist-packages (from catboost) (0.20.3)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from catboost) (3.7.1)\n",
"Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.10/dist-packages (from catboost) (1.26.4)\n",
"Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.10/dist-packages (from catboost) (2.2.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from catboost) (1.13.1)\n",
"Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (from catboost) (5.24.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from catboost) (1.16.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas>=0.24->catboost) (2024.2)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.3.0)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (4.54.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (1.4.7)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (24.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (10.4.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->catboost) (3.1.4)\n",
"Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly->catboost) (9.0.0)\n",
"Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 MB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: catboost\n",
"Successfully installed catboost-1.2.7\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.preprocessing import (StandardScaler, OneHotEncoder)\n",
"from sklearn.metrics import accuracy_score\n",
"import xgboost as xgb\n",
"from catboost import CatBoostClassifier\n",
"from lightgbm import LGBMClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import (ExtraTreesClassifier,\n",
" GradientBoostingClassifier, AdaBoostClassifier)\n",
"from sklearn.neighbors import (KNeighborsClassifier, RadiusNeighborsClassifier)\n",
"from sklearn.mixture import GaussianMixture\n",
"from sklearn.decomposition import PCA"
],
"metadata": {
"id": "701sqYtkg7cP"
},
"execution_count": 40,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_emoticon_df = pd.read_csv('train_emoticon.csv')\n",
"train_Y = train_emoticon_df['label']\n",
"valid_emoticon_df = pd.read_csv('valid_emoticon.csv')\n",
"valid_Y = valid_emoticon_df['label']\n",
"test_emoticon_df = pd.read_csv('test_emoticon.csv')"
],
"metadata": {
"id": "kXzBBu0fg-5K"
},
"execution_count": 46,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def preprocess_emoticons(emoticons):\n",
" return [[c for c in emoticon] for emoticon in emoticons]"
],
"metadata": {
"id": "xRA-Fu2AhCqZ"
},
"execution_count": 36,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_emoticon_X = pd.DataFrame(preprocess_emoticons(train_emoticon_df['input_emoticon']))\n",
"valid_emoticon_X = pd.DataFrame(preprocess_emoticons(valid_emoticon_df['input_emoticon']))"
],
"metadata": {
"id": "XQgfEwbulr-r"
},
"execution_count": 38,
"outputs": []
},
{
"cell_type": "code",
"source": [
"enc = OneHotEncoder(handle_unknown='ignore')\n",
"enc.fit(train_emoticon_X)\n",
"train_X = enc.transform(train_emoticon_X)\n",
"valid_X = enc.transform(valid_emoticon_X)"
],
"metadata": {
"id": "DrSxtWkqshjP"
},
"execution_count": 42,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_X.shape"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CFM-QRr5t67t",
"outputId": "5e6a9ec7-dc26-49e9-a0e0-a1601cba72b7"
},
"execution_count": 44,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(7080, 2159)"
]
},
"metadata": {},
"execution_count": 44
}
]
},
{
"cell_type": "code",
"source": [
"def tune_and_evaluate(model, param_grid):\n",
" grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)\n",
" grid_search.fit(train_X, train_Y)\n",
" best_model = grid_search.best_estimator_\n",
" valid_pred = best_model.predict(valid_X)\n",
" valid_acc = accuracy_score(valid_Y, valid_pred)\n",
" return grid_search.best_params_, valid_acc, best_model"
],
"metadata": {
"id": "6j8xhcXdhMtc"
},
"execution_count": 47,
"outputs": []
},
{
"cell_type": "code",
"source": [
"log_reg_param_grid = {\n",
" 'C': [0.01, 0.1, 1, 10, 100],\n",
" 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga']\n",
"}\n",
"log_reg = LogisticRegression(max_iter=5000)\n",
"log_reg_params, log_reg_acc, best_log_reg = tune_and_evaluate(log_reg, log_reg_param_grid)"
],
"metadata": {
"id": "VwsHXfXRuN8y"
},
"execution_count": 48,
"outputs": []
},
{
"cell_type": "code",
"source": [
"log_reg_acc"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fo-IvhgbuXms",
"outputId": "6c5904f6-71b6-48cf-dea2-dcbe1f430610"
},
"execution_count": 49,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.9161554192229039"
]
},
"metadata": {},
"execution_count": 49
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.svm import SVC\n",
"svm_param_grid = {\n",
" 'C': [0.1, 1, 10],\n",
" 'kernel': ['linear', 'rbf', 'poly'] }\n",
"svm_model = SVC()\n",
"svm_params, svm_acc, best_svm_model = tune_and_evaluate(svm_model, svm_param_grid)"
],
"metadata": {
"id": "It_znLNAvFVB"
},
"execution_count": 52,
"outputs": []
},
{
"cell_type": "code",
"source": [
"svm_acc"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LOEf4Xuz2xPU",
"outputId": "6f309260-aafe-4dc3-d94e-2aed3639ad23"
},
"execution_count": 53,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.8916155419222904"
]
},
"metadata": {},
"execution_count": 53
}
]
},
{
"cell_type": "code",
"source": [
"knn_param_grid = {\n",
" 'n_neighbors': [1,5,10,15],\n",
" 'weights' : ['uniform','distance']\n",
"}\n",
"knn = KNeighborsClassifier()\n",
"knn_params, knn_acc, knn_model = tune_and_evaluate(knn, knn_param_grid)"
],
"metadata": {
"id": "NvYdxC65hqS7"
},
"execution_count": 54,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(knn_acc)\n",
"print(knn_params)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8QpSfVjuicCT",
"outputId": "6f137773-ac06-4c88-d03e-13c75d1cf670"
},
"execution_count": 55,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0.5398773006134969\n",
"{'n_neighbors': 15, 'weights': 'uniform'}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"enn_param_grid = {\n",
" 'radius' : [1.0, 5.0 , 10.0 ,15.0 ],\n",
" 'weights' : ['uniform', 'distance']\n",
"}\n",
"enn = RadiusNeighborsClassifier()\n",
"enn_params, enn_acc, enn_model = tune_and_evaluate(enn, enn_param_grid)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GGH4si_Dj0PQ",
"outputId": "1403e9e3-ef55-4bbe-fd96-1d4ddc22d87b"
},
"execution_count": 56,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:1103: UserWarning: One or more of the test scores are non-finite: [ nan nan 0.5079096 0.51016949 0.50508475 0.50508475\n",
" 0.50508475 0.50508475]\n",
" warnings.warn(\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(enn_acc)\n",
"print(enn_params)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jGwom8z8j0dL",
"outputId": "66e497aa-4c72-440f-c54d-c4b599e3fd42"
},
"execution_count": 57,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0.5194274028629857\n",
"{'radius': 5.0, 'weights': 'distance'}\n"
]
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment