gangtao · March 5, 2019 23:24
diff --git a/One Hot Encoding.ipynb b/One Hot Encoding.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Manual One Hot Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hello world\n",
      "[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]\n",
      "[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
      "h\n",
      "e\n",
      "l\n",
      "l\n",
      "o\n",
      " \n",
      "w\n",
      "o\n",
      "r\n",
      "l\n",
      "d\n"
     ]
    }
   ],
   "source": [
    "from numpy import argmax\n",
    "# define input string\n",
    "data = 'hello world'\n",
    "print(data)\n",
    "# define universe of possible input values\n",
    "alphabet = 'abcdefghijklmnopqrstuvwxyz '\n",
    "# define a mapping of chars to integers\n",
    "char_to_int = dict((c, i) for i, c in enumerate(alphabet))\n",
    "int_to_char = dict((i, c) for i, c in enumerate(alphabet))\n",
    "# integer encode input data\n",
    "integer_encoded = [char_to_int[char] for char in data]\n",
    "print(integer_encoded)\n",
    "# one hot encode\n",
    "onehot_encoded = list()\n",
    "for value in integer_encoded:\n",
    "\tletter = [0 for _ in range(len(alphabet))]\n",
    "\tletter[value] = 1\n",
    "\tonehot_encoded.append(letter)\n",
    "print(onehot_encoded)\n",
    "# invert encoding\n",
    "for i in range(len(data)):\n",
    "    inverted = int_to_char[argmax(onehot_encoded[i])]\n",
    "    print(inverted)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One Hot Encode with scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']\n",
      "[0 0 2 0 1 1 2 0 2 1]\n",
      "[[1. 0. 0.]\n",
      " [1. 0. 0.]\n",
      " [0. 0. 1.]\n",
      " [1. 0. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 0. 1.]\n",
      " [1. 0. 0.]\n",
      " [0. 0. 1.]\n",
      " [0. 1. 0.]]\n",
      "['cold']\n",
      "['cold']\n",
      "['warm']\n",
      "['cold']\n",
      "['hot']\n",
      "['hot']\n",
      "['warm']\n",
      "['cold']\n",
      "['warm']\n",
      "['hot']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n",
      "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    }
   ],
   "source": [
    "from numpy import array\n",
    "from numpy import argmax\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "# define example\n",
    "data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']\n",
    "values = array(data)\n",
    "print(values)\n",
    "# integer encode\n",
    "label_encoder = LabelEncoder()\n",
    "integer_encoded = label_encoder.fit_transform(values)\n",
    "print(integer_encoded)\n",
    "# binary encode\n",
    "onehot_encoder = OneHotEncoder(sparse=False)\n",
    "integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)\n",
    "onehot_encoded = onehot_encoder.fit_transform(integer_encoded)\n",
    "print(onehot_encoded)\n",
    "# invert first example\n",
    "for i in range(len(data)):\n",
    "    inverted = label_encoder.inverse_transform([argmax(onehot_encoded[i, :])])\n",
    "    print(inverted)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One Hot Encode with Keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 3 2 0 3 2 2 1 0 1]\n",
      "[[0. 1. 0. 0.]\n",
      " [0. 0. 0. 1.]\n",
      " [0. 0. 1. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [0. 0. 0. 1.]\n",
      " [0. 0. 1. 0.]\n",
      " [0. 0. 1. 0.]\n",
      " [0. 1. 0. 0.]\n",
      " [1. 0. 0. 0.]\n",
      " [0. 1. 0. 0.]]\n",
      "1\n"
     ]
    }
   ],
   "source": [
    "from numpy import array\n",
    "from numpy import argmax\n",
    "from keras.utils import to_categorical\n",
    "# define example\n",
    "data = [1, 3, 2, 0, 3, 2, 2, 1, 0, 1]\n",
    "data = array(data)\n",
    "print(data)\n",
    "# one hot encode\n",
    "encoded = to_categorical(data)\n",
    "print(encoded)\n",
    "# invert encoding\n",
    "inverted = argmax(encoded[0])\n",
    "print(inverted)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Manual One Hot Encoding"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"hello world\n",
	"[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]\n",
	"[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]\n",
	"h\n",
	"e\n",
	"l\n",
	"l\n",
	"o\n",
	" \n",
	"w\n",
	"o\n",
	"r\n",
	"l\n",
	"d\n"
	]
	}
	],
	"source": [
	"from numpy import argmax\n",
	"# define input string\n",
	"data = 'hello world'\n",
	"print(data)\n",
	"# define universe of possible input values\n",
	"alphabet = 'abcdefghijklmnopqrstuvwxyz '\n",
	"# define a mapping of chars to integers\n",
	"char_to_int = dict((c, i) for i, c in enumerate(alphabet))\n",
	"int_to_char = dict((i, c) for i, c in enumerate(alphabet))\n",
	"# integer encode input data\n",
	"integer_encoded = [char_to_int[char] for char in data]\n",
	"print(integer_encoded)\n",
	"# one hot encode\n",
	"onehot_encoded = list()\n",
	"for value in integer_encoded:\n",
	"\tletter = [0 for _ in range(len(alphabet))]\n",
	"\tletter[value] = 1\n",
	"\tonehot_encoded.append(letter)\n",
	"print(onehot_encoded)\n",
	"# invert encoding\n",
	"for i in range(len(data)):\n",
	" inverted = int_to_char[argmax(onehot_encoded[i])]\n",
	" print(inverted)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"One Hot Encode with scikit-learn"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['cold' 'cold' 'warm' 'cold' 'hot' 'hot' 'warm' 'cold' 'warm' 'hot']\n",
	"[0 0 2 0 1 1 2 0 2 1]\n",
	"[[1. 0. 0.]\n",
	" [1. 0. 0.]\n",
	" [0. 0. 1.]\n",
	" [1. 0. 0.]\n",
	" [0. 1. 0.]\n",
	" [0. 1. 0.]\n",
	" [0. 0. 1.]\n",
	" [1. 0. 0.]\n",
	" [0. 0. 1.]\n",
	" [0. 1. 0.]]\n",
	"['cold']\n",
	"['cold']\n",
	"['warm']\n",
	"['cold']\n",
	"['hot']\n",
	"['hot']\n",
	"['warm']\n",
	"['cold']\n",
	"['warm']\n",
	"['hot']\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n",
	"/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
	" if diff:\n"
	]
	}
	],
	"source": [
	"from numpy import array\n",
	"from numpy import argmax\n",
	"from sklearn.preprocessing import LabelEncoder\n",
	"from sklearn.preprocessing import OneHotEncoder\n",
	"# define example\n",
	"data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']\n",
	"values = array(data)\n",
	"print(values)\n",
	"# integer encode\n",
	"label_encoder = LabelEncoder()\n",
	"integer_encoded = label_encoder.fit_transform(values)\n",
	"print(integer_encoded)\n",
	"# binary encode\n",
	"onehot_encoder = OneHotEncoder(sparse=False)\n",
	"integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)\n",
	"onehot_encoded = onehot_encoder.fit_transform(integer_encoded)\n",
	"print(onehot_encoded)\n",
	"# invert first example\n",
	"for i in range(len(data)):\n",
	" inverted = label_encoder.inverse_transform([argmax(onehot_encoded[i, :])])\n",
	" print(inverted)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"One Hot Encode with Keras"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Using TensorFlow backend.\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[1 3 2 0 3 2 2 1 0 1]\n",
	"[[0. 1. 0. 0.]\n",
	" [0. 0. 0. 1.]\n",
	" [0. 0. 1. 0.]\n",
	" [1. 0. 0. 0.]\n",
	" [0. 0. 0. 1.]\n",
	" [0. 0. 1. 0.]\n",
	" [0. 0. 1. 0.]\n",
	" [0. 1. 0. 0.]\n",
	" [1. 0. 0. 0.]\n",
	" [0. 1. 0. 0.]]\n",
	"1\n"
	]
	}
	],
	"source": [
	"from numpy import array\n",
	"from numpy import argmax\n",
	"from keras.utils import to_categorical\n",
	"# define example\n",
	"data = [1, 3, 2, 0, 3, 2, 2, 1, 0, 1]\n",
	"data = array(data)\n",
	"print(data)\n",
	"# one hot encode\n",
	"encoded = to_categorical(data)\n",
	"print(encoded)\n",
	"# invert encoding\n",
	"inverted = argmax(encoded[0])\n",
	"print(inverted)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}