-
-
Save laresbernardo/d38ab3960a5ac7230ff2d9f2a762d43b to your computer and use it in GitHub Desktop.
numpy and pandas example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "import pandas as pd # Import, manipulate, export data (DataFrames)\nimport numpy as np # Mathematical and matrix operations\nimport os # Set working directory", | |
"execution_count": 27, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Working directory\nos.chdir('/Users/bernardo/Dropbox (Personal)/Documentos/Python/Data Mining/')", | |
"execution_count": 28, | |
"outputs": [] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Import CSV or Excel file" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df = pd.read_csv(\"05. Classification Concepts/5052_05_Code/anes_dataset.csv\")\n#df = pd.read_excel(path)\ndf.head(3)", | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 29, | |
"data": { | |
"text/plain": " popul TVnews selfLR ClinLR DoleLR PID age educ income vote\n0 0 7 7 1 6 6 36 3 1 1\n1 190 1 3 3 5 1 20 4 1 0\n2 31 7 2 2 6 1 24 6 1 0", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>popul</th>\n <th>TVnews</th>\n <th>selfLR</th>\n <th>ClinLR</th>\n <th>DoleLR</th>\n <th>PID</th>\n <th>age</th>\n <th>educ</th>\n <th>income</th>\n <th>vote</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>7</td>\n <td>7</td>\n <td>1</td>\n <td>6</td>\n <td>6</td>\n <td>36</td>\n <td>3</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>190</td>\n <td>1</td>\n <td>3</td>\n <td>3</td>\n <td>5</td>\n <td>1</td>\n <td>20</td>\n <td>4</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>31</td>\n <td>7</td>\n <td>2</td>\n <td>2</td>\n <td>6</td>\n <td>1</td>\n <td>24</td>\n <td>6</td>\n <td>1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Matrix and Dictionary manual creation" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Create a manual dictionary\nd1 = {'Nombe': pd.Series(['Tomas','Jaime','Ricardo','Victor','Esteban','Susana','Jorge','Lili','David','Liliana','Beto','JJ']),\n 'Edad': pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),\n 'Rating': pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}", | |
"execution_count": 30, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Create a manual matrix with lists\nnames = ['Tomas','Jaime','Ricardo','Victor','Esteban','Susana','Jorge','Lili','David','Liliana','Beto','JJ']\nage = [25,26,25,23,30,29,23,34,40,30,51,46]\nrating = [4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65]\nd2 = list(zip(names, age, rating))\nprint(d2)", | |
"execution_count": 31, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "[('Tomas', 25, 4.23), ('Jaime', 26, 3.24), ('Ricardo', 25, 3.98), ('Victor', 23, 2.56), ('Esteban', 30, 3.2), ('Susana', 29, 4.6), ('Jorge', 23, 3.8), ('Lili', 34, 3.78), ('David', 40, 2.98), ('Liliana', 30, 4.8), ('Beto', 51, 4.1), ('JJ', 46, 3.65)]\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Create a sequence with an array\nx = np.arange(1,15,1)\na = np.array([[1, 1, 2], [3, 5, 8], [13, 21, 34]])\nprint(\"Arange:\",x)\nprint(\"Array\",a)", | |
"execution_count": 46, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Arange: [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14]\nArray [[ 1 1 2]\n [ 3 5 8]\n [13 21 34]]\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# DataFrames for data manipulation" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Create a panda Data frame with Dictionary or Matrix (they are the same)\ndf = pd.DataFrame(data = d1)\ndf = pd.DataFrame(data = d2, columns = ['Nombre','Edad','Rating'])\ndf", | |
"execution_count": 33, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 33, | |
"data": { | |
"text/plain": " Edad Nombe Rating\n0 25 Tomas 4.23\n1 26 Jaime 3.24\n2 25 Ricardo 3.98\n3 23 Victor 2.56\n4 30 Esteban 3.20\n5 29 Susana 4.60\n6 23 Jorge 3.80\n7 34 Lili 3.78\n8 40 David 2.98\n9 30 Liliana 4.80\n10 51 Beto 4.10\n11 46 JJ 3.65", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Edad</th>\n <th>Nombe</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>25</td>\n <td>Tomas</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>26</td>\n <td>Jaime</td>\n <td>3.24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>25</td>\n <td>Ricardo</td>\n <td>3.98</td>\n </tr>\n <tr>\n <th>3</th>\n <td>23</td>\n <td>Victor</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>4</th>\n <td>30</td>\n <td>Esteban</td>\n <td>3.20</td>\n </tr>\n <tr>\n <th>5</th>\n <td>29</td>\n <td>Susana</td>\n <td>4.60</td>\n </tr>\n <tr>\n <th>6</th>\n <td>23</td>\n <td>Jorge</td>\n <td>3.80</td>\n </tr>\n <tr>\n <th>7</th>\n <td>34</td>\n <td>Lili</td>\n <td>3.78</td>\n </tr>\n <tr>\n <th>8</th>\n <td>40</td>\n <td>David</td>\n <td>2.98</td>\n </tr>\n <tr>\n <th>9</th>\n <td>30</td>\n <td>Liliana</td>\n <td>4.80</td>\n </tr>\n <tr>\n <th>10</th>\n <td>51</td>\n <td>Beto</td>\n <td>4.10</td>\n </tr>\n <tr>\n <th>11</th>\n <td>46</td>\n <td>JJ</td>\n <td>3.65</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Data analysis and manipulation" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "df.info()", | |
"execution_count": 35, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 12 entries, 0 to 11\nData columns (total 3 columns):\nNombre 12 non-null object\nEdad 12 non-null int64\nRating 12 non-null float64\ndtypes: float64(1), int64(1), object(1)\nmemory usage: 368.0+ bytes\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# First rows\ndf.head(5)", | |
"execution_count": 36, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 36, | |
"data": { | |
"text/plain": " Nombre Edad Rating\n0 Tomas 25 4.23\n1 Jaime 26 3.24\n2 Ricardo 25 3.98\n3 Victor 23 2.56\n4 Esteban 30 3.20", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Tomas</td>\n <td>25</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Jaime</td>\n <td>26</td>\n <td>3.24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Ricardo</td>\n <td>25</td>\n <td>3.98</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Victor</td>\n <td>23</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Esteban</td>\n <td>30</td>\n <td>3.20</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# DataFrame dimensions\nprint(\"Dim:\",df.shape)\nprint(\"Rows:\",len(df.index))\nprint(\"Columns:\",len(df.columns))", | |
"execution_count": 37, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": "Dim: (12, 3)\nRows: 12\nColumns: 3\n", | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Statistical description on the data frame with 2 decimals\nround(df.describe(),2)", | |
"execution_count": 38, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 38, | |
"data": { | |
"text/plain": " Edad Rating\ncount 12.00 12.00\nmean 31.83 3.74\nstd 9.23 0.66\nmin 23.00 2.56\n25% 25.00 3.23\n50% 29.50 3.79\n75% 35.50 4.13\nmax 51.00 4.80", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>12.00</td>\n <td>12.00</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>31.83</td>\n <td>3.74</td>\n </tr>\n <tr>\n <th>std</th>\n <td>9.23</td>\n <td>0.66</td>\n </tr>\n <tr>\n <th>min</th>\n <td>23.00</td>\n <td>2.56</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>25.00</td>\n <td>3.23</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>29.50</td>\n <td>3.79</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>35.50</td>\n <td>4.13</td>\n </tr>\n <tr>\n <th>max</th>\n <td>51.00</td>\n <td>4.80</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Mean on all numerical features\ndf.mean()", | |
"execution_count": 39, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 39, | |
"data": { | |
"text/plain": "Edad 31.833333\nRating 3.743333\ndtype: float64" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Max value on a specific column\ndf[\"Edad\"].max()", | |
"execution_count": 40, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 40, | |
"data": { | |
"text/plain": "51" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Arrange columns, ascending, by a specific column and only show top 5\ndf.sort_values(by=\"Edad\", ascending = False).head(5)", | |
"execution_count": 41, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 41, | |
"data": { | |
"text/plain": " Nombre Edad Rating\n10 Beto 51 4.10\n11 JJ 46 3.65\n8 David 40 2.98\n7 Lili 34 3.78\n4 Esteban 30 3.20", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>10</th>\n <td>Beto</td>\n <td>51</td>\n <td>4.10</td>\n </tr>\n <tr>\n <th>11</th>\n <td>JJ</td>\n <td>46</td>\n <td>3.65</td>\n </tr>\n <tr>\n <th>8</th>\n <td>David</td>\n <td>40</td>\n <td>2.98</td>\n </tr>\n <tr>\n <th>7</th>\n <td>Lili</td>\n <td>34</td>\n <td>3.78</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Esteban</td>\n <td>30</td>\n <td>3.20</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Filter rows given a condition\ndf[df.Rating >= 4]", | |
"execution_count": 42, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 42, | |
"data": { | |
"text/plain": " Nombre Edad Rating\n0 Tomas 25 4.23\n5 Susana 29 4.60\n9 Liliana 30 4.80\n10 Beto 51 4.10", | |
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Nombre</th>\n <th>Edad</th>\n <th>Rating</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Tomas</td>\n <td>25</td>\n <td>4.23</td>\n </tr>\n <tr>\n <th>5</th>\n <td>Susana</td>\n <td>29</td>\n <td>4.60</td>\n </tr>\n <tr>\n <th>9</th>\n <td>Liliana</td>\n <td>30</td>\n <td>4.80</td>\n </tr>\n <tr>\n <th>10</th>\n <td>Beto</td>\n <td>51</td>\n <td>4.10</td>\n </tr>\n </tbody>\n</table>\n</div>" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Group, count, and arrange given a column's name\ndf[['Edad']].groupby('Edad')['Edad'].count().sort_values(ascending=False)", | |
"execution_count": 43, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 43, | |
"data": { | |
"text/plain": "Edad\n30 2\n25 2\n23 2\n51 1\n46 1\n40 1\n34 1\n29 1\n26 1\nName: Edad, dtype: int64" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Show all column's names\nlist(df.columns.values)", | |
"execution_count": 44, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"execution_count": 44, | |
"data": { | |
"text/plain": "['Nombre', 'Edad', 'Rating']" | |
}, | |
"metadata": {} | |
} | |
] | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "markdown", | |
"source": "# Export to CSV files" | |
}, | |
{ | |
"metadata": { | |
"trusted": true | |
}, | |
"cell_type": "code", | |
"source": "# Export to a CSV file on a specific path\ndf.to_csv('02. Python and Packages/example.csv')", | |
"execution_count": 45, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": false | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3", | |
"language": "python" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.3", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
}, | |
"gist": { | |
"id": "b737bf065d2267636c25e888f8beca4b", | |
"data": { | |
"description": "numpy and pandas example", | |
"public": true | |
} | |
}, | |
"_draft": { | |
"nbviewer_url": "https://gist.github.com/b737bf065d2267636c25e888f8beca4b" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment