Created
October 31, 2018 16:42
-
-
Save aaronaddleman/5ef9277582657295a469f20536614286 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"metadata":{"language_info":{"name":"python","version":"3.6.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"}},"nbformat_minor":2,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Read in Data Files\n\n* How to read in csv files\n* Files with no headers\n* Files with delimiters other than commas","metadata":{}},{"cell_type":"code","source":"import pandas as pd","metadata":{"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"pd.read_csv('small_data.csv')","metadata":{"trusted":true},"execution_count":2,"outputs":[{"execution_count":2,"output_type":"execute_result","data":{"text/plain":" name gender country age\n0 Sophia F Bulgaria 23\n1 John M USA 24\n2 Jennifer F USA 46\n3 Edgar M England 13\n4 Patrick M Netherlands 72","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>gender</th>\n <th>country</th>\n <th>age</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Sophia</td>\n <td>F</td>\n <td>Bulgaria</td>\n <td>23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>John</td>\n <td>M</td>\n <td>USA</td>\n <td>24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Jennifer</td>\n <td>F</td>\n <td>USA</td>\n <td>46</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Edgar</td>\n <td>M</td>\n <td>England</td>\n <td>13</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Patrick</td>\n <td>M</td>\n <td>Netherlands</td>\n <td>72</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# reading in data that has no headers... look... the headers are incorrect\npd.read_csv('adult.data', nrows=5)","metadata":{"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":" 39 State-gov 77516 Bachelors 13 Never-married \\\n0 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n1 38 Private 215646 HS-grad 9 Divorced \n2 53 Private 234721 11th 7 Married-civ-spouse \n3 28 Private 338409 Bachelors 13 Married-civ-spouse \n4 37 Private 284582 Masters 14 Married-civ-spouse \n\n Adm-clerical Not-in-family White Male 2174 0 40 \\\n0 Exec-managerial Husband White Male 0 0 13 \n1 Handlers-cleaners Not-in-family White Male 0 0 40 \n2 Handlers-cleaners Husband Black Male 0 0 40 \n3 Prof-specialty Wife Black Female 0 0 40 \n4 Exec-managerial Wife White Female 0 0 40 \n\n United-States <=50K \n0 United-States <=50K \n1 United-States <=50K \n2 United-States <=50K \n3 Cuba <=50K \n4 United-States <=50K ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>39</th>\n <th>State-gov</th>\n <th>77516</th>\n <th>Bachelors</th>\n <th>13</th>\n <th>Never-married</th>\n <th>Adm-clerical</th>\n <th>Not-in-family</th>\n <th>White</th>\n <th>Male</th>\n <th>2174</th>\n <th>0</th>\n <th>40</th>\n <th>United-States</th>\n <th><=50K</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>50</td>\n <td>Self-emp-not-inc</td>\n <td>83311</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Husband</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>13</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>1</th>\n <td>38</td>\n <td>Private</td>\n <td>215646</td>\n <td>HS-grad</td>\n <td>9</td>\n <td>Divorced</td>\n <td>Handlers-cleaners</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>2</th>\n <td>53</td>\n <td>Private</td>\n <td>234721</td>\n <td>11th</td>\n <td>7</td>\n <td>Married-civ-spouse</td>\n <td>Handlers-cleaners</td>\n <td>Husband</td>\n <td>Black</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>3</th>\n <td>28</td>\n <td>Private</td>\n <td>338409</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>Cuba</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>4</th>\n <td>37</td>\n <td>Private</td>\n <td>284582</td>\n <td>Masters</td>\n <td>14</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Wife</td>\n <td>White</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# specify no headers exist\npd.read_csv('adult.data', nrows=5, header=None)","metadata":{"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" 0 1 2 3 4 5 \\\n0 39 State-gov 77516 Bachelors 13 Never-married \n1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n2 38 Private 215646 HS-grad 9 Divorced \n3 53 Private 234721 11th 7 Married-civ-spouse \n4 28 Private 338409 Bachelors 13 Married-civ-spouse \n\n 6 7 8 9 10 11 12 \\\n0 Adm-clerical Not-in-family White Male 2174 0 40 \n1 Exec-managerial Husband White Male 0 0 13 \n2 Handlers-cleaners Not-in-family White Male 0 0 40 \n3 Handlers-cleaners Husband Black Male 0 0 40 \n4 Prof-specialty Wife Black Female 0 0 40 \n\n 13 14 \n0 United-States <=50K \n1 United-States <=50K \n2 United-States <=50K \n3 United-States <=50K \n4 Cuba <=50K ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n <th>9</th>\n <th>10</th>\n <th>11</th>\n <th>12</th>\n <th>13</th>\n <th>14</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>39</td>\n <td>State-gov</td>\n <td>77516</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Never-married</td>\n <td>Adm-clerical</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>2174</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>1</th>\n <td>50</td>\n <td>Self-emp-not-inc</td>\n <td>83311</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Husband</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>13</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>2</th>\n <td>38</td>\n <td>Private</td>\n <td>215646</td>\n <td>HS-grad</td>\n <td>9</td>\n <td>Divorced</td>\n <td>Handlers-cleaners</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>3</th>\n <td>53</td>\n <td>Private</td>\n <td>234721</td>\n <td>11th</td>\n <td>7</td>\n <td>Married-civ-spouse</td>\n <td>Handlers-cleaners</td>\n <td>Husband</td>\n <td>Black</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>4</th>\n <td>28</td>\n <td>Private</td>\n <td>338409</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>Cuba</td>\n <td><=50K</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"col_names = [\n \"age\",\n \"type_employer\",\n \"fnlwgt\",\n \"education\",\n \"education_num\",\n \"marital\",\n \"occupation\",\n \"relationship\",\n \"race\",\n \"sex\",\n \"capital_gain\",\n \"capital_loss\",\n \"hr_per_week\",\n \"country\",\n \"income\"\n]","metadata":{"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"pd.read_csv('adult.data', nrows=5, header=None, names=col_names)","metadata":{"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":" age type_employer fnlwgt education education_num \\\n0 39 State-gov 77516 Bachelors 13 \n1 50 Self-emp-not-inc 83311 Bachelors 13 \n2 38 Private 215646 HS-grad 9 \n3 53 Private 234721 11th 7 \n4 28 Private 338409 Bachelors 13 \n\n marital occupation relationship race sex \\\n0 Never-married Adm-clerical Not-in-family White Male \n1 Married-civ-spouse Exec-managerial Husband White Male \n2 Divorced Handlers-cleaners Not-in-family White Male \n3 Married-civ-spouse Handlers-cleaners Husband Black Male \n4 Married-civ-spouse Prof-specialty Wife Black Female \n\n capital_gain capital_loss hr_per_week country income \n0 2174 0 40 United-States <=50K \n1 0 0 13 United-States <=50K \n2 0 0 40 United-States <=50K \n3 0 0 40 United-States <=50K \n4 0 0 40 Cuba <=50K ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>age</th>\n <th>type_employer</th>\n <th>fnlwgt</th>\n <th>education</th>\n <th>education_num</th>\n <th>marital</th>\n <th>occupation</th>\n <th>relationship</th>\n <th>race</th>\n <th>sex</th>\n <th>capital_gain</th>\n <th>capital_loss</th>\n <th>hr_per_week</th>\n <th>country</th>\n <th>income</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>39</td>\n <td>State-gov</td>\n <td>77516</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Never-married</td>\n <td>Adm-clerical</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>2174</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>1</th>\n <td>50</td>\n <td>Self-emp-not-inc</td>\n <td>83311</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Husband</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>13</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>2</th>\n <td>38</td>\n <td>Private</td>\n <td>215646</td>\n <td>HS-grad</td>\n <td>9</td>\n <td>Divorced</td>\n <td>Handlers-cleaners</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>3</th>\n <td>53</td>\n <td>Private</td>\n <td>234721</td>\n <td>11th</td>\n <td>7</td>\n <td>Married-civ-spouse</td>\n <td>Handlers-cleaners</td>\n <td>Husband</td>\n <td>Black</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td><=50K</td>\n </tr>\n <tr>\n <th>4</th>\n <td>28</td>\n <td>Private</td>\n <td>338409</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>Cuba</td>\n <td><=50K</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# issues: no header, delimeter is not a comma\npd.read_csv('auto-mpg.data', nrows=5, header=None)","metadata":{"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":" 0\n0 18.0 8 307.0 130.0 3504. 12...\n1 15.0 8 350.0 165.0 3693. 11...\n2 18.0 8 318.0 150.0 3436. 11...\n3 16.0 8 304.0 150.0 3433. 12...\n4 17.0 8 302.0 140.0 3449. 10...","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>18.0 8 307.0 130.0 3504. 12...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>15.0 8 350.0 165.0 3693. 11...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>18.0 8 318.0 150.0 3436. 11...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>16.0 8 304.0 150.0 3433. 12...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>17.0 8 302.0 140.0 3449. 10...</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# read the file, line by line\nwith open('auto-mpg.data') as f:\n for line_index, line in enumerate(f):\n print(line)\n if line_index == 5:\n break","metadata":{"trusted":true},"execution_count":30,"outputs":[{"name":"stdout","text":"18.0 8 307.0 130.0 3504. 12.0 70 1\t\"chevrolet chevelle malibu\"\n\n15.0 8 350.0 165.0 3693. 11.5 70 1\t\"buick skylark 320\"\n\n18.0 8 318.0 150.0 3436. 11.0 70 1\t\"plymouth satellite\"\n\n16.0 8 304.0 150.0 3433. 12.0 70 1\t\"amc rebel sst\"\n\n17.0 8 302.0 140.0 3449. 10.5 70 1\t\"ford torino\"\n\n15.0 8 429.0 198.0 4341. 10.0 70 1\t\"ford galaxie 500\"\n\n","output_type":"stream"}]},{"cell_type":"code","source":"pd.read_csv('auto-mpg.data', nrows=5, header=None, sep='\\s+')","metadata":{"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":" 0 1 2 3 4 5 6 7 8\n0 18.0 8 307.0 130.0 3504.0 12.0 70 1 chevrolet chevelle malibu\n1 15.0 8 350.0 165.0 3693.0 11.5 70 1 buick skylark 320\n2 18.0 8 318.0 150.0 3436.0 11.0 70 1 plymouth satellite\n3 16.0 8 304.0 150.0 3433.0 12.0 70 1 amc rebel sst\n4 17.0 8 302.0 140.0 3449.0 10.5 70 1 ford torino","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>18.0</td>\n <td>8</td>\n <td>307.0</td>\n <td>130.0</td>\n <td>3504.0</td>\n <td>12.0</td>\n <td>70</td>\n <td>1</td>\n <td>chevrolet chevelle malibu</td>\n </tr>\n <tr>\n <th>1</th>\n <td>15.0</td>\n <td>8</td>\n <td>350.0</td>\n <td>165.0</td>\n <td>3693.0</td>\n <td>11.5</td>\n <td>70</td>\n <td>1</td>\n <td>buick skylark 320</td>\n </tr>\n <tr>\n <th>2</th>\n <td>18.0</td>\n <td>8</td>\n <td>318.0</td>\n <td>150.0</td>\n <td>3436.0</td>\n <td>11.0</td>\n <td>70</td>\n <td>1</td>\n <td>plymouth satellite</td>\n </tr>\n <tr>\n <th>3</th>\n <td>16.0</td>\n <td>8</td>\n <td>304.0</td>\n <td>150.0</td>\n <td>3433.0</td>\n <td>12.0</td>\n <td>70</td>\n <td>1</td>\n <td>amc rebel sst</td>\n </tr>\n <tr>\n <th>4</th>\n <td>17.0</td>\n <td>8</td>\n <td>302.0</td>\n <td>140.0</td>\n <td>3449.0</td>\n <td>10.5</td>\n <td>70</td>\n <td>1</td>\n <td>ford torino</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]}]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment