Last active
October 20, 2020 05:32
-
-
Save horvatha/0710ead976e8b6e7ffa0fb922e9f2df0 to your computer and use it in GitHub Desktop.
Fetching from wikipedia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"toc": true | |
}, | |
"source": [ | |
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n", | |
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#URLs\" data-toc-modified-id=\"URLs-1\"><span class=\"toc-item-num\">1 </span>URLs</a></span></li><li><span><a href=\"#Request-&-Response\" data-toc-modified-id=\"Request-&-Response-2\"><span class=\"toc-item-num\">2 </span>Request & Response</a></span></li><li><span><a href=\"#Wrangling-HTML-With-BeautifulSoup\" data-toc-modified-id=\"Wrangling-HTML-With-BeautifulSoup-3\"><span class=\"toc-item-num\">3 </span>Wrangling HTML With BeautifulSoup</a></span></li><li><span><a href=\"#Title-of-HTML-content\" data-toc-modified-id=\"Title-of-HTML-content-4\"><span class=\"toc-item-num\">4 </span>Title of HTML content</a></span></li><li><span><a href=\"#Find-All-Tables\" data-toc-modified-id=\"Find-All-Tables-5\"><span class=\"toc-item-num\">5 </span>Find All Tables</a></span></li><li><span><a href=\"#Find-Right-Table-to-scrap\" data-toc-modified-id=\"Find-Right-Table-to-scrap-6\"><span class=\"toc-item-num\">6 </span>Find Right Table to scrap</a></span></li><li><span><a href=\"#Number-of-Columns\" data-toc-modified-id=\"Number-of-Columns-7\"><span class=\"toc-item-num\">7 </span>Number of Columns</a></span></li><li><span><a href=\"#Get-the-Rows\" data-toc-modified-id=\"Get-the-Rows-8\"><span class=\"toc-item-num\">8 </span>Get the Rows</a></span></li><li><span><a href=\"#Get-Table-Header-Attributes\" data-toc-modified-id=\"Get-Table-Header-Attributes-9\"><span class=\"toc-item-num\">9 </span>Get Table Header Attributes</a></span></li><li><span><a href=\"#Get-Tablular-Data\" data-toc-modified-id=\"Get-Tablular-Data-10\"><span class=\"toc-item-num\">10 </span>Get Tablular Data</a></span><ul class=\"toc-item\"><li><span><a href=\"#Data-Analysis\" data-toc-modified-id=\"Data-Analysis-10.1\"><span class=\"toc-item-num\">10.1 </span>Data Analysis</a></span></li><li><span><a href=\"#Scrap-the-Data\" data-toc-modified-id=\"Scrap-the-Data-10.2\"><span class=\"toc-item-num\">10.2 </span>Scrap the Data</a></span></li><li><span><a href=\"#Create-Dictionary\" data-toc-modified-id=\"Create-Dictionary-10.3\"><span class=\"toc-item-num\">10.3 </span>Create Dictionary</a></span></li><li><span><a href=\"#Create-DataFrame\" data-toc-modified-id=\"Create-DataFrame-10.4\"><span class=\"toc-item-num\">10.4 </span>Create DataFrame</a></span></li><li><span><a href=\"#Rename-DataFrame-Columns\" data-toc-modified-id=\"Rename-DataFrame-Columns-10.5\"><span class=\"toc-item-num\">10.5 </span>Rename DataFrame Columns</a></span></li><li><span><a href=\"#Top-5-Countries-with-Highest-Population\" data-toc-modified-id=\"Top-5-Countries-with-Highest-Population-10.6\"><span class=\"toc-item-num\">10.6 </span>Top 5 Countries with Highest Population</a></span></li><li><span><a href=\"#Lets-do-some-Clean-Up-!\" data-toc-modified-id=\"Lets-do-some-Clean-Up-!-10.7\"><span class=\"toc-item-num\">10.7 </span>Lets do some Clean Up !</a></span></li></ul></li><li><span><a href=\"#Visuals\" data-toc-modified-id=\"Visuals-11\"><span class=\"toc-item-num\">11 </span>Visuals</a></span></li></ul></div>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 82, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# for performing your HTTP requests\n", | |
"import requests \n", | |
"\n", | |
"# for xml & html scrapping \n", | |
"from bs4 import BeautifulSoup \n", | |
"\n", | |
"# for table analysis\n", | |
"import pandas as pd\n", | |
"\n", | |
"# write to csv\n", | |
"import csv\n", | |
"\n", | |
"# Time\n", | |
"import time\n", | |
"\n", | |
"#Visuals\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## URLs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# url of wikipedia page from which you want to scrap tabular data.\n", | |
"url1 = \"https://en.wikipedia.org/wiki/List_of_21st-century_classical_composers\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Request & Response" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<Response [200]>" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Session helps to object allows you to persist certain parameters across requests\n", | |
"# By default, Request will keep waiting for a response indefinitely. Therefore, it is advised to set the timeout parameter.\n", | |
"# If the request was successful, you should see the reponse output as '200'.\n", | |
"s = requests.Session()\n", | |
"response = s.get(url1, timeout=10)\n", | |
"#response2 = s.get(url2, timeout=5)\n", | |
"response\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Wrangling HTML With BeautifulSoup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# parse response content to html\n", | |
"soup = BeautifulSoup(response.content, 'html.parser')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# to view the content in html format\n", | |
"pretty_soup = soup.prettify()\n", | |
"# print(pretty_soup)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Title of HTML content" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'List of 21st-century classical composers - Wikipedia'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# title of Wikipedia page\n", | |
"soup.title.string" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Find All Tables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# find all the tables in the html\n", | |
"all_tables=soup.find_all('table')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Find Right Table to scrap" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# get right table to scrap\n", | |
"right_table=soup.find('table', {\"class\":'wikitable sortable'})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Number of Columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"6" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Number of columns in the table\n", | |
"for row in right_table.findAll(\"tr\"):\n", | |
" cells = row.findAll('td')\n", | |
"\n", | |
"len(cells)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get the Rows" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"2924" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# number of rows in the table including header\n", | |
"rows = right_table.findAll(\"tr\")\n", | |
"len(rows)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get Table Header Attributes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['Name', 'Year of', 'Nationality', 'Notable 21st-century works', 'Remarks']\n", | |
"------------\n", | |
"5\n" | |
] | |
} | |
], | |
"source": [ | |
"# header attributes of the table\n", | |
"header = [th.text.rstrip() for th in rows[0].find_all('th')]\n", | |
"print(header)\n", | |
"print('------------')\n", | |
"print(len(header))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Get Tablular Data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Data Analysis" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"first_data_row = 2 # Usually 1, but if the header has 2 rows, it is 2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"lst_data_raw = []\n", | |
"for row in rows[first_data_row:]:\n", | |
" tds = row.select('td')\n", | |
" name_col = tds[0]\n", | |
" href = name_col.a['href']\n", | |
" article = href.split('/')[-1] if href.startswith('/wiki') else ''\n", | |
" data = [article, name_col.text.rstrip()]\n", | |
" data.extend([d.text.rstrip() for d in tds[1:]])\n", | |
" lst_data_raw.append(data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"/w/index.php?title=Carl_Bergstr%C3%B8m-Nielsen&action=edit&redlink=1\n", | |
"/w/index.php?title=Javier_Jacinto&action=edit&redlink=1\n", | |
"/w/index.php?title=Zoltan_Paulinyi&action=edit&redlink=1\n", | |
"/w/index.php?title=Matthias_Kadar&action=edit&redlink=1\n", | |
"/w/index.php?title=Robert_R%C3%B8nnes&action=edit&redlink=1\n" | |
] | |
} | |
], | |
"source": [ | |
"for row in rows[first_data_row:]:\n", | |
" tds = row.select('td')\n", | |
" name_col = tds[0]\n", | |
" href = name_col.a['href']\n", | |
" if not href.startswith('/wiki'):\n", | |
" print(href)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['Cassandra_Miller',\n", | |
" 'Cassandra Miller',\n", | |
" '1976',\n", | |
" '',\n", | |
" 'Canadian',\n", | |
" 'About Bach (string quartet)',\n", | |
" ''],\n", | |
" ['Caio_Fac%C3%B3',\n", | |
" 'Caio Facó',\n", | |
" '1992',\n", | |
" '',\n", | |
" 'Brazilian',\n", | |
" 'Diário das Narrativas Fantásticas: Uma fantasia sobre a história da América do Sul (for chamber orchestra)[1]',\n", | |
" 'A major work about Latin American culture'],\n", | |
" ['Rolf_Riehm', 'Rolf Riehm', '1937', '', 'German', 'Sirenen (opera)', '']]" | |
] | |
}, | |
"execution_count": 114, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# sample records\n", | |
"lst_data_raw[0:3]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"7" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# length of each record\n", | |
"len(lst_data_raw[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Number of row : 2924\n", | |
"----------------\n", | |
"<tr>\n", | |
"<td><a href=\"/wiki/Cassandra_Miller\" title=\"Cassandra Miller\">Cassandra Miller</a></td>\n", | |
"<td>1976</td>\n", | |
"<td></td>\n", | |
"<td>Canadian</td>\n", | |
"<td><i>About Bach</i> (string quartet)</td>\n", | |
"<td>\n", | |
"</td></tr>\n", | |
"----------------\n", | |
"Second Attribute is has link reference\n", | |
"----------------\n", | |
"[<th data-sort-type=\"text\" rowspan=\"2\">Name\n", | |
"</th>, <th colspan=\"2\">Year of\n", | |
"</th>, <th rowspan=\"2\">Nationality\n", | |
"</th>, <th rowspan=\"2\">Notable 21st-century works\n", | |
"</th>, <th rowspan=\"2\">Remarks\n", | |
"</th>]\n", | |
"----------------\n", | |
"Cassandra Miller\n" | |
] | |
} | |
], | |
"source": [ | |
"# html of each table record\n", | |
"\n", | |
"list_row = []\n", | |
"for row in right_table.findAll(\"tr\"):\n", | |
" list_row.append(row)\n", | |
"\n", | |
" \n", | |
"print('Number of row :',len(list_row))\n", | |
"print('----------------')\n", | |
"print(list_row[first_data_row])\n", | |
"print('----------------')\n", | |
"print('Second Attribute is has link reference')\n", | |
"print('----------------')\n", | |
"print(list_row[0].findAll('th'))\n", | |
"print('----------------')\n", | |
"print(list_row[first_data_row].find('a').text)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Scrap the Data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create Dictionary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 115, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['Cassandra_Miller',\n", | |
" 'Cassandra Miller',\n", | |
" '1976',\n", | |
" '',\n", | |
" 'Canadian',\n", | |
" 'About Bach (string quartet)',\n", | |
" ''],\n", | |
" ['Caio_Fac%C3%B3',\n", | |
" 'Caio Facó',\n", | |
" '1992',\n", | |
" '',\n", | |
" 'Brazilian',\n", | |
" 'Diário das Narrativas Fantásticas: Uma fantasia sobre a história da América do Sul (for chamber orchestra)[1]',\n", | |
" 'A major work about Latin American culture'],\n", | |
" ['Rolf_Riehm', 'Rolf Riehm', '1937', '', 'German', 'Sirenen (opera)', ''],\n", | |
" ['Hiro_Fujikake',\n", | |
" 'Hiro Fujikake',\n", | |
" '1949',\n", | |
" '',\n", | |
" 'Japan',\n", | |
" 'Pastoral Fantasy (1975),The Rope Crest (1977), Symphony Japan(1993), Symphony IZUMO(2005)',\n", | |
" '']]" | |
] | |
}, | |
"execution_count": 115, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lst_data_raw[:4]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data = list(zip(*lst_data_raw))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('Cassandra_Miller', 'Caio_Fac%C3%B3', 'Rolf_Riehm', 'Hiro_Fujikake'),\n", | |
" ('Cassandra Miller', 'Caio Facó', 'Rolf Riehm', 'Hiro Fujikake'),\n", | |
" ('1976', '1992', '1937', '1949'),\n", | |
" ('', '', '', ''),\n", | |
" ('Canadian', 'Brazilian', 'German', 'Japan'),\n", | |
" ('About Bach (string quartet)',\n", | |
" 'Diário das Narrativas Fantásticas: Uma fantasia sobre a história da América do Sul (for chamber orchestra)[1]',\n", | |
" 'Sirenen (opera)',\n", | |
" 'Pastoral Fantasy (1975),The Rope Crest (1977), Symphony Japan(1993), Symphony IZUMO(2005)')]" | |
] | |
}, | |
"execution_count": 117, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"[i[:4] for i in data]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dat = dict(zip(['article', 'name', 'birth', 'death', 'nationality', 'notable_21_century_works', 'comment'], data))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# dat" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Create DataFrame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>article</th>\n", | |
" <th>name</th>\n", | |
" <th>birth</th>\n", | |
" <th>death</th>\n", | |
" <th>nationality</th>\n", | |
" <th>notable_21_century_works</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Cassandra_Miller</td>\n", | |
" <td>Cassandra Miller</td>\n", | |
" <td>1976</td>\n", | |
" <td></td>\n", | |
" <td>Canadian</td>\n", | |
" <td>About Bach (string quartet)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Caio_Fac%C3%B3</td>\n", | |
" <td>Caio Facó</td>\n", | |
" <td>1992</td>\n", | |
" <td></td>\n", | |
" <td>Brazilian</td>\n", | |
" <td>Diário das Narrativas Fantásticas: Uma fantasi...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Rolf_Riehm</td>\n", | |
" <td>Rolf Riehm</td>\n", | |
" <td>1937</td>\n", | |
" <td></td>\n", | |
" <td>German</td>\n", | |
" <td>Sirenen (opera)</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Hiro_Fujikake</td>\n", | |
" <td>Hiro Fujikake</td>\n", | |
" <td>1949</td>\n", | |
" <td></td>\n", | |
" <td>Japan</td>\n", | |
" <td>Pastoral Fantasy (1975),The Rope Crest (1977),...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Ann_Cleare</td>\n", | |
" <td>Ann Cleare</td>\n", | |
" <td>1983</td>\n", | |
" <td></td>\n", | |
" <td>Irish</td>\n", | |
" <td>Claustrophobia (orchestra)</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" article name birth death nationality \\\n", | |
"0 Cassandra_Miller Cassandra Miller 1976 Canadian \n", | |
"1 Caio_Fac%C3%B3 Caio Facó 1992 Brazilian \n", | |
"2 Rolf_Riehm Rolf Riehm 1937 German \n", | |
"3 Hiro_Fujikake Hiro Fujikake 1949 Japan \n", | |
"4 Ann_Cleare Ann Cleare 1983 Irish \n", | |
"\n", | |
" notable_21_century_works \n", | |
"0 About Bach (string quartet) \n", | |
"1 Diário das Narrativas Fantásticas: Uma fantasi... \n", | |
"2 Sirenen (opera) \n", | |
"3 Pastoral Fantasy (1975),The Rope Crest (1977),... \n", | |
"4 Claustrophobia (orchestra) " | |
] | |
}, | |
"execution_count": 119, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# convert dict to DataFrame\n", | |
"df = pd.DataFrame(dat)\n", | |
"\n", | |
"# Top 5 records\n", | |
"df.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>article</th>\n", | |
" <th>Name</th>\n", | |
" <th>birth</th>\n", | |
" <th>death</th>\n", | |
" <th>Nationality</th>\n", | |
" <th>notable_works</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2917</th>\n", | |
" <td>Juliana_Hall</td>\n", | |
" <td>Juliana Hall</td>\n", | |
" <td>1958</td>\n", | |
" <td></td>\n", | |
" <td>American</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2918</th>\n", | |
" <td>Brian_T._Field</td>\n", | |
" <td>Brian T. Field</td>\n", | |
" <td>1967</td>\n", | |
" <td></td>\n", | |
" <td>American</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2919</th>\n", | |
" <td>Craig_Bohmler</td>\n", | |
" <td>Craig Bohmler</td>\n", | |
" <td>1956</td>\n", | |
" <td></td>\n", | |
" <td>American</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2920</th>\n", | |
" <td>Evgeny_Kissin</td>\n", | |
" <td>Evgeny Kissin</td>\n", | |
" <td>1971</td>\n", | |
" <td></td>\n", | |
" <td>Russian, British, Israeli</td>\n", | |
" <td>Tango, Meditation, Intermezzo, Toccata, Violon...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2921</th>\n", | |
" <td>Andrew_March</td>\n", | |
" <td>Andrew March</td>\n", | |
" <td>1973</td>\n", | |
" <td></td>\n", | |
" <td>English</td>\n", | |
" <td>Sanguis Venenatus</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" article Name birth death Nationality \\\n", | |
"2917 Juliana_Hall Juliana Hall 1958 American \n", | |
"2918 Brian_T._Field Brian T. Field 1967 American \n", | |
"2919 Craig_Bohmler Craig Bohmler 1956 American \n", | |
"2920 Evgeny_Kissin Evgeny Kissin 1971 Russian, British, Israeli \n", | |
"2921 Andrew_March Andrew March 1973 English \n", | |
"\n", | |
" notable_works \n", | |
"2917 \n", | |
"2918 \n", | |
"2919 \n", | |
"2920 Tango, Meditation, Intermezzo, Toccata, Violon... \n", | |
"2921 Sanguis Venenatus " | |
] | |
}, | |
"execution_count": 70, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Last 5 records\n", | |
"df.tail(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df['era'] = '21st-century classical'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Saving" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv('composers_21_century.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 123, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>article</th>\n", | |
" <th>name</th>\n", | |
" <th>birth</th>\n", | |
" <th>death</th>\n", | |
" <th>nationality</th>\n", | |
" <th>notable_21_century_works</th>\n", | |
" <th>era</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>Eric_Salzman</td>\n", | |
" <td>Eric Salzman</td>\n", | |
" <td>1933</td>\n", | |
" <td>2017</td>\n", | |
" <td>American</td>\n", | |
" <td>The True Last Words of Dutch Schultz (opera)</td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>J%C3%B3hann_J%C3%B3hannsson</td>\n", | |
" <td>Jóhann Jóhannsson</td>\n", | |
" <td>1969</td>\n", | |
" <td>2018</td>\n", | |
" <td>Icelandic</td>\n", | |
" <td>And in the Endless Pause There Came the Sound ...</td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>Otomar_Kv%C4%9Bch</td>\n", | |
" <td>Otomar Kvěch</td>\n", | |
" <td>1950</td>\n", | |
" <td>2018</td>\n", | |
" <td>Czech</td>\n", | |
" <td>String Quartet No. 7</td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>Milko_Kelemen</td>\n", | |
" <td>Milko Kelemen</td>\n", | |
" <td>1924</td>\n", | |
" <td>2018</td>\n", | |
" <td>Croatian</td>\n", | |
" <td>Tromberia for Trumpet and Orchestra</td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>Graciela_Agudelo</td>\n", | |
" <td>Graciela Agudelo</td>\n", | |
" <td>1945</td>\n", | |
" <td>2018</td>\n", | |
" <td>Mexican</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2892</th>\n", | |
" <td>Ruth_Zechlin</td>\n", | |
" <td>Ruth Zechlin</td>\n", | |
" <td>1926</td>\n", | |
" <td>2007</td>\n", | |
" <td>German</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2894</th>\n", | |
" <td>Friedrich_Zehm</td>\n", | |
" <td>Friedrich Zehm</td>\n", | |
" <td>1923</td>\n", | |
" <td>2007</td>\n", | |
" <td>German</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2896</th>\n", | |
" <td>Hans_Zender</td>\n", | |
" <td>Hans Zender</td>\n", | |
" <td>1936</td>\n", | |
" <td>2019</td>\n", | |
" <td>German</td>\n", | |
" <td>Chief Joseph (opera)</td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2898</th>\n", | |
" <td>Zhu_Jian%27er</td>\n", | |
" <td>Zhu Jian'er</td>\n", | |
" <td>1922</td>\n", | |
" <td>2017</td>\n", | |
" <td>Chinese</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2902</th>\n", | |
" <td>Marilyn_J._Ziffrin</td>\n", | |
" <td>Marilyn J. Ziffrin</td>\n", | |
" <td>1926</td>\n", | |
" <td>2018</td>\n", | |
" <td>American</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>693 rows × 7 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" article name birth death nationality \\\n", | |
"7 Eric_Salzman Eric Salzman 1933 2017 American \n", | |
"9 J%C3%B3hann_J%C3%B3hannsson Jóhann Jóhannsson 1969 2018 Icelandic \n", | |
"10 Otomar_Kv%C4%9Bch Otomar Kvěch 1950 2018 Czech \n", | |
"12 Milko_Kelemen Milko Kelemen 1924 2018 Croatian \n", | |
"14 Graciela_Agudelo Graciela Agudelo 1945 2018 Mexican \n", | |
"... ... ... ... ... ... \n", | |
"2892 Ruth_Zechlin Ruth Zechlin 1926 2007 German \n", | |
"2894 Friedrich_Zehm Friedrich Zehm 1923 2007 German \n", | |
"2896 Hans_Zender Hans Zender 1936 2019 German \n", | |
"2898 Zhu_Jian%27er Zhu Jian'er 1922 2017 Chinese \n", | |
"2902 Marilyn_J._Ziffrin Marilyn J. Ziffrin 1926 2018 American \n", | |
"\n", | |
" notable_21_century_works \\\n", | |
"7 The True Last Words of Dutch Schultz (opera) \n", | |
"9 And in the Endless Pause There Came the Sound ... \n", | |
"10 String Quartet No. 7 \n", | |
"12 Tromberia for Trumpet and Orchestra \n", | |
"14 \n", | |
"... ... \n", | |
"2892 \n", | |
"2894 \n", | |
"2896 Chief Joseph (opera) \n", | |
"2898 \n", | |
"2902 \n", | |
"\n", | |
" era \n", | |
"7 21st-century classical \n", | |
"9 21st-century classical \n", | |
"10 21st-century classical \n", | |
"12 21st-century classical \n", | |
"14 21st-century classical \n", | |
"... ... \n", | |
"2892 21st-century classical \n", | |
"2894 21st-century classical \n", | |
"2896 21st-century classical \n", | |
"2898 21st-century classical \n", | |
"2902 21st-century classical \n", | |
"\n", | |
"[693 rows x 7 columns]" | |
] | |
}, | |
"execution_count": 123, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[~(df.death == \"\")]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 122, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>article</th>\n", | |
" <th>name</th>\n", | |
" <th>birth</th>\n", | |
" <th>death</th>\n", | |
" <th>nationality</th>\n", | |
" <th>notable_21_century_works</th>\n", | |
" <th>era</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>181</th>\n", | |
" <td></td>\n", | |
" <td>Carl Bergstrøm-Nielsen [da; de]</td>\n", | |
" <td>1951</td>\n", | |
" <td></td>\n", | |
" <td>Danish</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>224</th>\n", | |
" <td></td>\n", | |
" <td>Javier Jacinto [es; eu]</td>\n", | |
" <td>1968</td>\n", | |
" <td></td>\n", | |
" <td>Spanish</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>821</th>\n", | |
" <td></td>\n", | |
" <td>Zoltan Paulinyi</td>\n", | |
" <td>1977</td>\n", | |
" <td></td>\n", | |
" <td>Brazilian</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1648</th>\n", | |
" <td></td>\n", | |
" <td>Matthias Kadar</td>\n", | |
" <td>1977</td>\n", | |
" <td></td>\n", | |
" <td>Hungarian-German</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2577</th>\n", | |
" <td></td>\n", | |
" <td>Robert Rønnes [no]</td>\n", | |
" <td>1959</td>\n", | |
" <td></td>\n", | |
" <td>Norwegian</td>\n", | |
" <td></td>\n", | |
" <td>21st-century classical</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" article name birth death nationality \\\n", | |
"181 Carl Bergstrøm-Nielsen [da; de] 1951 Danish \n", | |
"224 Javier Jacinto [es; eu] 1968 Spanish \n", | |
"821 Zoltan Paulinyi 1977 Brazilian \n", | |
"1648 Matthias Kadar 1977 Hungarian-German \n", | |
"2577 Robert Rønnes [no] 1959 Norwegian \n", | |
"\n", | |
" notable_21_century_works era \n", | |
"181 21st-century classical \n", | |
"224 21st-century classical \n", | |
"821 21st-century classical \n", | |
"1648 21st-century classical \n", | |
"2577 21st-century classical " | |
] | |
}, | |
"execution_count": 122, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[df.article == '']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"article object\n", | |
"Name object\n", | |
"birth object\n", | |
"death object\n", | |
"Nationality object\n", | |
"notable_21_century_works object\n", | |
"era object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 86, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": true, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": true | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment