Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save clemsos/de225e4b683a4643d01e8f50f5fa050b to your computer and use it in GitHub Desktop.
Save clemsos/de225e4b683a4643d01e8f50f5fa050b to your computer and use it in GitHub Desktop.
Data Parser - List of public corporations by market capitalization
{
"cells": [
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import csv\n",
"import pandas as pd\n",
"\n",
"# Clean data\n",
"#\n",
"# From https://en.wikipedia.org/wiki/List_of_public_corporations_by_market_capitalization\n",
"# Data Source : https://docs.google.com/spreadsheets/d/1NxEouZb0AHGZ3OcwUPp8Ga1LmBjp37NABtwdcYSE91I/edit#gid=115560802\n",
"\n",
"# Visualization\n",
"# http://blockbuilder.org/clemsos/4923d7601f5e4b208b740123679047f7"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(80, 10)\n"
]
}
],
"source": [
"# parse fist CSV\n",
"first = \"raw/Data - List of public corporations by market capitalization - 2007-2006.csv\"\n",
"\n",
"df = pd.read_csv(first)\n",
"df.dropna(how=\"all\", inplace=True) # drop empty lines\n",
"\n",
"_id = df[\"Year\"].astype(int).map(str) + \"-\" + df[\"Q\"]+\"-\"+df[\"Rank\"].astype(int).map(str)\n",
"q = df[\"Name\"].str.split(' ', expand=True)\n",
"final_first = pd.concat([_id, q, df], axis=1, join='inner')\n",
"final_first = final_first.drop(['Name'], axis=1)\n",
"\n",
"# rename columns\n",
"final_first.columns.values[0] = 'Id'\n",
"final_first.columns.values[1] = 'Company'\n",
"final_first.columns.values[2] = 'Company Link'\n",
"final_first = final_first.rename(index=str, columns={\"Q\": \"Quarter\"})\n",
"final_first[\"Headquarters Link\"] = None\n",
"\n",
"final_first.head()\n",
"\n",
"print final_first.shape\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(100, 10)\n",
"(100, 11)\n",
"(100, 11)\n",
"(100, 11)\n",
"(100, 11)\n",
"(400, 11)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Id</th>\n",
" <th>Company</th>\n",
" <th>Company Link</th>\n",
" <th>Market value</th>\n",
" <th>Notes</th>\n",
" <th>Year</th>\n",
" <th>Quarter</th>\n",
" <th>Rank</th>\n",
" <th>Headquarters</th>\n",
" <th>Headquarters Link</th>\n",
" <th>Primary industry</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2017-Q1-3</td>\n",
" <td>Microsoft</td>\n",
" <td>(https://en.wikipedia.org/wiki/Microsoft)</td>\n",
" <td>508,935</td>\n",
" <td>None</td>\n",
" <td>2017</td>\n",
" <td>Q1</td>\n",
" <td>3</td>\n",
" <td>United_States</td>\n",
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2017-Q1-1</td>\n",
" <td>Apple Inc.</td>\n",
" <td>(https://en.wikipedia.org/wiki/Apple_Inc.)</td>\n",
" <td>753,718</td>\n",
" <td>None</td>\n",
" <td>2017</td>\n",
" <td>Q1</td>\n",
" <td>1</td>\n",
" <td>United_States</td>\n",
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2017-Q1-4</td>\n",
" <td>Amazon.com</td>\n",
" <td>(https://en.wikipedia.org/wiki/Amazon.com)</td>\n",
" <td>423,031</td>\n",
" <td>None</td>\n",
" <td>2017</td>\n",
" <td>Q1</td>\n",
" <td>4</td>\n",
" <td>United_States</td>\n",
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2017-Q1-2</td>\n",
" <td>Alphabet Inc.</td>\n",
" <td>(https://en.wikipedia.org/wiki/Alphabet_Inc.)</td>\n",
" <td>573,570 [14]</td>\n",
" <td>(#cite_note-ya-15)</td>\n",
" <td>2017</td>\n",
" <td>Q1</td>\n",
" <td>2</td>\n",
" <td>United_States</td>\n",
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2017-Q1-5</td>\n",
" <td>Berkshire Hathaway</td>\n",
" <td>(https://en.wikipedia.org/wiki/Berkshire_Hatha...</td>\n",
" <td>410,880 [15]</td>\n",
" <td>(#cite_note-yb-16)</td>\n",
" <td>2017</td>\n",
" <td>Q1</td>\n",
" <td>5</td>\n",
" <td>United_States</td>\n",
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Id Company \\\n",
"0 2017-Q1-3 Microsoft \n",
"1 2017-Q1-1 Apple Inc. \n",
"2 2017-Q1-4 Amazon.com \n",
"3 2017-Q1-2 Alphabet Inc. \n",
"4 2017-Q1-5 Berkshire Hathaway \n",
"\n",
" Company Link Market value \\\n",
"0 (https://en.wikipedia.org/wiki/Microsoft) 508,935 \n",
"1 (https://en.wikipedia.org/wiki/Apple_Inc.) 753,718 \n",
"2 (https://en.wikipedia.org/wiki/Amazon.com) 423,031 \n",
"3 (https://en.wikipedia.org/wiki/Alphabet_Inc.) 573,570 [14] \n",
"4 (https://en.wikipedia.org/wiki/Berkshire_Hatha... 410,880 [15] \n",
"\n",
" Notes Year Quarter Rank Headquarters \\\n",
"0 None 2017 Q1 3 United_States \n",
"1 None 2017 Q1 1 United_States \n",
"2 None 2017 Q1 4 United_States \n",
"3 (#cite_note-ya-15) 2017 Q1 2 United_States \n",
"4 (#cite_note-yb-16) 2017 Q1 5 United_States \n",
"\n",
" Headquarters Link Primary industry \n",
"0 (https://en.wikipedia.org/wiki/United_States) None \n",
"1 (https://en.wikipedia.org/wiki/United_States) None \n",
"2 (https://en.wikipedia.org/wiki/United_States) None \n",
"3 (https://en.wikipedia.org/wiki/United_States) None \n",
"4 (https://en.wikipedia.org/wiki/United_States) None "
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# parse properly second CSV\n",
"second = \"raw/Data - List of public corporations by market capitalization - 2008-2017.csv\"\n",
"df = pd.read_csv(second)\n",
"\n",
"df.dropna(how=\"all\", inplace=True) # drop empty lines\n",
"\n",
"print df.shape\n",
"\n",
"col_names = [\n",
" ('Q1', 'Q1 (Country)'),\n",
" ('Q2', 'Q2 (Country)'),\n",
" ('Q3', 'Q3 (Country'),\n",
" ('Q4', 'Q4 (Country)')\n",
"]\n",
"\n",
"final_second = pd.DataFrame()\n",
"\n",
"for cols in col_names:\n",
" \n",
" _id = df[\"Year\"].astype(int).map(str) + \"-\" + cols[0]+\"-\"+df[\"Rank\"].astype(int).map(str)\n",
" \n",
" q = df[cols[0]].str.split(' ', expand=True)\n",
" q.columns = [\"Company\", \"Company Link\", \"Market value\", \"Notes\"]\n",
" \n",
" result = pd.concat([_id, q], axis=1, join='inner')\n",
" \n",
" result.columns.values[0] = 'Id'\n",
" result[\"Year\"] = df[\"Year\"].astype(int)\n",
" result[\"Quarter\"]= cols[0] # add quarter\n",
" result[\"Rank\"] = df[\"Rank\"].astype(int)\n",
" \n",
" result[\"Headquarters\"] = df[cols[1]].str[1:-1].str.split('/', expand=True)[4]\n",
" result[\"Headquarters Link\"] = df[cols[1]]\n",
" \n",
" result[\"Primary industry\"] = None\n",
" \n",
" print result.shape\n",
"\n",
" final_second = final_second.append(result)\n",
"\n",
"print final_second.shape\n",
"final_second.head()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"44 companies\n",
"Fetched info about 44 companies.\n"
]
}
],
"source": [
"# fetch missing values from Wikipedia\n",
"import os\n",
"import json\n",
"import wptools\n",
"from slugify import slugify\n",
"\n",
"# get unique values\n",
"companies = list(final_first.append(final_second)['Company'].unique())\n",
"print \"%s companies\"%len(companies)\n",
"\n",
"# Count files in folder\n",
"# print len(os.listdir('./companies'))\n",
"\n",
"companies_info = {}\n",
"\n",
"for company_name in companies:\n",
" \n",
" if type(company_name) is str :\n",
" \n",
" json_file_name = \"companies/%s.json\"%slugify(company_name.decode('utf-8'))\n",
"\n",
" if not os.path.exists(json_file_name): \n",
" print \"fetching %s...\"%company_name\n",
" # fetch infobox\n",
" page = wptools.page(company_name).get_parse()\n",
" infobox = page.data['infobox']\n",
"\n",
" #save infobox as json file\n",
"\n",
" print json_file_name\n",
" with open(json_file_name, 'w') as outfile:\n",
" json.dump(dict(infobox), outfile)\n",
" print \"saved to %s\"%json_file_name\n",
" print \n",
" else : \n",
" with open(json_file_name, 'r') as outfile:\n",
" companies_info[company_name] = json.load(outfile)\n",
"\n",
"print \"Fetched info about %s companies.\"%len(companies_info.keys())"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Industries parsed.\n"
]
}
],
"source": [
"# sort by categories\n",
"\n",
"# Plateformes numériques : Conglomerate (Alphabet + Tencent) Internet (alibaba), Facebook, Amazon, Apple (>juillet 2008), Microsoft (>juin 2009)\n",
"# Technologie de l’information : Cloud computing (IBM), Computer software, Computer hardware, Consumer electronics, Apple (<juillet 2008), Microsoft (<juin 2009)\n",
"# Banque & finance: Banking, Conglomerate (Berskshire), Financial services\n",
"# Energie : Energy industry, Oil and gas, \n",
"# Médical & Pharma : Pharmaceutical industry, Medical equipment\n",
"# Télécom : Telecommunications\n",
"# Distribution : Retail, Online shopping, Fast-moving consumer goods (1 seul : Procter & Gamble)\n",
"# Equipements et resources : Conglomerate (GE), Automotive industry, Food processing, Metals,\n",
"\n",
"\n",
"# parse industry for each result\n",
"industries = {}\n",
"\n",
"french_names = {\n",
" \"Plateformes numériques\" : [\"Internet\", \"Cloud computing\", ],\n",
" \"Technologie de l’information\" : [\"Computer software\", \"Computer hardware\", \"Consumer electronics\"],\n",
" \"Banque & Finance\" : [\"Bank\", \"Banking\", \"Financial services\"],\n",
" \"Energie\" : [\"Energy industry\", \"Oil and gas\", \"Petroleum industry\", \"List of petroleum companies\", \"Oil and gas industry\"],\n",
" \"Médical & Pharma\" : [\"Pharmaceutical industry\", \"Medical equipment\"],\n",
" \"Distribution\" : [\"Retail\", \"Online shopping\", \"Fast-moving consumer goods\"],\n",
" \"Télécom\" : [\"Telecommunication\", \"Telecommunications\"],\n",
" \"Equipements & Ressources\" : [\"Automotive industry\", \"Food processing\", \"Metals\"]\n",
"}\n",
"\n",
"translations = {}\n",
"for fr in french_names: \n",
" for name in french_names[fr] :\n",
" translations[name] = fr \n",
"# print translations\n",
" \n",
"for company_name in companies_info :\n",
" info = companies_info[company_name] \n",
" s = info[u'industry']\n",
" industry = s[s.find(\"[[\")+1:s.find(\"]]\")][1:].split(\"|\")[0]\n",
" \n",
" if industry == \"Conglomerate (company)\" or industry == \"Conglomerate (company)\":\n",
" if company_name == \"Berkshire Hathaway\":\n",
" industries[company_name] = \"Banque & Finance\"\n",
" elif company_name == \"General Electric\":\n",
" industries[company_name] = \"Equipements & Ressources\"\n",
" elif company_name == \"Alphabet Inc.\":\n",
" industries[company_name] = \"Plateformes numériques\"\n",
" elif company_name == \"Tencent\":\n",
" industries[company_name] = \"Plateformes numériques\" \n",
" else :\n",
" if company_name == \"Amazon.com\": \n",
" industries[company_name] = \"Plateformes numériques\"\n",
" else :\n",
" industries[company_name] = translations[industry]\n",
"# print company_name\n",
"\n",
"# print industries.keys()\n",
"print 'Industries parsed.'\n",
"# print industries"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(480, 11)\n",
"results saved to \"./final.csv\"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Company</th>\n",
" <th>Company Link</th>\n",
" <th>Headquarters</th>\n",
" <th>Id</th>\n",
" <th>Market value</th>\n",
" <th>Primary industry</th>\n",
" <th>Quarter</th>\n",
" <th>Rank</th>\n",
" <th>Year</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Petrochina</td>\n",
" <td>https://en.wikipedia.org/wiki/Petrochina</td>\n",
" <td>China</td>\n",
" <td>2007-Q4-1</td>\n",
" <td>723,952</td>\n",
" <td>Energie</td>\n",
" <td>Q4</td>\n",
" <td>1</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Exxon Mobil</td>\n",
" <td>https://en.wikipedia.org/wiki/Exxon_Mobil</td>\n",
" <td>United States</td>\n",
" <td>2007-Q4-2</td>\n",
" <td>511,887</td>\n",
" <td>Energie</td>\n",
" <td>Q4</td>\n",
" <td>2</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>General Electric</td>\n",
" <td>https://en.wikipedia.org/wiki/General_Electric</td>\n",
" <td>United States</td>\n",
" <td>2007-Q4-3</td>\n",
" <td>374,637</td>\n",
" <td>Equipements &amp; Ressources</td>\n",
" <td>Q4</td>\n",
" <td>3</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>China Mobile</td>\n",
" <td>https://en.wikipedia.org/wiki/China_Mobile</td>\n",
" <td>Hong Kong</td>\n",
" <td>2007-Q4-4</td>\n",
" <td>354,120</td>\n",
" <td>Télécom</td>\n",
" <td>Q4</td>\n",
" <td>4</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Industrial and Commercial Bank of China</td>\n",
" <td>https://en.wikipedia.org/wiki/Industrial_and_C...</td>\n",
" <td>China</td>\n",
" <td>2007-Q4-5</td>\n",
" <td>338,989</td>\n",
" <td>Banque &amp; Finance</td>\n",
" <td>Q4</td>\n",
" <td>5</td>\n",
" <td>2007</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Company \\\n",
"0 Petrochina \n",
"1 Exxon Mobil \n",
"2 General Electric \n",
"3 China Mobile \n",
"4 Industrial and Commercial Bank of China \n",
"\n",
" Company Link Headquarters \\\n",
"0 https://en.wikipedia.org/wiki/Petrochina China \n",
"1 https://en.wikipedia.org/wiki/Exxon_Mobil United States \n",
"2 https://en.wikipedia.org/wiki/General_Electric United States \n",
"3 https://en.wikipedia.org/wiki/China_Mobile Hong Kong \n",
"4 https://en.wikipedia.org/wiki/Industrial_and_C... China \n",
"\n",
" Id Market value Primary industry Quarter Rank Year \n",
"0 2007-Q4-1 723,952 Energie Q4 1 2007 \n",
"1 2007-Q4-2 511,887 Energie Q4 2 2007 \n",
"2 2007-Q4-3 374,637 Equipements & Ressources Q4 3 2007 \n",
"3 2007-Q4-4 354,120 Télécom Q4 4 2007 \n",
"4 2007-Q4-5 338,989 Banque & Finance Q4 5 2007 "
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# merge both\n",
"final = final_first.append(final_second)\n",
"print final.shape\n",
"\n",
"# remap all values using updated data\n",
"final[\"Primary industry\"] = final[\"Company\"]\n",
"final = final.replace({ \"Primary industry\" : industries })\n",
"# print final.replace({ \"Primary industry\" : industries })\n",
"\n",
"# move Apple and Microsoft after 2016\n",
"final.ix[ (final['Company'] == \"Apple Inc.\") & (final['Year'] > 2008), \"Primary industry\"] = \"Plateformes numériques\" \n",
"final.ix[(final['Company'] == \"Microsoft\") & (final['Year'] > 2008), \"Primary industry\"] = \"Plateformes numériques\"\n",
"\n",
"# some final clean\n",
"final = final.drop([\"Notes\", \"Headquarters Link\"], axis=1)\n",
"final[\"Market value\"] = final[\"Market value\"].str.split('[', expand=True)[0] # remove\n",
"final[\"Company Link\"] = final[\"Company Link\"].str[1:-1]\n",
"# final[\"Primary industry\"] = final[\"Primary industry\"].str.split(' ', expand=True)[0] # remove\n",
"final.head()\n",
"\n",
"# save as file\n",
"final.to_csv(\"Data - List of public corporations by market capitalization - FINAL.csv\", index=False)\n",
"print 'results saved to \"./final.csv\"'\n",
"\n",
"final.head()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"results saved.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>2006-Q1</th>\n",
" <th>2006-Q2</th>\n",
" <th>2006-Q3</th>\n",
" <th>2006-Q4</th>\n",
" <th>2007-Q1</th>\n",
" <th>2007-Q2</th>\n",
" <th>2007-Q3</th>\n",
" <th>2007-Q4</th>\n",
" <th>2008-Q1</th>\n",
" <th>2008-Q2</th>\n",
" <th>...</th>\n",
" <th>2015-Q3</th>\n",
" <th>2015-Q4</th>\n",
" <th>2016-Q1</th>\n",
" <th>2016-Q2</th>\n",
" <th>2016-Q3</th>\n",
" <th>2016-Q4</th>\n",
" <th>2017-Q1</th>\n",
" <th>2017-Q2</th>\n",
" <th>2017-Q3</th>\n",
" <th>2017-Q4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Banque &amp; Finance</th>\n",
" <td>450641.0</td>\n",
" <td>661220.0</td>\n",
" <td>698952.0</td>\n",
" <td>768041.0</td>\n",
" <td>705822.0</td>\n",
" <td>470666.0</td>\n",
" <td>734497.0</td>\n",
" <td>338989.0</td>\n",
" <td>277236.0</td>\n",
" <td>439191.1</td>\n",
" <td>...</td>\n",
" <td>318180.0</td>\n",
" <td>850646.0</td>\n",
" <td>595775.0</td>\n",
" <td>345860.0</td>\n",
" <td>358300.0</td>\n",
" <td>989937.54</td>\n",
" <td>1003157.0</td>\n",
" <td>418880.0</td>\n",
" <td>451840.0</td>\n",
" <td>860540.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Distribution</th>\n",
" <td>196860.0</td>\n",
" <td>200762.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>215640.0</td>\n",
" <td>188752.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Energie</th>\n",
" <td>1012510.0</td>\n",
" <td>1075604.0</td>\n",
" <td>1085531.3</td>\n",
" <td>1162849.0</td>\n",
" <td>889496.0</td>\n",
" <td>1215912.0</td>\n",
" <td>1038008.0</td>\n",
" <td>2084619.0</td>\n",
" <td>1396375.0</td>\n",
" <td>1128235.9</td>\n",
" <td>...</td>\n",
" <td>304245.0</td>\n",
" <td>325167.0</td>\n",
" <td>350991.0</td>\n",
" <td>383396.0</td>\n",
" <td>358519.0</td>\n",
" <td>374280.00</td>\n",
" <td>339897.0</td>\n",
" <td>341947.0</td>\n",
" <td>348248.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Equipements &amp; Ressources</th>\n",
" <td>559258.0</td>\n",
" <td>342731.0</td>\n",
" <td>364414.0</td>\n",
" <td>624725.0</td>\n",
" <td>594443.0</td>\n",
" <td>621840.0</td>\n",
" <td>424191.0</td>\n",
" <td>374637.0</td>\n",
" <td>369569.0</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>481430.0</td>\n",
" <td>313892.0</td>\n",
" <td>295546.0</td>\n",
" <td>280927.0</td>\n",
" <td>261876.0</td>\n",
" <td>279545.92</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Médical &amp; Pharma</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>206785.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>156515.9</td>\n",
" <td>...</td>\n",
" <td>498010.0</td>\n",
" <td>287153.0</td>\n",
" <td>300604.0</td>\n",
" <td>328234.0</td>\n",
" <td>320836.0</td>\n",
" <td>313432.46</td>\n",
" <td>337947.0</td>\n",
" <td>357310.0</td>\n",
" <td>347497.0</td>\n",
" <td>375360.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 48 columns</p>\n",
"</div>"
],
"text/plain": [
" 2006-Q1 2006-Q2 2006-Q3 2006-Q4 \\\n",
"Banque & Finance 450641.0 661220.0 698952.0 768041.0 \n",
"Distribution 196860.0 200762.0 NaN NaN \n",
"Energie 1012510.0 1075604.0 1085531.3 1162849.0 \n",
"Equipements & Ressources 559258.0 342731.0 364414.0 624725.0 \n",
"Médical & Pharma NaN NaN 206785.0 NaN \n",
"\n",
" 2007-Q1 2007-Q2 2007-Q3 2007-Q4 \\\n",
"Banque & Finance 705822.0 470666.0 734497.0 338989.0 \n",
"Distribution NaN NaN NaN NaN \n",
"Energie 889496.0 1215912.0 1038008.0 2084619.0 \n",
"Equipements & Ressources 594443.0 621840.0 424191.0 374637.0 \n",
"Médical & Pharma NaN NaN NaN NaN \n",
"\n",
" 2008-Q1 2008-Q2 ... 2015-Q3 2015-Q4 \\\n",
"Banque & Finance 277236.0 439191.1 ... 318180.0 850646.0 \n",
"Distribution 215640.0 188752.0 ... NaN NaN \n",
"Energie 1396375.0 1128235.9 ... 304245.0 325167.0 \n",
"Equipements & Ressources 369569.0 NaN ... 481430.0 313892.0 \n",
"Médical & Pharma NaN 156515.9 ... 498010.0 287153.0 \n",
"\n",
" 2016-Q1 2016-Q2 2016-Q3 2016-Q4 2017-Q1 \\\n",
"Banque & Finance 595775.0 345860.0 358300.0 989937.54 1003157.0 \n",
"Distribution NaN NaN NaN NaN NaN \n",
"Energie 350991.0 383396.0 358519.0 374280.00 339897.0 \n",
"Equipements & Ressources 295546.0 280927.0 261876.0 279545.92 NaN \n",
"Médical & Pharma 300604.0 328234.0 320836.0 313432.46 337947.0 \n",
"\n",
" 2017-Q2 2017-Q3 2017-Q4 \n",
"Banque & Finance 418880.0 451840.0 860540.0 \n",
"Distribution NaN NaN NaN \n",
"Energie 341947.0 348248.0 NaN \n",
"Equipements & Ressources NaN NaN NaN \n",
"Médical & Pharma 357310.0 347497.0 375360.0 \n",
"\n",
"[5 rows x 48 columns]"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sort result by year/quarter/industry\n",
"\n",
"results_by_quarter = {}\n",
"\n",
"# get all results by quarter\n",
"for index, row in final.sort_values(\"Id\").iterrows():\n",
" \n",
" quarter = \"%s-%s\"%(row[\"Year\"], row[\"Quarter\"])\n",
" \n",
" try :\n",
" results_by_quarter[quarter].append((row[\"Primary industry\"], row[\"Market value\"]))\n",
" except : \n",
" results_by_quarter[quarter] = [ (row[\"Primary industry\"], row[\"Market value\"]) ]\n",
"\n",
"# sum up results by industry \n",
"final_results = {}\n",
"\n",
"for q in results_by_quarter:\n",
" q_results = {} \n",
" for res in results_by_quarter[q]:\n",
" if res[1] is not None: \n",
" industry = res[0]\n",
" amount = float(''.join(res[1].split(\",\")))\n",
" \n",
" try : \n",
" q_results[industry] = q_results[industry]+amount\n",
" except KeyError:\n",
" q_results[industry] = amount\n",
" \n",
" final_results[q] = q_results\n",
" # make percent\n",
"# total = sum([q_results[qr] for qr in q_results])\n",
"# final_results[q] = { qr : q_results[qr]/total*100 for qr in q_results}\n",
"\n",
"df = pd.DataFrame(final_results)\n",
"df.to_csv(\"Data - Market capitalization by Industries.csv\")\n",
"print 'results saved.'\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment