Last active
March 31, 2018 09:28
-
-
Save clemsos/de225e4b683a4643d01e8f50f5fa050b to your computer and use it in GitHub Desktop.
Data Parser - List of public corporations by market capitalization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"import pandas as pd\n", | |
"\n", | |
"# Clean data\n", | |
"#\n", | |
"# From https://en.wikipedia.org/wiki/List_of_public_corporations_by_market_capitalization\n", | |
"# Data Source : https://docs.google.com/spreadsheets/d/1NxEouZb0AHGZ3OcwUPp8Ga1LmBjp37NABtwdcYSE91I/edit#gid=115560802\n", | |
"\n", | |
"# Visualization\n", | |
"# http://blockbuilder.org/clemsos/4923d7601f5e4b208b740123679047f7" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(80, 10)\n" | |
] | |
} | |
], | |
"source": [ | |
"# parse fist CSV\n", | |
"first = \"raw/Data - List of public corporations by market capitalization - 2007-2006.csv\"\n", | |
"\n", | |
"df = pd.read_csv(first)\n", | |
"df.dropna(how=\"all\", inplace=True) # drop empty lines\n", | |
"\n", | |
"_id = df[\"Year\"].astype(int).map(str) + \"-\" + df[\"Q\"]+\"-\"+df[\"Rank\"].astype(int).map(str)\n", | |
"q = df[\"Name\"].str.split(' ', expand=True)\n", | |
"final_first = pd.concat([_id, q, df], axis=1, join='inner')\n", | |
"final_first = final_first.drop(['Name'], axis=1)\n", | |
"\n", | |
"# rename columns\n", | |
"final_first.columns.values[0] = 'Id'\n", | |
"final_first.columns.values[1] = 'Company'\n", | |
"final_first.columns.values[2] = 'Company Link'\n", | |
"final_first = final_first.rename(index=str, columns={\"Q\": \"Quarter\"})\n", | |
"final_first[\"Headquarters Link\"] = None\n", | |
"\n", | |
"final_first.head()\n", | |
"\n", | |
"print final_first.shape\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(100, 10)\n", | |
"(100, 11)\n", | |
"(100, 11)\n", | |
"(100, 11)\n", | |
"(100, 11)\n", | |
"(400, 11)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Id</th>\n", | |
" <th>Company</th>\n", | |
" <th>Company Link</th>\n", | |
" <th>Market value</th>\n", | |
" <th>Notes</th>\n", | |
" <th>Year</th>\n", | |
" <th>Quarter</th>\n", | |
" <th>Rank</th>\n", | |
" <th>Headquarters</th>\n", | |
" <th>Headquarters Link</th>\n", | |
" <th>Primary industry</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2017-Q1-3</td>\n", | |
" <td>Microsoft</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/Microsoft)</td>\n", | |
" <td>508,935</td>\n", | |
" <td>None</td>\n", | |
" <td>2017</td>\n", | |
" <td>Q1</td>\n", | |
" <td>3</td>\n", | |
" <td>United_States</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2017-Q1-1</td>\n", | |
" <td>Apple Inc.</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/Apple_Inc.)</td>\n", | |
" <td>753,718</td>\n", | |
" <td>None</td>\n", | |
" <td>2017</td>\n", | |
" <td>Q1</td>\n", | |
" <td>1</td>\n", | |
" <td>United_States</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2017-Q1-4</td>\n", | |
" <td>Amazon.com</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/Amazon.com)</td>\n", | |
" <td>423,031</td>\n", | |
" <td>None</td>\n", | |
" <td>2017</td>\n", | |
" <td>Q1</td>\n", | |
" <td>4</td>\n", | |
" <td>United_States</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2017-Q1-2</td>\n", | |
" <td>Alphabet Inc.</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/Alphabet_Inc.)</td>\n", | |
" <td>573,570 [14]</td>\n", | |
" <td>(#cite_note-ya-15)</td>\n", | |
" <td>2017</td>\n", | |
" <td>Q1</td>\n", | |
" <td>2</td>\n", | |
" <td>United_States</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2017-Q1-5</td>\n", | |
" <td>Berkshire Hathaway</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/Berkshire_Hatha...</td>\n", | |
" <td>410,880 [15]</td>\n", | |
" <td>(#cite_note-yb-16)</td>\n", | |
" <td>2017</td>\n", | |
" <td>Q1</td>\n", | |
" <td>5</td>\n", | |
" <td>United_States</td>\n", | |
" <td>(https://en.wikipedia.org/wiki/United_States)</td>\n", | |
" <td>None</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Id Company \\\n", | |
"0 2017-Q1-3 Microsoft \n", | |
"1 2017-Q1-1 Apple Inc. \n", | |
"2 2017-Q1-4 Amazon.com \n", | |
"3 2017-Q1-2 Alphabet Inc. \n", | |
"4 2017-Q1-5 Berkshire Hathaway \n", | |
"\n", | |
" Company Link Market value \\\n", | |
"0 (https://en.wikipedia.org/wiki/Microsoft) 508,935 \n", | |
"1 (https://en.wikipedia.org/wiki/Apple_Inc.) 753,718 \n", | |
"2 (https://en.wikipedia.org/wiki/Amazon.com) 423,031 \n", | |
"3 (https://en.wikipedia.org/wiki/Alphabet_Inc.) 573,570 [14] \n", | |
"4 (https://en.wikipedia.org/wiki/Berkshire_Hatha... 410,880 [15] \n", | |
"\n", | |
" Notes Year Quarter Rank Headquarters \\\n", | |
"0 None 2017 Q1 3 United_States \n", | |
"1 None 2017 Q1 1 United_States \n", | |
"2 None 2017 Q1 4 United_States \n", | |
"3 (#cite_note-ya-15) 2017 Q1 2 United_States \n", | |
"4 (#cite_note-yb-16) 2017 Q1 5 United_States \n", | |
"\n", | |
" Headquarters Link Primary industry \n", | |
"0 (https://en.wikipedia.org/wiki/United_States) None \n", | |
"1 (https://en.wikipedia.org/wiki/United_States) None \n", | |
"2 (https://en.wikipedia.org/wiki/United_States) None \n", | |
"3 (https://en.wikipedia.org/wiki/United_States) None \n", | |
"4 (https://en.wikipedia.org/wiki/United_States) None " | |
] | |
}, | |
"execution_count": 75, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# parse properly second CSV\n", | |
"second = \"raw/Data - List of public corporations by market capitalization - 2008-2017.csv\"\n", | |
"df = pd.read_csv(second)\n", | |
"\n", | |
"df.dropna(how=\"all\", inplace=True) # drop empty lines\n", | |
"\n", | |
"print df.shape\n", | |
"\n", | |
"col_names = [\n", | |
" ('Q1', 'Q1 (Country)'),\n", | |
" ('Q2', 'Q2 (Country)'),\n", | |
" ('Q3', 'Q3 (Country'),\n", | |
" ('Q4', 'Q4 (Country)')\n", | |
"]\n", | |
"\n", | |
"final_second = pd.DataFrame()\n", | |
"\n", | |
"for cols in col_names:\n", | |
" \n", | |
" _id = df[\"Year\"].astype(int).map(str) + \"-\" + cols[0]+\"-\"+df[\"Rank\"].astype(int).map(str)\n", | |
" \n", | |
" q = df[cols[0]].str.split(' ', expand=True)\n", | |
" q.columns = [\"Company\", \"Company Link\", \"Market value\", \"Notes\"]\n", | |
" \n", | |
" result = pd.concat([_id, q], axis=1, join='inner')\n", | |
" \n", | |
" result.columns.values[0] = 'Id'\n", | |
" result[\"Year\"] = df[\"Year\"].astype(int)\n", | |
" result[\"Quarter\"]= cols[0] # add quarter\n", | |
" result[\"Rank\"] = df[\"Rank\"].astype(int)\n", | |
" \n", | |
" result[\"Headquarters\"] = df[cols[1]].str[1:-1].str.split('/', expand=True)[4]\n", | |
" result[\"Headquarters Link\"] = df[cols[1]]\n", | |
" \n", | |
" result[\"Primary industry\"] = None\n", | |
" \n", | |
" print result.shape\n", | |
"\n", | |
" final_second = final_second.append(result)\n", | |
"\n", | |
"print final_second.shape\n", | |
"final_second.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"44 companies\n", | |
"Fetched info about 44 companies.\n" | |
] | |
} | |
], | |
"source": [ | |
"# fetch missing values from Wikipedia\n", | |
"import os\n", | |
"import json\n", | |
"import wptools\n", | |
"from slugify import slugify\n", | |
"\n", | |
"# get unique values\n", | |
"companies = list(final_first.append(final_second)['Company'].unique())\n", | |
"print \"%s companies\"%len(companies)\n", | |
"\n", | |
"# Count files in folder\n", | |
"# print len(os.listdir('./companies'))\n", | |
"\n", | |
"companies_info = {}\n", | |
"\n", | |
"for company_name in companies:\n", | |
" \n", | |
" if type(company_name) is str :\n", | |
" \n", | |
" json_file_name = \"companies/%s.json\"%slugify(company_name.decode('utf-8'))\n", | |
"\n", | |
" if not os.path.exists(json_file_name): \n", | |
" print \"fetching %s...\"%company_name\n", | |
" # fetch infobox\n", | |
" page = wptools.page(company_name).get_parse()\n", | |
" infobox = page.data['infobox']\n", | |
"\n", | |
" #save infobox as json file\n", | |
"\n", | |
" print json_file_name\n", | |
" with open(json_file_name, 'w') as outfile:\n", | |
" json.dump(dict(infobox), outfile)\n", | |
" print \"saved to %s\"%json_file_name\n", | |
" print \n", | |
" else : \n", | |
" with open(json_file_name, 'r') as outfile:\n", | |
" companies_info[company_name] = json.load(outfile)\n", | |
"\n", | |
"print \"Fetched info about %s companies.\"%len(companies_info.keys())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Industries parsed.\n" | |
] | |
} | |
], | |
"source": [ | |
"# sort by categories\n", | |
"\n", | |
"# Plateformes numériques : Conglomerate (Alphabet + Tencent) Internet (alibaba), Facebook, Amazon, Apple (>juillet 2008), Microsoft (>juin 2009)\n", | |
"# Technologie de l’information : Cloud computing (IBM), Computer software, Computer hardware, Consumer electronics, Apple (<juillet 2008), Microsoft (<juin 2009)\n", | |
"# Banque & finance: Banking, Conglomerate (Berskshire), Financial services\n", | |
"# Energie : Energy industry, Oil and gas, \n", | |
"# Médical & Pharma : Pharmaceutical industry, Medical equipment\n", | |
"# Télécom : Telecommunications\n", | |
"# Distribution : Retail, Online shopping, Fast-moving consumer goods (1 seul : Procter & Gamble)\n", | |
"# Equipements et resources : Conglomerate (GE), Automotive industry, Food processing, Metals,\n", | |
"\n", | |
"\n", | |
"# parse industry for each result\n", | |
"industries = {}\n", | |
"\n", | |
"french_names = {\n", | |
" \"Plateformes numériques\" : [\"Internet\", \"Cloud computing\", ],\n", | |
" \"Technologie de l’information\" : [\"Computer software\", \"Computer hardware\", \"Consumer electronics\"],\n", | |
" \"Banque & Finance\" : [\"Bank\", \"Banking\", \"Financial services\"],\n", | |
" \"Energie\" : [\"Energy industry\", \"Oil and gas\", \"Petroleum industry\", \"List of petroleum companies\", \"Oil and gas industry\"],\n", | |
" \"Médical & Pharma\" : [\"Pharmaceutical industry\", \"Medical equipment\"],\n", | |
" \"Distribution\" : [\"Retail\", \"Online shopping\", \"Fast-moving consumer goods\"],\n", | |
" \"Télécom\" : [\"Telecommunication\", \"Telecommunications\"],\n", | |
" \"Equipements & Ressources\" : [\"Automotive industry\", \"Food processing\", \"Metals\"]\n", | |
"}\n", | |
"\n", | |
"translations = {}\n", | |
"for fr in french_names: \n", | |
" for name in french_names[fr] :\n", | |
" translations[name] = fr \n", | |
"# print translations\n", | |
" \n", | |
"for company_name in companies_info :\n", | |
" info = companies_info[company_name] \n", | |
" s = info[u'industry']\n", | |
" industry = s[s.find(\"[[\")+1:s.find(\"]]\")][1:].split(\"|\")[0]\n", | |
" \n", | |
" if industry == \"Conglomerate (company)\" or industry == \"Conglomerate (company)\":\n", | |
" if company_name == \"Berkshire Hathaway\":\n", | |
" industries[company_name] = \"Banque & Finance\"\n", | |
" elif company_name == \"General Electric\":\n", | |
" industries[company_name] = \"Equipements & Ressources\"\n", | |
" elif company_name == \"Alphabet Inc.\":\n", | |
" industries[company_name] = \"Plateformes numériques\"\n", | |
" elif company_name == \"Tencent\":\n", | |
" industries[company_name] = \"Plateformes numériques\" \n", | |
" else :\n", | |
" if company_name == \"Amazon.com\": \n", | |
" industries[company_name] = \"Plateformes numériques\"\n", | |
" else :\n", | |
" industries[company_name] = translations[industry]\n", | |
"# print company_name\n", | |
"\n", | |
"# print industries.keys()\n", | |
"print 'Industries parsed.'\n", | |
"# print industries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 78, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(480, 11)\n", | |
"results saved to \"./final.csv\"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Company</th>\n", | |
" <th>Company Link</th>\n", | |
" <th>Headquarters</th>\n", | |
" <th>Id</th>\n", | |
" <th>Market value</th>\n", | |
" <th>Primary industry</th>\n", | |
" <th>Quarter</th>\n", | |
" <th>Rank</th>\n", | |
" <th>Year</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Petrochina</td>\n", | |
" <td>https://en.wikipedia.org/wiki/Petrochina</td>\n", | |
" <td>China</td>\n", | |
" <td>2007-Q4-1</td>\n", | |
" <td>723,952</td>\n", | |
" <td>Energie</td>\n", | |
" <td>Q4</td>\n", | |
" <td>1</td>\n", | |
" <td>2007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Exxon Mobil</td>\n", | |
" <td>https://en.wikipedia.org/wiki/Exxon_Mobil</td>\n", | |
" <td>United States</td>\n", | |
" <td>2007-Q4-2</td>\n", | |
" <td>511,887</td>\n", | |
" <td>Energie</td>\n", | |
" <td>Q4</td>\n", | |
" <td>2</td>\n", | |
" <td>2007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>General Electric</td>\n", | |
" <td>https://en.wikipedia.org/wiki/General_Electric</td>\n", | |
" <td>United States</td>\n", | |
" <td>2007-Q4-3</td>\n", | |
" <td>374,637</td>\n", | |
" <td>Equipements & Ressources</td>\n", | |
" <td>Q4</td>\n", | |
" <td>3</td>\n", | |
" <td>2007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>China Mobile</td>\n", | |
" <td>https://en.wikipedia.org/wiki/China_Mobile</td>\n", | |
" <td>Hong Kong</td>\n", | |
" <td>2007-Q4-4</td>\n", | |
" <td>354,120</td>\n", | |
" <td>Télécom</td>\n", | |
" <td>Q4</td>\n", | |
" <td>4</td>\n", | |
" <td>2007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Industrial and Commercial Bank of China</td>\n", | |
" <td>https://en.wikipedia.org/wiki/Industrial_and_C...</td>\n", | |
" <td>China</td>\n", | |
" <td>2007-Q4-5</td>\n", | |
" <td>338,989</td>\n", | |
" <td>Banque & Finance</td>\n", | |
" <td>Q4</td>\n", | |
" <td>5</td>\n", | |
" <td>2007</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Company \\\n", | |
"0 Petrochina \n", | |
"1 Exxon Mobil \n", | |
"2 General Electric \n", | |
"3 China Mobile \n", | |
"4 Industrial and Commercial Bank of China \n", | |
"\n", | |
" Company Link Headquarters \\\n", | |
"0 https://en.wikipedia.org/wiki/Petrochina China \n", | |
"1 https://en.wikipedia.org/wiki/Exxon_Mobil United States \n", | |
"2 https://en.wikipedia.org/wiki/General_Electric United States \n", | |
"3 https://en.wikipedia.org/wiki/China_Mobile Hong Kong \n", | |
"4 https://en.wikipedia.org/wiki/Industrial_and_C... China \n", | |
"\n", | |
" Id Market value Primary industry Quarter Rank Year \n", | |
"0 2007-Q4-1 723,952 Energie Q4 1 2007 \n", | |
"1 2007-Q4-2 511,887 Energie Q4 2 2007 \n", | |
"2 2007-Q4-3 374,637 Equipements & Ressources Q4 3 2007 \n", | |
"3 2007-Q4-4 354,120 Télécom Q4 4 2007 \n", | |
"4 2007-Q4-5 338,989 Banque & Finance Q4 5 2007 " | |
] | |
}, | |
"execution_count": 78, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# merge both\n", | |
"final = final_first.append(final_second)\n", | |
"print final.shape\n", | |
"\n", | |
"# remap all values using updated data\n", | |
"final[\"Primary industry\"] = final[\"Company\"]\n", | |
"final = final.replace({ \"Primary industry\" : industries })\n", | |
"# print final.replace({ \"Primary industry\" : industries })\n", | |
"\n", | |
"# move Apple and Microsoft after 2016\n", | |
"final.ix[ (final['Company'] == \"Apple Inc.\") & (final['Year'] > 2008), \"Primary industry\"] = \"Plateformes numériques\" \n", | |
"final.ix[(final['Company'] == \"Microsoft\") & (final['Year'] > 2008), \"Primary industry\"] = \"Plateformes numériques\"\n", | |
"\n", | |
"# some final clean\n", | |
"final = final.drop([\"Notes\", \"Headquarters Link\"], axis=1)\n", | |
"final[\"Market value\"] = final[\"Market value\"].str.split('[', expand=True)[0] # remove\n", | |
"final[\"Company Link\"] = final[\"Company Link\"].str[1:-1]\n", | |
"# final[\"Primary industry\"] = final[\"Primary industry\"].str.split(' ', expand=True)[0] # remove\n", | |
"final.head()\n", | |
"\n", | |
"# save as file\n", | |
"final.to_csv(\"Data - List of public corporations by market capitalization - FINAL.csv\", index=False)\n", | |
"print 'results saved to \"./final.csv\"'\n", | |
"\n", | |
"final.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 79, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"results saved.\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>2006-Q1</th>\n", | |
" <th>2006-Q2</th>\n", | |
" <th>2006-Q3</th>\n", | |
" <th>2006-Q4</th>\n", | |
" <th>2007-Q1</th>\n", | |
" <th>2007-Q2</th>\n", | |
" <th>2007-Q3</th>\n", | |
" <th>2007-Q4</th>\n", | |
" <th>2008-Q1</th>\n", | |
" <th>2008-Q2</th>\n", | |
" <th>...</th>\n", | |
" <th>2015-Q3</th>\n", | |
" <th>2015-Q4</th>\n", | |
" <th>2016-Q1</th>\n", | |
" <th>2016-Q2</th>\n", | |
" <th>2016-Q3</th>\n", | |
" <th>2016-Q4</th>\n", | |
" <th>2017-Q1</th>\n", | |
" <th>2017-Q2</th>\n", | |
" <th>2017-Q3</th>\n", | |
" <th>2017-Q4</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>Banque & Finance</th>\n", | |
" <td>450641.0</td>\n", | |
" <td>661220.0</td>\n", | |
" <td>698952.0</td>\n", | |
" <td>768041.0</td>\n", | |
" <td>705822.0</td>\n", | |
" <td>470666.0</td>\n", | |
" <td>734497.0</td>\n", | |
" <td>338989.0</td>\n", | |
" <td>277236.0</td>\n", | |
" <td>439191.1</td>\n", | |
" <td>...</td>\n", | |
" <td>318180.0</td>\n", | |
" <td>850646.0</td>\n", | |
" <td>595775.0</td>\n", | |
" <td>345860.0</td>\n", | |
" <td>358300.0</td>\n", | |
" <td>989937.54</td>\n", | |
" <td>1003157.0</td>\n", | |
" <td>418880.0</td>\n", | |
" <td>451840.0</td>\n", | |
" <td>860540.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Distribution</th>\n", | |
" <td>196860.0</td>\n", | |
" <td>200762.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>215640.0</td>\n", | |
" <td>188752.0</td>\n", | |
" <td>...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Energie</th>\n", | |
" <td>1012510.0</td>\n", | |
" <td>1075604.0</td>\n", | |
" <td>1085531.3</td>\n", | |
" <td>1162849.0</td>\n", | |
" <td>889496.0</td>\n", | |
" <td>1215912.0</td>\n", | |
" <td>1038008.0</td>\n", | |
" <td>2084619.0</td>\n", | |
" <td>1396375.0</td>\n", | |
" <td>1128235.9</td>\n", | |
" <td>...</td>\n", | |
" <td>304245.0</td>\n", | |
" <td>325167.0</td>\n", | |
" <td>350991.0</td>\n", | |
" <td>383396.0</td>\n", | |
" <td>358519.0</td>\n", | |
" <td>374280.00</td>\n", | |
" <td>339897.0</td>\n", | |
" <td>341947.0</td>\n", | |
" <td>348248.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Equipements & Ressources</th>\n", | |
" <td>559258.0</td>\n", | |
" <td>342731.0</td>\n", | |
" <td>364414.0</td>\n", | |
" <td>624725.0</td>\n", | |
" <td>594443.0</td>\n", | |
" <td>621840.0</td>\n", | |
" <td>424191.0</td>\n", | |
" <td>374637.0</td>\n", | |
" <td>369569.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>...</td>\n", | |
" <td>481430.0</td>\n", | |
" <td>313892.0</td>\n", | |
" <td>295546.0</td>\n", | |
" <td>280927.0</td>\n", | |
" <td>261876.0</td>\n", | |
" <td>279545.92</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Médical & Pharma</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>206785.0</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>156515.9</td>\n", | |
" <td>...</td>\n", | |
" <td>498010.0</td>\n", | |
" <td>287153.0</td>\n", | |
" <td>300604.0</td>\n", | |
" <td>328234.0</td>\n", | |
" <td>320836.0</td>\n", | |
" <td>313432.46</td>\n", | |
" <td>337947.0</td>\n", | |
" <td>357310.0</td>\n", | |
" <td>347497.0</td>\n", | |
" <td>375360.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 48 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 2006-Q1 2006-Q2 2006-Q3 2006-Q4 \\\n", | |
"Banque & Finance 450641.0 661220.0 698952.0 768041.0 \n", | |
"Distribution 196860.0 200762.0 NaN NaN \n", | |
"Energie 1012510.0 1075604.0 1085531.3 1162849.0 \n", | |
"Equipements & Ressources 559258.0 342731.0 364414.0 624725.0 \n", | |
"Médical & Pharma NaN NaN 206785.0 NaN \n", | |
"\n", | |
" 2007-Q1 2007-Q2 2007-Q3 2007-Q4 \\\n", | |
"Banque & Finance 705822.0 470666.0 734497.0 338989.0 \n", | |
"Distribution NaN NaN NaN NaN \n", | |
"Energie 889496.0 1215912.0 1038008.0 2084619.0 \n", | |
"Equipements & Ressources 594443.0 621840.0 424191.0 374637.0 \n", | |
"Médical & Pharma NaN NaN NaN NaN \n", | |
"\n", | |
" 2008-Q1 2008-Q2 ... 2015-Q3 2015-Q4 \\\n", | |
"Banque & Finance 277236.0 439191.1 ... 318180.0 850646.0 \n", | |
"Distribution 215640.0 188752.0 ... NaN NaN \n", | |
"Energie 1396375.0 1128235.9 ... 304245.0 325167.0 \n", | |
"Equipements & Ressources 369569.0 NaN ... 481430.0 313892.0 \n", | |
"Médical & Pharma NaN 156515.9 ... 498010.0 287153.0 \n", | |
"\n", | |
" 2016-Q1 2016-Q2 2016-Q3 2016-Q4 2017-Q1 \\\n", | |
"Banque & Finance 595775.0 345860.0 358300.0 989937.54 1003157.0 \n", | |
"Distribution NaN NaN NaN NaN NaN \n", | |
"Energie 350991.0 383396.0 358519.0 374280.00 339897.0 \n", | |
"Equipements & Ressources 295546.0 280927.0 261876.0 279545.92 NaN \n", | |
"Médical & Pharma 300604.0 328234.0 320836.0 313432.46 337947.0 \n", | |
"\n", | |
" 2017-Q2 2017-Q3 2017-Q4 \n", | |
"Banque & Finance 418880.0 451840.0 860540.0 \n", | |
"Distribution NaN NaN NaN \n", | |
"Energie 341947.0 348248.0 NaN \n", | |
"Equipements & Ressources NaN NaN NaN \n", | |
"Médical & Pharma 357310.0 347497.0 375360.0 \n", | |
"\n", | |
"[5 rows x 48 columns]" | |
] | |
}, | |
"execution_count": 79, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# sort result by year/quarter/industry\n", | |
"\n", | |
"results_by_quarter = {}\n", | |
"\n", | |
"# get all results by quarter\n", | |
"for index, row in final.sort_values(\"Id\").iterrows():\n", | |
" \n", | |
" quarter = \"%s-%s\"%(row[\"Year\"], row[\"Quarter\"])\n", | |
" \n", | |
" try :\n", | |
" results_by_quarter[quarter].append((row[\"Primary industry\"], row[\"Market value\"]))\n", | |
" except : \n", | |
" results_by_quarter[quarter] = [ (row[\"Primary industry\"], row[\"Market value\"]) ]\n", | |
"\n", | |
"# sum up results by industry \n", | |
"final_results = {}\n", | |
"\n", | |
"for q in results_by_quarter:\n", | |
" q_results = {} \n", | |
" for res in results_by_quarter[q]:\n", | |
" if res[1] is not None: \n", | |
" industry = res[0]\n", | |
" amount = float(''.join(res[1].split(\",\")))\n", | |
" \n", | |
" try : \n", | |
" q_results[industry] = q_results[industry]+amount\n", | |
" except KeyError:\n", | |
" q_results[industry] = amount\n", | |
" \n", | |
" final_results[q] = q_results\n", | |
" # make percent\n", | |
"# total = sum([q_results[qr] for qr in q_results])\n", | |
"# final_results[q] = { qr : q_results[qr]/total*100 for qr in q_results}\n", | |
"\n", | |
"df = pd.DataFrame(final_results)\n", | |
"df.to_csv(\"Data - Market capitalization by Industries.csv\")\n", | |
"print 'results saved.'\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment