clemsos · March 31, 2018 09:28
diff --git a/Data - List of public corporations by market capitalization.ipynb b/Data - List of public corporations by market capitalization.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "import pandas as pd\n",
    "\n",
    "# Clean data\n",
    "#\n",
    "# From https://en.wikipedia.org/wiki/List_of_public_corporations_by_market_capitalization\n",
    "# Data Source : https://docs.google.com/spreadsheets/d/1NxEouZb0AHGZ3OcwUPp8Ga1LmBjp37NABtwdcYSE91I/edit#gid=115560802\n",
    "\n",
    "# Visualization\n",
    "# http://blockbuilder.org/clemsos/4923d7601f5e4b208b740123679047f7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(80, 10)\n"
     ]
    }
   ],
   "source": [
    "# parse fist CSV\n",
    "first = \"raw/Data - List of public corporations by market capitalization - 2007-2006.csv\"\n",
    "\n",
    "df = pd.read_csv(first)\n",
    "df.dropna(how=\"all\", inplace=True) # drop empty lines\n",
    "\n",
    "_id = df[\"Year\"].astype(int).map(str) + \"-\" + df[\"Q\"]+\"-\"+df[\"Rank\"].astype(int).map(str)\n",
    "q = df[\"Name\"].str.split('  ', expand=True)\n",
    "final_first = pd.concat([_id, q, df], axis=1, join='inner')\n",
    "final_first = final_first.drop(['Name'], axis=1)\n",
    "\n",
    "# rename columns\n",
    "final_first.columns.values[0] = 'Id'\n",
    "final_first.columns.values[1] = 'Company'\n",
    "final_first.columns.values[2] = 'Company Link'\n",
    "final_first = final_first.rename(index=str, columns={\"Q\": \"Quarter\"})\n",
    "final_first[\"Headquarters Link\"] = None\n",
    "\n",
    "final_first.head()\n",
    "\n",
    "print final_first.shape\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(100, 10)\n",
      "(100, 11)\n",
      "(100, 11)\n",
      "(100, 11)\n",
      "(100, 11)\n",
      "(400, 11)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>Company</th>\n",
       "      <th>Company Link</th>\n",
       "      <th>Market value</th>\n",
       "      <th>Notes</th>\n",
       "      <th>Year</th>\n",
       "      <th>Quarter</th>\n",
       "      <th>Rank</th>\n",
       "      <th>Headquarters</th>\n",
       "      <th>Headquarters Link</th>\n",
       "      <th>Primary industry</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2017-Q1-3</td>\n",
       "      <td>Microsoft</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/Microsoft)</td>\n",
       "      <td>508,935</td>\n",
       "      <td>None</td>\n",
       "      <td>2017</td>\n",
       "      <td>Q1</td>\n",
       "      <td>3</td>\n",
       "      <td>United_States</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2017-Q1-1</td>\n",
       "      <td>Apple Inc.</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/Apple_Inc.)</td>\n",
       "      <td>753,718</td>\n",
       "      <td>None</td>\n",
       "      <td>2017</td>\n",
       "      <td>Q1</td>\n",
       "      <td>1</td>\n",
       "      <td>United_States</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2017-Q1-4</td>\n",
       "      <td>Amazon.com</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/Amazon.com)</td>\n",
       "      <td>423,031</td>\n",
       "      <td>None</td>\n",
       "      <td>2017</td>\n",
       "      <td>Q1</td>\n",
       "      <td>4</td>\n",
       "      <td>United_States</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2017-Q1-2</td>\n",
       "      <td>Alphabet Inc.</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/Alphabet_Inc.)</td>\n",
       "      <td>573,570 [14]</td>\n",
       "      <td>(#cite_note-ya-15)</td>\n",
       "      <td>2017</td>\n",
       "      <td>Q1</td>\n",
       "      <td>2</td>\n",
       "      <td>United_States</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2017-Q1-5</td>\n",
       "      <td>Berkshire Hathaway</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/Berkshire_Hatha...</td>\n",
       "      <td>410,880 [15]</td>\n",
       "      <td>(#cite_note-yb-16)</td>\n",
       "      <td>2017</td>\n",
       "      <td>Q1</td>\n",
       "      <td>5</td>\n",
       "      <td>United_States</td>\n",
       "      <td>(https://en.wikipedia.org/wiki/United_States)</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          Id             Company  \\\n",
       "0  2017-Q1-3           Microsoft   \n",
       "1  2017-Q1-1          Apple Inc.   \n",
       "2  2017-Q1-4          Amazon.com   \n",
       "3  2017-Q1-2       Alphabet Inc.   \n",
       "4  2017-Q1-5  Berkshire Hathaway   \n",
       "\n",
       "                                        Company Link   Market value  \\\n",
       "0          (https://en.wikipedia.org/wiki/Microsoft)        508,935   \n",
       "1         (https://en.wikipedia.org/wiki/Apple_Inc.)        753,718   \n",
       "2         (https://en.wikipedia.org/wiki/Amazon.com)        423,031   \n",
       "3      (https://en.wikipedia.org/wiki/Alphabet_Inc.)   573,570 [14]   \n",
       "4  (https://en.wikipedia.org/wiki/Berkshire_Hatha...   410,880 [15]   \n",
       "\n",
       "                Notes  Year Quarter  Rank   Headquarters  \\\n",
       "0                None  2017      Q1     3  United_States   \n",
       "1                None  2017      Q1     1  United_States   \n",
       "2                None  2017      Q1     4  United_States   \n",
       "3  (#cite_note-ya-15)  2017      Q1     2  United_States   \n",
       "4  (#cite_note-yb-16)  2017      Q1     5  United_States   \n",
       "\n",
       "                               Headquarters Link Primary industry  \n",
       "0  (https://en.wikipedia.org/wiki/United_States)             None  \n",
       "1  (https://en.wikipedia.org/wiki/United_States)             None  \n",
       "2  (https://en.wikipedia.org/wiki/United_States)             None  \n",
       "3  (https://en.wikipedia.org/wiki/United_States)             None  \n",
       "4  (https://en.wikipedia.org/wiki/United_States)             None  "
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# parse properly second CSV\n",
    "second = \"raw/Data - List of public corporations by market capitalization - 2008-2017.csv\"\n",
    "df = pd.read_csv(second)\n",
    "\n",
    "df.dropna(how=\"all\", inplace=True) # drop empty lines\n",
    "\n",
    "print df.shape\n",
    "\n",
    "col_names = [\n",
    "    ('Q1', 'Q1 (Country)'),\n",
    "    ('Q2', 'Q2 (Country)'),\n",
    "    ('Q3', 'Q3 (Country'),\n",
    "    ('Q4', 'Q4 (Country)')\n",
    "]\n",
    "\n",
    "final_second = pd.DataFrame()\n",
    "\n",
    "for cols in col_names:\n",
    "    \n",
    "    _id = df[\"Year\"].astype(int).map(str) + \"-\" + cols[0]+\"-\"+df[\"Rank\"].astype(int).map(str)\n",
    "    \n",
    "    q = df[cols[0]].str.split('  ', expand=True)\n",
    "    q.columns = [\"Company\", \"Company Link\", \"Market value\", \"Notes\"]\n",
    "    \n",
    "    result = pd.concat([_id, q], axis=1, join='inner')\n",
    "    \n",
    "    result.columns.values[0] = 'Id'\n",
    "    result[\"Year\"] = df[\"Year\"].astype(int)\n",
    "    result[\"Quarter\"]= cols[0] # add quarter\n",
    "    result[\"Rank\"] = df[\"Rank\"].astype(int)\n",
    "    \n",
    "    result[\"Headquarters\"] = df[cols[1]].str[1:-1].str.split('/', expand=True)[4]\n",
    "    result[\"Headquarters Link\"] = df[cols[1]]\n",
    "    \n",
    "    result[\"Primary industry\"] = None\n",
    "    \n",
    "    print result.shape\n",
    "\n",
    "    final_second = final_second.append(result)\n",
    "\n",
    "print final_second.shape\n",
    "final_second.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "44 companies\n",
      "Fetched info about 44 companies.\n"
     ]
    }
   ],
   "source": [
    "# fetch missing values from Wikipedia\n",
    "import os\n",
    "import json\n",
    "import wptools\n",
    "from slugify import slugify\n",
    "\n",
    "# get unique values\n",
    "companies = list(final_first.append(final_second)['Company'].unique())\n",
    "print \"%s companies\"%len(companies)\n",
    "\n",
    "# Count files in folder\n",
    "# print len(os.listdir('./companies'))\n",
    "\n",
    "companies_info = {}\n",
    "\n",
    "for company_name in companies:\n",
    "    \n",
    "    if type(company_name) is str  :\n",
    "        \n",
    "        json_file_name = \"companies/%s.json\"%slugify(company_name.decode('utf-8'))\n",
    "\n",
    "        if not os.path.exists(json_file_name): \n",
    "            print \"fetching %s...\"%company_name\n",
    "            # fetch infobox\n",
    "            page = wptools.page(company_name).get_parse()\n",
    "            infobox = page.data['infobox']\n",
    "\n",
    "            #save infobox as json file\n",
    "\n",
    "            print json_file_name\n",
    "            with open(json_file_name, 'w') as outfile:\n",
    "                json.dump(dict(infobox), outfile)\n",
    "            print \"saved to %s\"%json_file_name\n",
    "            print \n",
    "        else : \n",
    "            with open(json_file_name, 'r') as outfile:\n",
    "                companies_info[company_name] = json.load(outfile)\n",
    "\n",
    "print \"Fetched info about %s companies.\"%len(companies_info.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Industries parsed.\n"
     ]
    }
   ],
   "source": [
    "# sort by categories\n",
    "\n",
    "# Plateformes numériques :  Conglomerate (Alphabet + Tencent)  Internet (alibaba), Facebook, Amazon, Apple (>juillet 2008), Microsoft (>juin 2009)\n",
    "# Technologie de l’information : Cloud computing (IBM), Computer software, Computer hardware, Consumer electronics, Apple (<juillet 2008), Microsoft (<juin 2009)\n",
    "# Banque & finance: Banking, Conglomerate (Berskshire), Financial services\n",
    "# Energie : Energy industry, Oil and gas, \n",
    "# Médical & Pharma : Pharmaceutical industry, Medical equipment\n",
    "# Télécom : Telecommunications\n",
    "# Distribution : Retail, Online shopping, Fast-moving consumer goods  (1 seul : Procter & Gamble)\n",
    "# Equipements et resources : Conglomerate (GE), Automotive industry, Food processing, Metals,\n",
    "\n",
    "\n",
    "# parse industry for each result\n",
    "industries = {}\n",
    "\n",
    "french_names = {\n",
    "    \"Plateformes numériques\" : [\"Internet\", \"Cloud computing\", ],\n",
    "    \"Technologie de l’information\" : [\"Computer software\", \"Computer hardware\", \"Consumer electronics\"],\n",
    "    \"Banque & Finance\" : [\"Bank\", \"Banking\", \"Financial services\"],\n",
    "    \"Energie\" : [\"Energy industry\", \"Oil and gas\", \"Petroleum industry\", \"List of petroleum companies\", \"Oil and gas industry\"],\n",
    "    \"Médical & Pharma\" : [\"Pharmaceutical industry\", \"Medical equipment\"],\n",
    "    \"Distribution\" : [\"Retail\", \"Online shopping\", \"Fast-moving consumer goods\"],\n",
    "    \"Télécom\" : [\"Telecommunication\", \"Telecommunications\"],\n",
    "    \"Equipements & Ressources\" : [\"Automotive industry\", \"Food processing\", \"Metals\"]\n",
    "}\n",
    "\n",
    "translations = {}\n",
    "for fr in french_names: \n",
    "    for name in french_names[fr] :\n",
    "        translations[name] = fr \n",
    "# print translations\n",
    "        \n",
    "for company_name in companies_info :\n",
    "    info = companies_info[company_name]    \n",
    "    s = info[u'industry']\n",
    "    industry = s[s.find(\"[[\")+1:s.find(\"]]\")][1:].split(\"|\")[0]\n",
    "        \n",
    "    if industry == \"Conglomerate  (company)\" or industry == \"Conglomerate (company)\":\n",
    "        if company_name == \"Berkshire Hathaway\":\n",
    "            industries[company_name] = \"Banque & Finance\"\n",
    "        elif company_name == \"General Electric\":\n",
    "            industries[company_name] = \"Equipements & Ressources\"\n",
    "        elif company_name == \"Alphabet Inc.\":\n",
    "            industries[company_name] = \"Plateformes numériques\"\n",
    "        elif company_name == \"Tencent\":\n",
    "            industries[company_name] = \"Plateformes numériques\"         \n",
    "    else :\n",
    "        if company_name == \"Amazon.com\": \n",
    "            industries[company_name] = \"Plateformes numériques\"\n",
    "        else :\n",
    "            industries[company_name] = translations[industry]\n",
    "#         print company_name\n",
    "\n",
    "# print industries.keys()\n",
    "print 'Industries parsed.'\n",
    "# print industries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(480, 11)\n",
      "results saved to \"./final.csv\"\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Company</th>\n",
       "      <th>Company Link</th>\n",
       "      <th>Headquarters</th>\n",
       "      <th>Id</th>\n",
       "      <th>Market value</th>\n",
       "      <th>Primary industry</th>\n",
       "      <th>Quarter</th>\n",
       "      <th>Rank</th>\n",
       "      <th>Year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Petrochina</td>\n",
       "      <td>https://en.wikipedia.org/wiki/Petrochina</td>\n",
       "      <td>China</td>\n",
       "      <td>2007-Q4-1</td>\n",
       "      <td>723,952</td>\n",
       "      <td>Energie</td>\n",
       "      <td>Q4</td>\n",
       "      <td>1</td>\n",
       "      <td>2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Exxon Mobil</td>\n",
       "      <td>https://en.wikipedia.org/wiki/Exxon_Mobil</td>\n",
       "      <td>United States</td>\n",
       "      <td>2007-Q4-2</td>\n",
       "      <td>511,887</td>\n",
       "      <td>Energie</td>\n",
       "      <td>Q4</td>\n",
       "      <td>2</td>\n",
       "      <td>2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>General Electric</td>\n",
       "      <td>https://en.wikipedia.org/wiki/General_Electric</td>\n",
       "      <td>United States</td>\n",
       "      <td>2007-Q4-3</td>\n",
       "      <td>374,637</td>\n",
       "      <td>Equipements &amp; Ressources</td>\n",
       "      <td>Q4</td>\n",
       "      <td>3</td>\n",
       "      <td>2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>China Mobile</td>\n",
       "      <td>https://en.wikipedia.org/wiki/China_Mobile</td>\n",
       "      <td>Hong Kong</td>\n",
       "      <td>2007-Q4-4</td>\n",
       "      <td>354,120</td>\n",
       "      <td>Télécom</td>\n",
       "      <td>Q4</td>\n",
       "      <td>4</td>\n",
       "      <td>2007</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Industrial and Commercial Bank of China</td>\n",
       "      <td>https://en.wikipedia.org/wiki/Industrial_and_C...</td>\n",
       "      <td>China</td>\n",
       "      <td>2007-Q4-5</td>\n",
       "      <td>338,989</td>\n",
       "      <td>Banque &amp; Finance</td>\n",
       "      <td>Q4</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   Company  \\\n",
       "0                               Petrochina   \n",
       "1                              Exxon Mobil   \n",
       "2                         General Electric   \n",
       "3                             China Mobile   \n",
       "4  Industrial and Commercial Bank of China   \n",
       "\n",
       "                                        Company Link   Headquarters  \\\n",
       "0           https://en.wikipedia.org/wiki/Petrochina          China   \n",
       "1          https://en.wikipedia.org/wiki/Exxon_Mobil  United States   \n",
       "2     https://en.wikipedia.org/wiki/General_Electric  United States   \n",
       "3         https://en.wikipedia.org/wiki/China_Mobile      Hong Kong   \n",
       "4  https://en.wikipedia.org/wiki/Industrial_and_C...          China   \n",
       "\n",
       "          Id Market value          Primary industry Quarter  Rank  Year  \n",
       "0  2007-Q4-1      723,952                   Energie      Q4     1  2007  \n",
       "1  2007-Q4-2      511,887                   Energie      Q4     2  2007  \n",
       "2  2007-Q4-3      374,637  Equipements & Ressources      Q4     3  2007  \n",
       "3  2007-Q4-4      354,120                   Télécom      Q4     4  2007  \n",
       "4  2007-Q4-5      338,989          Banque & Finance      Q4     5  2007  "
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# merge both\n",
    "final = final_first.append(final_second)\n",
    "print final.shape\n",
    "\n",
    "# remap all values using updated data\n",
    "final[\"Primary industry\"] = final[\"Company\"]\n",
    "final = final.replace({ \"Primary industry\" : industries })\n",
    "# print final.replace({ \"Primary industry\" : industries })\n",
    "\n",
    "# move Apple and Microsoft after 2016\n",
    "final.ix[ (final['Company'] == \"Apple Inc.\") & (final['Year'] > 2008), \"Primary industry\"] = \"Plateformes numériques\" \n",
    "final.ix[(final['Company'] == \"Microsoft\") & (final['Year'] > 2008), \"Primary industry\"] = \"Plateformes numériques\"\n",
    "\n",
    "# some final clean\n",
    "final = final.drop([\"Notes\", \"Headquarters Link\"], axis=1)\n",
    "final[\"Market value\"] = final[\"Market value\"].str.split('[', expand=True)[0] # remove\n",
    "final[\"Company Link\"] = final[\"Company Link\"].str[1:-1]\n",
    "# final[\"Primary industry\"] = final[\"Primary industry\"].str.split('  ', expand=True)[0] # remove\n",
    "final.head()\n",
    "\n",
    "# save as file\n",
    "final.to_csv(\"Data - List of public corporations by market capitalization - FINAL.csv\", index=False)\n",
    "print 'results saved to \"./final.csv\"'\n",
    "\n",
    "final.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "results saved.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>2006-Q1</th>\n",
       "      <th>2006-Q2</th>\n",
       "      <th>2006-Q3</th>\n",
       "      <th>2006-Q4</th>\n",
       "      <th>2007-Q1</th>\n",
       "      <th>2007-Q2</th>\n",
       "      <th>2007-Q3</th>\n",
       "      <th>2007-Q4</th>\n",
       "      <th>2008-Q1</th>\n",
       "      <th>2008-Q2</th>\n",
       "      <th>...</th>\n",
       "      <th>2015-Q3</th>\n",
       "      <th>2015-Q4</th>\n",
       "      <th>2016-Q1</th>\n",
       "      <th>2016-Q2</th>\n",
       "      <th>2016-Q3</th>\n",
       "      <th>2016-Q4</th>\n",
       "      <th>2017-Q1</th>\n",
       "      <th>2017-Q2</th>\n",
       "      <th>2017-Q3</th>\n",
       "      <th>2017-Q4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Banque &amp; Finance</th>\n",
       "      <td>450641.0</td>\n",
       "      <td>661220.0</td>\n",
       "      <td>698952.0</td>\n",
       "      <td>768041.0</td>\n",
       "      <td>705822.0</td>\n",
       "      <td>470666.0</td>\n",
       "      <td>734497.0</td>\n",
       "      <td>338989.0</td>\n",
       "      <td>277236.0</td>\n",
       "      <td>439191.1</td>\n",
       "      <td>...</td>\n",
       "      <td>318180.0</td>\n",
       "      <td>850646.0</td>\n",
       "      <td>595775.0</td>\n",
       "      <td>345860.0</td>\n",
       "      <td>358300.0</td>\n",
       "      <td>989937.54</td>\n",
       "      <td>1003157.0</td>\n",
       "      <td>418880.0</td>\n",
       "      <td>451840.0</td>\n",
       "      <td>860540.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Distribution</th>\n",
       "      <td>196860.0</td>\n",
       "      <td>200762.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>215640.0</td>\n",
       "      <td>188752.0</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Energie</th>\n",
       "      <td>1012510.0</td>\n",
       "      <td>1075604.0</td>\n",
       "      <td>1085531.3</td>\n",
       "      <td>1162849.0</td>\n",
       "      <td>889496.0</td>\n",
       "      <td>1215912.0</td>\n",
       "      <td>1038008.0</td>\n",
       "      <td>2084619.0</td>\n",
       "      <td>1396375.0</td>\n",
       "      <td>1128235.9</td>\n",
       "      <td>...</td>\n",
       "      <td>304245.0</td>\n",
       "      <td>325167.0</td>\n",
       "      <td>350991.0</td>\n",
       "      <td>383396.0</td>\n",
       "      <td>358519.0</td>\n",
       "      <td>374280.00</td>\n",
       "      <td>339897.0</td>\n",
       "      <td>341947.0</td>\n",
       "      <td>348248.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Equipements &amp; Ressources</th>\n",
       "      <td>559258.0</td>\n",
       "      <td>342731.0</td>\n",
       "      <td>364414.0</td>\n",
       "      <td>624725.0</td>\n",
       "      <td>594443.0</td>\n",
       "      <td>621840.0</td>\n",
       "      <td>424191.0</td>\n",
       "      <td>374637.0</td>\n",
       "      <td>369569.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>481430.0</td>\n",
       "      <td>313892.0</td>\n",
       "      <td>295546.0</td>\n",
       "      <td>280927.0</td>\n",
       "      <td>261876.0</td>\n",
       "      <td>279545.92</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Médical &amp; Pharma</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>206785.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>156515.9</td>\n",
       "      <td>...</td>\n",
       "      <td>498010.0</td>\n",
       "      <td>287153.0</td>\n",
       "      <td>300604.0</td>\n",
       "      <td>328234.0</td>\n",
       "      <td>320836.0</td>\n",
       "      <td>313432.46</td>\n",
       "      <td>337947.0</td>\n",
       "      <td>357310.0</td>\n",
       "      <td>347497.0</td>\n",
       "      <td>375360.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            2006-Q1    2006-Q2    2006-Q3    2006-Q4  \\\n",
       "Banque & Finance           450641.0   661220.0   698952.0   768041.0   \n",
       "Distribution               196860.0   200762.0        NaN        NaN   \n",
       "Energie                   1012510.0  1075604.0  1085531.3  1162849.0   \n",
       "Equipements & Ressources   559258.0   342731.0   364414.0   624725.0   \n",
       "Médical & Pharma                NaN        NaN   206785.0        NaN   \n",
       "\n",
       "                           2007-Q1    2007-Q2    2007-Q3    2007-Q4  \\\n",
       "Banque & Finance          705822.0   470666.0   734497.0   338989.0   \n",
       "Distribution                   NaN        NaN        NaN        NaN   \n",
       "Energie                   889496.0  1215912.0  1038008.0  2084619.0   \n",
       "Equipements & Ressources  594443.0   621840.0   424191.0   374637.0   \n",
       "Médical & Pharma               NaN        NaN        NaN        NaN   \n",
       "\n",
       "                            2008-Q1    2008-Q2    ...      2015-Q3   2015-Q4  \\\n",
       "Banque & Finance           277236.0   439191.1    ...     318180.0  850646.0   \n",
       "Distribution               215640.0   188752.0    ...          NaN       NaN   \n",
       "Energie                   1396375.0  1128235.9    ...     304245.0  325167.0   \n",
       "Equipements & Ressources   369569.0        NaN    ...     481430.0  313892.0   \n",
       "Médical & Pharma                NaN   156515.9    ...     498010.0  287153.0   \n",
       "\n",
       "                           2016-Q1   2016-Q2   2016-Q3    2016-Q4    2017-Q1  \\\n",
       "Banque & Finance          595775.0  345860.0  358300.0  989937.54  1003157.0   \n",
       "Distribution                   NaN       NaN       NaN        NaN        NaN   \n",
       "Energie                   350991.0  383396.0  358519.0  374280.00   339897.0   \n",
       "Equipements & Ressources  295546.0  280927.0  261876.0  279545.92        NaN   \n",
       "Médical & Pharma          300604.0  328234.0  320836.0  313432.46   337947.0   \n",
       "\n",
       "                           2017-Q2   2017-Q3   2017-Q4  \n",
       "Banque & Finance          418880.0  451840.0  860540.0  \n",
       "Distribution                   NaN       NaN       NaN  \n",
       "Energie                   341947.0  348248.0       NaN  \n",
       "Equipements & Ressources       NaN       NaN       NaN  \n",
       "Médical & Pharma          357310.0  347497.0  375360.0  \n",
       "\n",
       "[5 rows x 48 columns]"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort result by year/quarter/industry\n",
    "\n",
    "results_by_quarter = {}\n",
    "\n",
    "# get all results by quarter\n",
    "for index, row in final.sort_values(\"Id\").iterrows():\n",
    "    \n",
    "    quarter = \"%s-%s\"%(row[\"Year\"], row[\"Quarter\"])\n",
    "    \n",
    "    try :\n",
    "        results_by_quarter[quarter].append((row[\"Primary industry\"], row[\"Market value\"]))\n",
    "    except : \n",
    "        results_by_quarter[quarter] = [ (row[\"Primary industry\"], row[\"Market value\"]) ]\n",
    "\n",
    "# sum up results by industry        \n",
    "final_results = {}\n",
    "\n",
    "for q in results_by_quarter:\n",
    "    q_results = {} \n",
    "    for res in results_by_quarter[q]:\n",
    "        if res[1] is not None: \n",
    "            industry = res[0]\n",
    "            amount = float(''.join(res[1].split(\",\")))\n",
    "            \n",
    "            try : \n",
    "                q_results[industry] = q_results[industry]+amount\n",
    "            except KeyError:\n",
    "                q_results[industry] = amount\n",
    "    \n",
    "    final_results[q] = q_results\n",
    "    # make percent\n",
    "#     total = sum([q_results[qr] for qr in q_results])\n",
    "#     final_results[q] = { qr : q_results[qr]/total*100 for qr in q_results}\n",
    "\n",
    "df = pd.DataFrame(final_results)\n",
    "df.to_csv(\"Data - Market capitalization by Industries.csv\")\n",
    "print 'results saved.'\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }