jrjames83 · February 21, 2019 00:58
diff --git a/Python KW Grouping Strategies.ipynb b/Python KW Grouping Strategies.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "rows = []\n",
    "\n",
    "# Parse the tab-separated bigram file into {\"kw\", \"freq\"} records.\n",
    "with open('/Users/jeff/Downloads/count_2w.txt', 'r') as f:\n",
    "    # Iterate lazily instead of reading the whole file and splitting it.\n",
    "    for line in f:\n",
    "        fields = line.rstrip(\"\\n\").split(\"\\t\")\n",
    "        # Skip only blank or malformed lines (anything that is not exactly\n",
    "        # keyword, tab, frequency) instead of swallowing every exception.\n",
    "        if len(fields) != 2:\n",
    "            continue\n",
    "        keyword, frequency = fields\n",
    "        rows.append({\n",
    "            \"kw\": keyword,\n",
    "            \"freq\": frequency  # still a string here; cast with int() where needed\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per parsed line; columns come from the record keys ('kw', 'freq').\n",
    "df = pd.DataFrame.from_records(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>freq</th>\n",
       "      <th>kw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>523545</td>\n",
       "      <td>0Uplink verified</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>116103</td>\n",
       "      <td>0km to</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>939476</td>\n",
       "      <td>1000s of</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>539389</td>\n",
       "      <td>100s of</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>158621</td>\n",
       "      <td>100th anniversary</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     freq                 kw\n",
       "0  523545   0Uplink verified\n",
       "1  116103             0km to\n",
       "2  939476           1000s of\n",
       "3  539389            100s of\n",
       "4  158621  100th anniversary"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>freq</th>\n",
       "      <th>kw</th>\n",
       "      <th>first_term</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>523545</td>\n",
       "      <td>0Uplink verified</td>\n",
       "      <td>0Uplink</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>116103</td>\n",
       "      <td>0km to</td>\n",
       "      <td>0km</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>939476</td>\n",
       "      <td>1000s of</td>\n",
       "      <td>1000s</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>539389</td>\n",
       "      <td>100s of</td>\n",
       "      <td>100s</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>158621</td>\n",
       "      <td>100th anniversary</td>\n",
       "      <td>100th</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     freq                 kw first_term\n",
       "0  523545   0Uplink verified    0Uplink\n",
       "1  116103             0km to        0km\n",
       "2  939476           1000s of      1000s\n",
       "3  539389            100s of       100s\n",
       "4  158621  100th anniversary      100th"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# First whitespace-delimited token of each keyword phrase.\n",
    "df['first_term'] = [phrase.split()[0] for phrase in df['kw']]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every keyword phrase that shares a first term into one list per group;\n",
    "# the result is a one-column frame ('kw') indexed by first_term.\n",
    "out = df.groupby('first_term')['kw'].apply(list).to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>kw</th>\n",
       "      <th>nbr_assoc_kws</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>first_term</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0Uplink</th>\n",
       "      <td>[0Uplink verified]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0km</th>\n",
       "      <td>[0km to]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1000s</th>\n",
       "      <td>[1000s of]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100s</th>\n",
       "      <td>[100s of]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100th</th>\n",
       "      <td>[100th anniversary]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             kw  nbr_assoc_kws\n",
       "first_term                                    \n",
       "0Uplink      [0Uplink verified]              1\n",
       "0km                    [0km to]              1\n",
       "1000s                [1000s of]              1\n",
       "100s                  [100s of]              1\n",
       "100th       [100th anniversary]              1"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# How many keyword phrases share each first term?\n",
    "out['nbr_assoc_kws'] = out['kw'].map(len)\n",
    "out.head()\n",
    "# Terms with the most variants having that root?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>kw</th>\n",
       "      <th>nbr_assoc_kws</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>first_term</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>basketball</th>\n",
       "      <td>[basketball and, basketball betting, basketbal...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <td>[Index and, Index by, Index data, Index for, I...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>raw</th>\n",
       "      <td>[raw and, raw data, raw food, raw material, ra...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>touched</th>\n",
       "      <td>[touched a, touched by, touched down, touched ...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bars</th>\n",
       "      <td>[bars and, bars are, bars for, bars in, bars o...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bangbus</th>\n",
       "      <td>[bangbus bangbus, bangbus big, bangbus gay, ba...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Institute</th>\n",
       "      <td>[Institute and, Institute at, Institute for, I...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ireland</th>\n",
       "      <td>[Ireland and, Ireland are, Ireland by, Ireland...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bag</th>\n",
       "      <td>[bag and, bag for, bag from, bag in, bag is, b...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>respondents</th>\n",
       "      <td>[respondents and, respondents are, respondents...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                            kw  nbr_assoc_kws\n",
       "first_term                                                                   \n",
       "basketball   [basketball and, basketball betting, basketbal...             11\n",
       "Index        [Index and, Index by, Index data, Index for, I...             11\n",
       "raw          [raw and, raw data, raw food, raw material, ra...             11\n",
       "touched      [touched a, touched by, touched down, touched ...             11\n",
       "bars         [bars and, bars are, bars for, bars in, bars o...             11\n",
       "bangbus      [bangbus bangbus, bangbus big, bangbus gay, ba...             11\n",
       "Institute    [Institute and, Institute at, Institute for, I...             11\n",
       "Ireland      [Ireland and, Ireland are, Ireland by, Ireland...             11\n",
       "bag          [bag and, bag for, bag from, bag in, bag is, b...             11\n",
       "respondents  [respondents and, respondents are, respondents...             11"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Groups with more than 10 associated keyword phrases.\n",
    "# NOTE(review): a descending sort followed by tail(10) shows the groups with the\n",
    "# FEWEST phrases above the cutoff; switch to head(10) if the largest groups are wanted.\n",
    "out.query('nbr_assoc_kws > 10')\\\n",
    "    .sort_values(by='nbr_assoc_kws', ascending=False).tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup table from keyword phrase to its frequency value, used to work out\n",
    "# the frequency total of each first_term group.\n",
    "kw_freq_dict = dict(zip(df.kw, df.freq))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _lookup_volumes(phrases):\n",
    "    \"\"\"Integer frequency of every phrase in one group, in list order.\"\"\"\n",
    "    return [int(kw_freq_dict.get(phrase)) for phrase in phrases]\n",
    "\n",
    "\n",
    "out['term_volumes'] = out['kw'].map(_lookup_volumes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>kw</th>\n",
       "      <th>nbr_assoc_kws</th>\n",
       "      <th>term_volumes</th>\n",
       "      <th>term_volume_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>first_term</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0Uplink</th>\n",
       "      <td>[0Uplink verified]</td>\n",
       "      <td>1</td>\n",
       "      <td>[523545]</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0km</th>\n",
       "      <td>[0km to]</td>\n",
       "      <td>1</td>\n",
       "      <td>[116103]</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1000s</th>\n",
       "      <td>[1000s of]</td>\n",
       "      <td>1</td>\n",
       "      <td>[939476]</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100s</th>\n",
       "      <td>[100s of]</td>\n",
       "      <td>1</td>\n",
       "      <td>[539389]</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100th</th>\n",
       "      <td>[100th anniversary]</td>\n",
       "      <td>1</td>\n",
       "      <td>[158621]</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                             kw  nbr_assoc_kws term_volumes term_volume_sum\n",
       "first_term                                                                 \n",
       "0Uplink      [0Uplink verified]              1     [523545]    checkmelater\n",
       "0km                    [0km to]              1     [116103]    checkmelater\n",
       "1000s                [1000s of]              1     [939476]    checkmelater\n",
       "100s                  [100s of]              1     [539389]    checkmelater\n",
       "100th       [100th anniversary]              1     [158621]    checkmelater"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>kw</th>\n",
       "      <th>nbr_assoc_kws</th>\n",
       "      <th>term_volumes</th>\n",
       "      <th>term_volume_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>first_term</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>basketball</th>\n",
       "      <td>[basketball and, basketball betting, basketbal...</td>\n",
       "      <td>11</td>\n",
       "      <td>[323681, 269072, 216666, 185744, 311948, 15249...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <td>[Index and, Index by, Index data, Index for, I...</td>\n",
       "      <td>11</td>\n",
       "      <td>[354029, 590581, 105676, 647898, 113591, 25865...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>raw</th>\n",
       "      <td>[raw and, raw data, raw food, raw material, ra...</td>\n",
       "      <td>11</td>\n",
       "      <td>[287549, 1371844, 145483, 884644, 1577636, 121...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>touched</th>\n",
       "      <td>[touched a, touched by, touched down, touched ...</td>\n",
       "      <td>11</td>\n",
       "      <td>[142103, 649108, 109743, 182413, 111843, 10547...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bars</th>\n",
       "      <td>[bars and, bars are, bars for, bars in, bars o...</td>\n",
       "      <td>11</td>\n",
       "      <td>[1466733, 348049, 201923, 424597, 279249, 1839...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bangbus</th>\n",
       "      <td>[bangbus bangbus, bangbus big, bangbus gay, ba...</td>\n",
       "      <td>11</td>\n",
       "      <td>[182417, 186656, 119670, 137929, 136641, 14681...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Institute</th>\n",
       "      <td>[Institute and, Institute at, Institute for, I...</td>\n",
       "      <td>11</td>\n",
       "      <td>[1072662, 447095, 9659333, 272520, 861933, 564...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ireland</th>\n",
       "      <td>[Ireland and, Ireland are, Ireland by, Ireland...</td>\n",
       "      <td>11</td>\n",
       "      <td>[1178369, 107952, 106833, 182818, 176488, 3250...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bag</th>\n",
       "      <td>[bag and, bag for, bag from, bag in, bag is, b...</td>\n",
       "      <td>11</td>\n",
       "      <td>[896240, 395587, 111243, 262668, 473341, 12701...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>respondents</th>\n",
       "      <td>[respondents and, respondents are, respondents...</td>\n",
       "      <td>11</td>\n",
       "      <td>[108590, 172532, 136584, 100972, 274560, 13352...</td>\n",
       "      <td>checkmelater</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                            kw  nbr_assoc_kws  \\\n",
       "first_term                                                                      \n",
       "basketball   [basketball and, basketball betting, basketbal...             11   \n",
       "Index        [Index and, Index by, Index data, Index for, I...             11   \n",
       "raw          [raw and, raw data, raw food, raw material, ra...             11   \n",
       "touched      [touched a, touched by, touched down, touched ...             11   \n",
       "bars         [bars and, bars are, bars for, bars in, bars o...             11   \n",
       "bangbus      [bangbus bangbus, bangbus big, bangbus gay, ba...             11   \n",
       "Institute    [Institute and, Institute at, Institute for, I...             11   \n",
       "Ireland      [Ireland and, Ireland are, Ireland by, Ireland...             11   \n",
       "bag          [bag and, bag for, bag from, bag in, bag is, b...             11   \n",
       "respondents  [respondents and, respondents are, respondents...             11   \n",
       "\n",
       "                                                  term_volumes term_volume_sum  \n",
       "first_term                                                                      \n",
       "basketball   [323681, 269072, 216666, 185744, 311948, 15249...    checkmelater  \n",
       "Index        [354029, 590581, 105676, 647898, 113591, 25865...    checkmelater  \n",
       "raw          [287549, 1371844, 145483, 884644, 1577636, 121...    checkmelater  \n",
       "touched      [142103, 649108, 109743, 182413, 111843, 10547...    checkmelater  \n",
       "bars         [1466733, 348049, 201923, 424597, 279249, 1839...    checkmelater  \n",
       "bangbus      [182417, 186656, 119670, 137929, 136641, 14681...    checkmelater  \n",
       "Institute    [1072662, 447095, 9659333, 272520, 861933, 564...    checkmelater  \n",
       "Ireland      [1178369, 107952, 106833, 182818, 176488, 3250...    checkmelater  \n",
       "bag          [896240, 395587, 111243, 262668, 473341, 12701...    checkmelater  \n",
       "respondents  [108590, 172532, 136584, 100972, 274560, 13352...    checkmelater  "
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out.query('nbr_assoc_kws > 10')\\\n",
    "    .sort_values(by='nbr_assoc_kws', ascending=False).tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sum_term_vols(row):\n",
    "    \"\"\"Return the sum of one group's keyword volumes.\n",
    "\n",
    "    Args:\n",
    "        row: iterable of numeric volumes (one `term_volumes` cell).\n",
    "\n",
    "    Returns:\n",
    "        The numeric total, or the sentinel string \"checkmelater\" when the\n",
    "        values cannot be added (for example when they are still strings).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return sum(row)\n",
    "    except TypeError:\n",
    "        # sum() raises TypeError for non-numeric items or a non-iterable cell;\n",
    "        # catching only that keeps KeyboardInterrupt and real bugs visible.\n",
    "        return \"checkmelater\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total volume per first_term group (or the \"checkmelater\" sentinel when summing fails).\n",
    "out['term_volume_sum'] = out.term_volumes.map(sum_term_vols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>kw</th>\n",
       "      <th>nbr_assoc_kws</th>\n",
       "      <th>term_volumes</th>\n",
       "      <th>term_volume_sum</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>first_term</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>basketball</th>\n",
       "      <td>[basketball and, basketball betting, basketbal...</td>\n",
       "      <td>11</td>\n",
       "      <td>[323681, 269072, 216666, 185744, 311948, 15249...</td>\n",
       "      <td>3016481</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Index</th>\n",
       "      <td>[Index and, Index by, Index data, Index for, I...</td>\n",
       "      <td>11</td>\n",
       "      <td>[354029, 590581, 105676, 647898, 113591, 25865...</td>\n",
       "      <td>21941249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>raw</th>\n",
       "      <td>[raw and, raw data, raw food, raw material, ra...</td>\n",
       "      <td>11</td>\n",
       "      <td>[287549, 1371844, 145483, 884644, 1577636, 121...</td>\n",
       "      <td>5331329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>touched</th>\n",
       "      <td>[touched a, touched by, touched down, touched ...</td>\n",
       "      <td>11</td>\n",
       "      <td>[142103, 649108, 109743, 182413, 111843, 10547...</td>\n",
       "      <td>2572699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bars</th>\n",
       "      <td>[bars and, bars are, bars for, bars in, bars o...</td>\n",
       "      <td>11</td>\n",
       "      <td>[1466733, 348049, 201923, 424597, 279249, 1839...</td>\n",
       "      <td>3712886</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bangbus</th>\n",
       "      <td>[bangbus bangbus, bangbus big, bangbus gay, ba...</td>\n",
       "      <td>11</td>\n",
       "      <td>[182417, 186656, 119670, 137929, 136641, 14681...</td>\n",
       "      <td>1581954</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Institute</th>\n",
       "      <td>[Institute and, Institute at, Institute for, I...</td>\n",
       "      <td>11</td>\n",
       "      <td>[1072662, 447095, 9659333, 272520, 861933, 564...</td>\n",
       "      <td>37220210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ireland</th>\n",
       "      <td>[Ireland and, Ireland are, Ireland by, Ireland...</td>\n",
       "      <td>11</td>\n",
       "      <td>[1178369, 107952, 106833, 182818, 176488, 3250...</td>\n",
       "      <td>3621386</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bag</th>\n",
       "      <td>[bag and, bag for, bag from, bag in, bag is, b...</td>\n",
       "      <td>11</td>\n",
       "      <td>[896240, 395587, 111243, 262668, 473341, 12701...</td>\n",
       "      <td>4749447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>respondents</th>\n",
       "      <td>[respondents and, respondents are, respondents...</td>\n",
       "      <td>11</td>\n",
       "      <td>[108590, 172532, 136584, 100972, 274560, 13352...</td>\n",
       "      <td>2527528</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                            kw  nbr_assoc_kws  \\\n",
       "first_term                                                                      \n",
       "basketball   [basketball and, basketball betting, basketbal...             11   \n",
       "Index        [Index and, Index by, Index data, Index for, I...             11   \n",
       "raw          [raw and, raw data, raw food, raw material, ra...             11   \n",
       "touched      [touched a, touched by, touched down, touched ...             11   \n",
       "bars         [bars and, bars are, bars for, bars in, bars o...             11   \n",
       "bangbus      [bangbus bangbus, bangbus big, bangbus gay, ba...             11   \n",
       "Institute    [Institute and, Institute at, Institute for, I...             11   \n",
       "Ireland      [Ireland and, Ireland are, Ireland by, Ireland...             11   \n",
       "bag          [bag and, bag for, bag from, bag in, bag is, b...             11   \n",
       "respondents  [respondents and, respondents are, respondents...             11   \n",
       "\n",
       "                                                  term_volumes  \\\n",
       "first_term                                                       \n",
       "basketball   [323681, 269072, 216666, 185744, 311948, 15249...   \n",
       "Index        [354029, 590581, 105676, 647898, 113591, 25865...   \n",
       "raw          [287549, 1371844, 145483, 884644, 1577636, 121...   \n",
       "touched      [142103, 649108, 109743, 182413, 111843, 10547...   \n",
       "bars         [1466733, 348049, 201923, 424597, 279249, 1839...   \n",
       "bangbus      [182417, 186656, 119670, 137929, 136641, 14681...   \n",
       "Institute    [1072662, 447095, 9659333, 272520, 861933, 564...   \n",
       "Ireland      [1178369, 107952, 106833, 182818, 176488, 3250...   \n",
       "bag          [896240, 395587, 111243, 262668, 473341, 12701...   \n",
       "respondents  [108590, 172532, 136584, 100972, 274560, 13352...   \n",
       "\n",
       "             term_volume_sum  \n",
       "first_term                    \n",
       "basketball           3016481  \n",
       "Index               21941249  \n",
       "raw                  5331329  \n",
       "touched              2572699  \n",
       "bars                 3712886  \n",
       "bangbus              1581954  \n",
       "Institute           37220210  \n",
       "Ireland              3621386  \n",
       "bag                  4749447  \n",
       "respondents          2527528  "
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out.query('nbr_assoc_kws > 10')\\\n",
    "    .sort_values(by='nbr_assoc_kws', ascending=False).tail(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['197201']"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out.iloc[1299].term_volumes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>keyword</th>\n",
       "      <th>frequency</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0km to</td>\n",
       "      <td>116103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1000s of</td>\n",
       "      <td>939476</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>100s of</td>\n",
       "      <td>539389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>100th anniversary</td>\n",
       "      <td>158621</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10am to</td>\n",
       "      <td>376141</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             keyword  frequency\n",
       "0             0km to     116103\n",
       "1           1000s of     939476\n",
       "2            100s of     539389\n",
       "3  100th anniversary     158621\n",
       "4            10am to     376141"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The file has no header row: without header=None pandas uses the first\n",
    "# record ('0Uplink verified') as column labels and that row is silently lost\n",
    "# once the columns are renamed. Supplying names= keeps every data row.\n",
    "x = pd.read_csv('/Users/jeff/Downloads/count_2w.txt', sep='\\t',\n",
    "                header=None, names=['keyword', 'frequency'])\n",
    "x.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }