Skip to content

Instantly share code, notes, and snippets.

@jrjames83
Created February 21, 2019 00:58
Show Gist options
  • Save jrjames83/2cebb00b4c51360a35b5a5bafb8547a0 to your computer and use it in GitHub Desktop.
Save jrjames83/2cebb00b4c51360a35b5a5bafb8547a0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"rows = []\n",
"\n",
"# Every usable line of the corpus file has the form keyword<TAB>frequency.\n",
"with open('/Users/jeff/Downloads/count_2w.txt', 'r') as f:\n",
"    data = f.read().split(\"\\n\")\n",
"    for row in data:\n",
"        try:\n",
"            keyword, frequency = row.split(\"\\t\")\n",
"        except ValueError:\n",
"            # Not exactly two tab-separated fields (for example the empty\n",
"            # string left after the final newline): skip the line. Only this\n",
"            # expected failure is ignored; anything else should surface.\n",
"            continue\n",
"        rows.append({\n",
"            \"kw\": keyword,\n",
"            \"freq\": frequency\n",
"        })"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# One DataFrame row per parsed record (columns: kw, freq).\n",
"df = pd.DataFrame.from_records(data=rows)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>freq</th>\n",
" <th>kw</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>523545</td>\n",
" <td>0Uplink verified</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>116103</td>\n",
" <td>0km to</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>939476</td>\n",
" <td>1000s of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>539389</td>\n",
" <td>100s of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>158621</td>\n",
" <td>100th anniversary</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" freq kw\n",
"0 523545 0Uplink verified\n",
"1 116103 0km to\n",
"2 939476 1000s of\n",
"3 539389 100s of\n",
"4 158621 100th anniversary"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five parsed rows.\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>freq</th>\n",
" <th>kw</th>\n",
" <th>first_term</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>523545</td>\n",
" <td>0Uplink verified</td>\n",
" <td>0Uplink</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>116103</td>\n",
" <td>0km to</td>\n",
" <td>0km</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>939476</td>\n",
" <td>1000s of</td>\n",
" <td>1000s</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>539389</td>\n",
" <td>100s of</td>\n",
" <td>100s</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>158621</td>\n",
" <td>100th anniversary</td>\n",
" <td>100th</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" freq kw first_term\n",
"0 523545 0Uplink verified 0Uplink\n",
"1 116103 0km to 0km\n",
"2 939476 1000s of 1000s\n",
"3 539389 100s of 100s\n",
"4 158621 100th anniversary 100th"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First whitespace-delimited token of each keyword. The vectorised .str accessor\n",
"# gives NaN for a blank keyword where x.split()[0] would raise IndexError.\n",
"df['first_term'] = df['kw'].str.split().str[0]\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# Collect every keyword that shares a first term into one list per term.\n",
"out = (\n",
"    df.groupby('first_term')['kw']\n",
"    .apply(list)\n",
"    .to_frame()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>kw</th>\n",
" <th>nbr_assoc_kws</th>\n",
" </tr>\n",
" <tr>\n",
" <th>first_term</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0Uplink</th>\n",
" <td>[0Uplink verified]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0km</th>\n",
" <td>[0km to]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000s</th>\n",
" <td>[1000s of]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100s</th>\n",
" <td>[100s of]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100th</th>\n",
" <td>[100th anniversary]</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" kw nbr_assoc_kws\n",
"first_term \n",
"0Uplink [0Uplink verified] 1\n",
"0km [0km to] 1\n",
"1000s [1000s of] 1\n",
"100s [100s of] 1\n",
"100th [100th anniversary] 1"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of keywords collected under each first term.\n",
"# Open question: which first terms have the most variants?\n",
"out['nbr_assoc_kws'] = out['kw'].map(len)\n",
"out.head()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>kw</th>\n",
" <th>nbr_assoc_kws</th>\n",
" </tr>\n",
" <tr>\n",
" <th>first_term</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>basketball</th>\n",
" <td>[basketball and, basketball betting, basketbal...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <td>[Index and, Index by, Index data, Index for, I...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>raw</th>\n",
" <td>[raw and, raw data, raw food, raw material, ra...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>touched</th>\n",
" <td>[touched a, touched by, touched down, touched ...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bars</th>\n",
" <td>[bars and, bars are, bars for, bars in, bars o...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bangbus</th>\n",
" <td>[bangbus bangbus, bangbus big, bangbus gay, ba...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Institute</th>\n",
" <td>[Institute and, Institute at, Institute for, I...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ireland</th>\n",
" <td>[Ireland and, Ireland are, Ireland by, Ireland...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bag</th>\n",
" <td>[bag and, bag for, bag from, bag in, bag is, b...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>respondents</th>\n",
" <td>[respondents and, respondents are, respondents...</td>\n",
" <td>11</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" kw nbr_assoc_kws\n",
"first_term \n",
"basketball [basketball and, basketball betting, basketbal... 11\n",
"Index [Index and, Index by, Index data, Index for, I... 11\n",
"raw [raw and, raw data, raw food, raw material, ra... 11\n",
"touched [touched a, touched by, touched down, touched ... 11\n",
"bars [bars and, bars are, bars for, bars in, bars o... 11\n",
"bangbus [bangbus bangbus, bangbus big, bangbus gay, ba... 11\n",
"Institute [Institute and, Institute at, Institute for, I... 11\n",
"Ireland [Ireland and, Ireland are, Ireland by, Ireland... 11\n",
"bag [bag and, bag for, bag from, bag in, bag is, b... 11\n",
"respondents [respondents and, respondents are, respondents... 11"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Groups with more than ten keywords, ordered by descending count; tail(10)\n",
"# then shows the last ten rows of that ordering.\n",
"(\n",
"    out.loc[out['nbr_assoc_kws'] > 10]\n",
"    .sort_values(by='nbr_assoc_kws', ascending=False)\n",
"    .tail(10)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# Keyword -> frequency lookup, used to answer: what is the frequency total\n",
"# for each first_term group?\n",
"kw_freq_dict = {kw: freq for kw, freq in zip(df['kw'], df['freq'])}"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# For each first term, look up the frequency of every associated keyword\n",
"# and convert it to an int.\n",
"out['term_volumes'] = out['kw'].map(\n",
"    lambda keywords: [int(kw_freq_dict.get(keyword)) for keyword in keywords]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>kw</th>\n",
" <th>nbr_assoc_kws</th>\n",
" <th>term_volumes</th>\n",
" <th>term_volume_sum</th>\n",
" </tr>\n",
" <tr>\n",
" <th>first_term</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0Uplink</th>\n",
" <td>[0Uplink verified]</td>\n",
" <td>1</td>\n",
" <td>[523545]</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0km</th>\n",
" <td>[0km to]</td>\n",
" <td>1</td>\n",
" <td>[116103]</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1000s</th>\n",
" <td>[1000s of]</td>\n",
" <td>1</td>\n",
" <td>[939476]</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100s</th>\n",
" <td>[100s of]</td>\n",
" <td>1</td>\n",
" <td>[539389]</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100th</th>\n",
" <td>[100th anniversary]</td>\n",
" <td>1</td>\n",
" <td>[158621]</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" kw nbr_assoc_kws term_volumes term_volume_sum\n",
"first_term \n",
"0Uplink [0Uplink verified] 1 [523545] checkmelater\n",
"0km [0km to] 1 [116103] checkmelater\n",
"1000s [1000s of] 1 [939476] checkmelater\n",
"100s [100s of] 1 [539389] checkmelater\n",
"100th [100th anniversary] 1 [158621] checkmelater"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five first-term groups.\n",
"out.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>kw</th>\n",
" <th>nbr_assoc_kws</th>\n",
" <th>term_volumes</th>\n",
" <th>term_volume_sum</th>\n",
" </tr>\n",
" <tr>\n",
" <th>first_term</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>basketball</th>\n",
" <td>[basketball and, basketball betting, basketbal...</td>\n",
" <td>11</td>\n",
" <td>[323681, 269072, 216666, 185744, 311948, 15249...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <td>[Index and, Index by, Index data, Index for, I...</td>\n",
" <td>11</td>\n",
" <td>[354029, 590581, 105676, 647898, 113591, 25865...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>raw</th>\n",
" <td>[raw and, raw data, raw food, raw material, ra...</td>\n",
" <td>11</td>\n",
" <td>[287549, 1371844, 145483, 884644, 1577636, 121...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>touched</th>\n",
" <td>[touched a, touched by, touched down, touched ...</td>\n",
" <td>11</td>\n",
" <td>[142103, 649108, 109743, 182413, 111843, 10547...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bars</th>\n",
" <td>[bars and, bars are, bars for, bars in, bars o...</td>\n",
" <td>11</td>\n",
" <td>[1466733, 348049, 201923, 424597, 279249, 1839...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bangbus</th>\n",
" <td>[bangbus bangbus, bangbus big, bangbus gay, ba...</td>\n",
" <td>11</td>\n",
" <td>[182417, 186656, 119670, 137929, 136641, 14681...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Institute</th>\n",
" <td>[Institute and, Institute at, Institute for, I...</td>\n",
" <td>11</td>\n",
" <td>[1072662, 447095, 9659333, 272520, 861933, 564...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ireland</th>\n",
" <td>[Ireland and, Ireland are, Ireland by, Ireland...</td>\n",
" <td>11</td>\n",
" <td>[1178369, 107952, 106833, 182818, 176488, 3250...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bag</th>\n",
" <td>[bag and, bag for, bag from, bag in, bag is, b...</td>\n",
" <td>11</td>\n",
" <td>[896240, 395587, 111243, 262668, 473341, 12701...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" <tr>\n",
" <th>respondents</th>\n",
" <td>[respondents and, respondents are, respondents...</td>\n",
" <td>11</td>\n",
" <td>[108590, 172532, 136584, 100972, 274560, 13352...</td>\n",
" <td>checkmelater</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" kw nbr_assoc_kws \\\n",
"first_term \n",
"basketball [basketball and, basketball betting, basketbal... 11 \n",
"Index [Index and, Index by, Index data, Index for, I... 11 \n",
"raw [raw and, raw data, raw food, raw material, ra... 11 \n",
"touched [touched a, touched by, touched down, touched ... 11 \n",
"bars [bars and, bars are, bars for, bars in, bars o... 11 \n",
"bangbus [bangbus bangbus, bangbus big, bangbus gay, ba... 11 \n",
"Institute [Institute and, Institute at, Institute for, I... 11 \n",
"Ireland [Ireland and, Ireland are, Ireland by, Ireland... 11 \n",
"bag [bag and, bag for, bag from, bag in, bag is, b... 11 \n",
"respondents [respondents and, respondents are, respondents... 11 \n",
"\n",
" term_volumes term_volume_sum \n",
"first_term \n",
"basketball [323681, 269072, 216666, 185744, 311948, 15249... checkmelater \n",
"Index [354029, 590581, 105676, 647898, 113591, 25865... checkmelater \n",
"raw [287549, 1371844, 145483, 884644, 1577636, 121... checkmelater \n",
"touched [142103, 649108, 109743, 182413, 111843, 10547... checkmelater \n",
"bars [1466733, 348049, 201923, 424597, 279249, 1839... checkmelater \n",
"bangbus [182417, 186656, 119670, 137929, 136641, 14681... checkmelater \n",
"Institute [1072662, 447095, 9659333, 272520, 861933, 564... checkmelater \n",
"Ireland [1178369, 107952, 106833, 182818, 176488, 3250... checkmelater \n",
"bag [896240, 395587, 111243, 262668, 473341, 12701... checkmelater \n",
"respondents [108590, 172532, 136584, 100972, 274560, 13352... checkmelater "
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Groups with more than ten keywords, ordered by descending count; tail(10)\n",
"# then shows the last ten rows of that ordering.\n",
"(\n",
"    out.loc[out['nbr_assoc_kws'] > 10]\n",
"    .sort_values(by='nbr_assoc_kws', ascending=False)\n",
"    .tail(10)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def sum_term_vols(row):\n",
"    \"\"\"Return the total of the volumes in ``row``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    row : iterable of numbers\n",
"        The values to add up.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The sum of ``row``, or the string 'checkmelater' when ``row`` cannot be\n",
"    summed (it is not iterable, or it holds non-numeric items such as\n",
"    strings), so that such rows can be located and inspected afterwards.\n",
"    \"\"\"\n",
"    try:\n",
"        return sum(row)\n",
"    except TypeError:\n",
"        # Only the expected failure becomes the sentinel; a bare except would\n",
"        # also hide KeyboardInterrupt and genuine bugs.\n",
"        return \"checkmelater\""
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"# Total volume per first term, computed from its list of keyword volumes.\n",
"out['term_volume_sum'] = out['term_volumes'].map(sum_term_vols)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>kw</th>\n",
" <th>nbr_assoc_kws</th>\n",
" <th>term_volumes</th>\n",
" <th>term_volume_sum</th>\n",
" </tr>\n",
" <tr>\n",
" <th>first_term</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>basketball</th>\n",
" <td>[basketball and, basketball betting, basketbal...</td>\n",
" <td>11</td>\n",
" <td>[323681, 269072, 216666, 185744, 311948, 15249...</td>\n",
" <td>3016481</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Index</th>\n",
" <td>[Index and, Index by, Index data, Index for, I...</td>\n",
" <td>11</td>\n",
" <td>[354029, 590581, 105676, 647898, 113591, 25865...</td>\n",
" <td>21941249</td>\n",
" </tr>\n",
" <tr>\n",
" <th>raw</th>\n",
" <td>[raw and, raw data, raw food, raw material, ra...</td>\n",
" <td>11</td>\n",
" <td>[287549, 1371844, 145483, 884644, 1577636, 121...</td>\n",
" <td>5331329</td>\n",
" </tr>\n",
" <tr>\n",
" <th>touched</th>\n",
" <td>[touched a, touched by, touched down, touched ...</td>\n",
" <td>11</td>\n",
" <td>[142103, 649108, 109743, 182413, 111843, 10547...</td>\n",
" <td>2572699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bars</th>\n",
" <td>[bars and, bars are, bars for, bars in, bars o...</td>\n",
" <td>11</td>\n",
" <td>[1466733, 348049, 201923, 424597, 279249, 1839...</td>\n",
" <td>3712886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bangbus</th>\n",
" <td>[bangbus bangbus, bangbus big, bangbus gay, ba...</td>\n",
" <td>11</td>\n",
" <td>[182417, 186656, 119670, 137929, 136641, 14681...</td>\n",
" <td>1581954</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Institute</th>\n",
" <td>[Institute and, Institute at, Institute for, I...</td>\n",
" <td>11</td>\n",
" <td>[1072662, 447095, 9659333, 272520, 861933, 564...</td>\n",
" <td>37220210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Ireland</th>\n",
" <td>[Ireland and, Ireland are, Ireland by, Ireland...</td>\n",
" <td>11</td>\n",
" <td>[1178369, 107952, 106833, 182818, 176488, 3250...</td>\n",
" <td>3621386</td>\n",
" </tr>\n",
" <tr>\n",
" <th>bag</th>\n",
" <td>[bag and, bag for, bag from, bag in, bag is, b...</td>\n",
" <td>11</td>\n",
" <td>[896240, 395587, 111243, 262668, 473341, 12701...</td>\n",
" <td>4749447</td>\n",
" </tr>\n",
" <tr>\n",
" <th>respondents</th>\n",
" <td>[respondents and, respondents are, respondents...</td>\n",
" <td>11</td>\n",
" <td>[108590, 172532, 136584, 100972, 274560, 13352...</td>\n",
" <td>2527528</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" kw nbr_assoc_kws \\\n",
"first_term \n",
"basketball [basketball and, basketball betting, basketbal... 11 \n",
"Index [Index and, Index by, Index data, Index for, I... 11 \n",
"raw [raw and, raw data, raw food, raw material, ra... 11 \n",
"touched [touched a, touched by, touched down, touched ... 11 \n",
"bars [bars and, bars are, bars for, bars in, bars o... 11 \n",
"bangbus [bangbus bangbus, bangbus big, bangbus gay, ba... 11 \n",
"Institute [Institute and, Institute at, Institute for, I... 11 \n",
"Ireland [Ireland and, Ireland are, Ireland by, Ireland... 11 \n",
"bag [bag and, bag for, bag from, bag in, bag is, b... 11 \n",
"respondents [respondents and, respondents are, respondents... 11 \n",
"\n",
" term_volumes \\\n",
"first_term \n",
"basketball [323681, 269072, 216666, 185744, 311948, 15249... \n",
"Index [354029, 590581, 105676, 647898, 113591, 25865... \n",
"raw [287549, 1371844, 145483, 884644, 1577636, 121... \n",
"touched [142103, 649108, 109743, 182413, 111843, 10547... \n",
"bars [1466733, 348049, 201923, 424597, 279249, 1839... \n",
"bangbus [182417, 186656, 119670, 137929, 136641, 14681... \n",
"Institute [1072662, 447095, 9659333, 272520, 861933, 564... \n",
"Ireland [1178369, 107952, 106833, 182818, 176488, 3250... \n",
"bag [896240, 395587, 111243, 262668, 473341, 12701... \n",
"respondents [108590, 172532, 136584, 100972, 274560, 13352... \n",
"\n",
" term_volume_sum \n",
"first_term \n",
"basketball 3016481 \n",
"Index 21941249 \n",
"raw 5331329 \n",
"touched 2572699 \n",
"bars 3712886 \n",
"bangbus 1581954 \n",
"Institute 37220210 \n",
"Ireland 3621386 \n",
"bag 4749447 \n",
"respondents 2527528 "
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Groups with more than ten keywords, ordered by descending count; tail(10)\n",
"# then shows the last ten rows of that ordering.\n",
"(\n",
"    out.loc[out['nbr_assoc_kws'] > 10]\n",
"    .sort_values(by='nbr_assoc_kws', ascending=False)\n",
"    .tail(10)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['197201']"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check the volumes stored for the group at position 1299.\n",
"out['term_volumes'].iloc[1299]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>keyword</th>\n",
" <th>frequency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0km to</td>\n",
" <td>116103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1000s of</td>\n",
" <td>939476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>100s of</td>\n",
" <td>539389</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>100th anniversary</td>\n",
" <td>158621</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10am to</td>\n",
" <td>376141</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" keyword frequency\n",
"0 0km to 116103\n",
"1 1000s of 939476\n",
"2 100s of 539389\n",
"3 100th anniversary 158621\n",
"4 10am to 376141"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The file has no header row, so say so explicitly: with the default header=0,\n",
"# read_csv consumes the first record as column names and it is lost from the data.\n",
"x = pd.read_csv(\n",
"    '/Users/jeff/Downloads/count_2w.txt',\n",
"    sep='\\t',\n",
"    header=None,\n",
"    names=['keyword', 'frequency'],\n",
")\n",
"x.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment