Skip to content

Instantly share code, notes, and snippets.

@bmcfee
Created September 3, 2018 20:09
Show Gist options
  • Select an option

  • Save bmcfee/248a74c7f8ccaad611525ff27677eb16 to your computer and use it in GitHub Desktop.

Select an option

Save bmcfee/248a74c7f8ccaad611525ff27677eb16 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ujson as json\n",
"from tqdm import tqdm_notebook as tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib widget\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from tqdm import tqdm_notebook as tqdm\n",
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"metadata = pd.read_csv('/home/bmcfee/data/openmic20k/openmic-20k-metadata.csv')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"label_data = pd.read_csv('/home/bmcfee/data/openmic20k/openmic-20k-sparse-labels-20180805.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"instruments = sorted(pd.unique(label_data['instrument']))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "63b1c513b5d6484a8f8b06ccd275ef3e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"features = dict()\n",
"for key in tqdm(metadata['sample_key']):\n",
" features[key] = np.ravel(np.asarray(json.load(open('/home/bmcfee/data/openmic20k/vggish/{}/{}.json'.format(key[:3], key), 'r'))['features'], dtype=np.uint8))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"features_all = pd.DataFrame.from_dict(features, orient='index')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>1270</th>\n",
" <th>1271</th>\n",
" <th>1272</th>\n",
" <th>1273</th>\n",
" <th>1274</th>\n",
" <th>1275</th>\n",
" <th>1276</th>\n",
" <th>1277</th>\n",
" <th>1278</th>\n",
" <th>1279</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>017059_46080</th>\n",
" <td>186</td>\n",
" <td>38</td>\n",
" <td>241</td>\n",
" <td>76</td>\n",
" <td>164</td>\n",
" <td>31</td>\n",
" <td>62</td>\n",
" <td>63</td>\n",
" <td>149</td>\n",
" <td>223</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>138</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>168</td>\n",
" <td>255</td>\n",
" <td>255</td>\n",
" <td>83</td>\n",
" <td>255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>041800_495360</th>\n",
" <td>177</td>\n",
" <td>6</td>\n",
" <td>149</td>\n",
" <td>97</td>\n",
" <td>178</td>\n",
" <td>95</td>\n",
" <td>99</td>\n",
" <td>119</td>\n",
" <td>160</td>\n",
" <td>175</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>227</td>\n",
" <td>121</td>\n",
" <td>161</td>\n",
" <td>84</td>\n",
" <td>119</td>\n",
" <td>117</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>095447_364800</th>\n",
" <td>185</td>\n",
" <td>29</td>\n",
" <td>175</td>\n",
" <td>114</td>\n",
" <td>202</td>\n",
" <td>49</td>\n",
" <td>110</td>\n",
" <td>136</td>\n",
" <td>129</td>\n",
" <td>229</td>\n",
" <td>...</td>\n",
" <td>48</td>\n",
" <td>204</td>\n",
" <td>203</td>\n",
" <td>1</td>\n",
" <td>107</td>\n",
" <td>0</td>\n",
" <td>255</td>\n",
" <td>52</td>\n",
" <td>43</td>\n",
" <td>255</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3 rows × 1280 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9 \\\n",
"017059_46080 186 38 241 76 164 31 62 63 149 223 \n",
"041800_495360 177 6 149 97 178 95 99 119 160 175 \n",
"095447_364800 185 29 175 114 202 49 110 136 129 229 \n",
"\n",
" ... 1270 1271 1272 1273 1274 1275 1276 1277 1278 \\\n",
"017059_46080 ... 0 138 0 0 0 168 255 255 83 \n",
"041800_495360 ... 0 227 121 161 84 119 117 4 0 \n",
"095447_364800 ... 48 204 203 1 107 0 255 52 43 \n",
"\n",
" 1279 \n",
"017059_46080 255 \n",
"041800_495360 255 \n",
"095447_364800 255 \n",
"\n",
"[3 rows x 1280 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"features_all.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.neighbors import NearestNeighbors"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# p=1 selects the Manhattan (L1) form of the default Minkowski metric;\n",
"# radius=1 is the default search radius for radius_neighbors() queries.\n",
"NN = NearestNeighbors(radius=1, p=1, n_neighbors=10)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=1, n_neighbors=10, p=1, radius=1)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NN.fit(features_all)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"nn_dist, nn_idx = NN.radius_neighbors(features_all)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"nn_idx = [set(_) - set([__]) for __, _ in enumerate(nn_idx)]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# For every row with a non-empty neighbor set, map its index label to the\n",
"# index labels of those neighbors (row positions -> labels).\n",
"dupemap = {\n",
"    features_all.index[row]: [features_all.index[col] for col in neighbors]\n",
"    for row, neighbors in enumerate(nn_idx)\n",
"    if neighbors\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'007954_184320': ['120195_184320'],\n",
" '069465_549120': ['082089_549120'],\n",
" '074328_372480': ['074658_372480'],\n",
" '074658_372480': ['074328_372480'],\n",
" '082089_549120': ['069465_549120'],\n",
" '103892_130560': ['104838_130560'],\n",
" '104838_130560': ['103892_130560'],\n",
" '116011_341760': ['116322_341760'],\n",
" '116322_341760': ['116011_341760'],\n",
" '116585_46080': ['116586_46080'],\n",
" '116586_46080': ['116585_46080'],\n",
" '120195_184320': ['007954_184320']}"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dupemap"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"skey_to_artist = metadata[['sample_key', 'artist_id']].set_index('sample_key')"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"deduped_artist_keys = skey_to_artist.copy()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# Overwrite the artist entry of every clip listed in dupemap with the entry\n",
"# of its canonical clip: the smallest sample key among the clip itself and\n",
"# all of its listed duplicates.\n",
"for k, dupes in dupemap.items():\n",
"    # Consider every duplicate, not only the first: the lists come from\n",
"    # sets, so which element lands at position 0 is arbitrary.\n",
"    true_k = min([k] + list(dupes))\n",
"    deduped_artist_keys.loc[k] = skey_to_artist.loc[true_k]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"deduped_artist_keys.sort_index().to_csv('/home/bmcfee/data/openmic20k/openmic-20k-dedupe-artistids.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.5",
"language": "python",
"name": "python3.5"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment