Created
September 3, 2018 20:09
-
-
Save bmcfee/248a74c7f8ccaad611525ff27677eb16 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import ujson as json\n", | |
| "from tqdm import tqdm_notebook as tqdm" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib widget\n", | |
| "import matplotlib.pyplot as plt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from tqdm import tqdm_notebook as tqdm\n", | |
| "from sklearn.model_selection import StratifiedShuffleSplit\n", | |
| "import seaborn as sns" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "metadata = pd.read_csv('/home/bmcfee/data/openmic20k/openmic-20k-metadata.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "label_data = pd.read_csv('/home/bmcfee/data/openmic20k/openmic-20k-sparse-labels-20180805.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "instruments = sorted(pd.unique(label_data['instrument']))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "63b1c513b5d6484a8f8b06ccd275ef3e", | |
| "version_major": 2, | |
| "version_minor": 0 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Load the stored feature array for every sample key listed in the metadata table.\n", | |
| "features = dict()\n", | |
| "for key in tqdm(metadata['sample_key']):\n", | |
| "    # Feature files live in sub-directories named after the first three characters of the key.\n", | |
| "    path = '/home/bmcfee/data/openmic20k/vggish/{}/{}.json'.format(key[:3], key)\n", | |
| "    # Use a context manager so each file handle is closed as soon as it has been parsed,\n", | |
| "    # instead of leaking one open handle per sample.\n", | |
| "    with open(path, 'r') as fdesc:\n", | |
| "        raw = json.load(fdesc)['features']\n", | |
| "    # Flatten the stored array into a single uint8 vector per sample.\n", | |
| "    features[key] = np.ravel(np.asarray(raw, dtype=np.uint8))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "features_all = pd.DataFrame.from_dict(features, orient='index')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>0</th>\n", | |
| " <th>1</th>\n", | |
| " <th>2</th>\n", | |
| " <th>3</th>\n", | |
| " <th>4</th>\n", | |
| " <th>5</th>\n", | |
| " <th>6</th>\n", | |
| " <th>7</th>\n", | |
| " <th>8</th>\n", | |
| " <th>9</th>\n", | |
| " <th>...</th>\n", | |
| " <th>1270</th>\n", | |
| " <th>1271</th>\n", | |
| " <th>1272</th>\n", | |
| " <th>1273</th>\n", | |
| " <th>1274</th>\n", | |
| " <th>1275</th>\n", | |
| " <th>1276</th>\n", | |
| " <th>1277</th>\n", | |
| " <th>1278</th>\n", | |
| " <th>1279</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>017059_46080</th>\n", | |
| " <td>186</td>\n", | |
| " <td>38</td>\n", | |
| " <td>241</td>\n", | |
| " <td>76</td>\n", | |
| " <td>164</td>\n", | |
| " <td>31</td>\n", | |
| " <td>62</td>\n", | |
| " <td>63</td>\n", | |
| " <td>149</td>\n", | |
| " <td>223</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>138</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>168</td>\n", | |
| " <td>255</td>\n", | |
| " <td>255</td>\n", | |
| " <td>83</td>\n", | |
| " <td>255</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>041800_495360</th>\n", | |
| " <td>177</td>\n", | |
| " <td>6</td>\n", | |
| " <td>149</td>\n", | |
| " <td>97</td>\n", | |
| " <td>178</td>\n", | |
| " <td>95</td>\n", | |
| " <td>99</td>\n", | |
| " <td>119</td>\n", | |
| " <td>160</td>\n", | |
| " <td>175</td>\n", | |
| " <td>...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>227</td>\n", | |
| " <td>121</td>\n", | |
| " <td>161</td>\n", | |
| " <td>84</td>\n", | |
| " <td>119</td>\n", | |
| " <td>117</td>\n", | |
| " <td>4</td>\n", | |
| " <td>0</td>\n", | |
| " <td>255</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>095447_364800</th>\n", | |
| " <td>185</td>\n", | |
| " <td>29</td>\n", | |
| " <td>175</td>\n", | |
| " <td>114</td>\n", | |
| " <td>202</td>\n", | |
| " <td>49</td>\n", | |
| " <td>110</td>\n", | |
| " <td>136</td>\n", | |
| " <td>129</td>\n", | |
| " <td>229</td>\n", | |
| " <td>...</td>\n", | |
| " <td>48</td>\n", | |
| " <td>204</td>\n", | |
| " <td>203</td>\n", | |
| " <td>1</td>\n", | |
| " <td>107</td>\n", | |
| " <td>0</td>\n", | |
| " <td>255</td>\n", | |
| " <td>52</td>\n", | |
| " <td>43</td>\n", | |
| " <td>255</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>3 rows × 1280 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " 0 1 2 3 4 5 6 7 8 9 \\\n", | |
| "017059_46080 186 38 241 76 164 31 62 63 149 223 \n", | |
| "041800_495360 177 6 149 97 178 95 99 119 160 175 \n", | |
| "095447_364800 185 29 175 114 202 49 110 136 129 229 \n", | |
| "\n", | |
| " ... 1270 1271 1272 1273 1274 1275 1276 1277 1278 \\\n", | |
| "017059_46080 ... 0 138 0 0 0 168 255 255 83 \n", | |
| "041800_495360 ... 0 227 121 161 84 119 117 4 0 \n", | |
| "095447_364800 ... 48 204 203 1 107 0 255 52 43 \n", | |
| "\n", | |
| " 1279 \n", | |
| "017059_46080 255 \n", | |
| "041800_495360 255 \n", | |
| "095447_364800 255 \n", | |
| "\n", | |
| "[3 rows x 1280 columns]" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "features_all.head(3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.neighbors import NearestNeighbors" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "NN = NearestNeighbors(n_neighbors=10, p=1, radius=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',\n", | |
| " metric_params=None, n_jobs=1, n_neighbors=10, p=1, radius=1)" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "NN.fit(features_all)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "nn_dist, nn_idx = NN.radius_neighbors(features_all)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Drop each point's own position from its radius-neighbor set, leaving only *other* samples.\n", | |
| "nn_idx = [set(neighbors) - {i} for i, neighbors in enumerate(nn_idx)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Map every sample key that has at least one remaining neighbor to the keys of those neighbors.\n", | |
| "dupemap = {\n", | |
| "    features_all.index[i]: [features_all.index[j] for j in neighbors]\n", | |
| "    for i, neighbors in enumerate(nn_idx)\n", | |
| "    if neighbors\n", | |
| "}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'007954_184320': ['120195_184320'],\n", | |
| " '069465_549120': ['082089_549120'],\n", | |
| " '074328_372480': ['074658_372480'],\n", | |
| " '074658_372480': ['074328_372480'],\n", | |
| " '082089_549120': ['069465_549120'],\n", | |
| " '103892_130560': ['104838_130560'],\n", | |
| " '104838_130560': ['103892_130560'],\n", | |
| " '116011_341760': ['116322_341760'],\n", | |
| " '116322_341760': ['116011_341760'],\n", | |
| " '116585_46080': ['116586_46080'],\n", | |
| " '116586_46080': ['116585_46080'],\n", | |
| " '120195_184320': ['007954_184320']}" | |
| ] | |
| }, | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dupemap" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 62, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "skey_to_artist = metadata[['sample_key', 'artist_id']].set_index('sample_key')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 63, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "deduped_artist_keys = skey_to_artist.copy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 65, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Re-assign the artist id of every duplicated sample to that of the smallest\n", | |
| "# sample key in its duplicate list, so mutually-listed duplicates resolve to the same artist id.\n", | |
| "for k, dupes in dupemap.items():\n", | |
| "    # Take the minimum over *all* listed duplicates rather than only the first entry:\n", | |
| "    # the lists were built from sets, so their order carries no meaning.\n", | |
| "    true_k = min([k] + list(dupes))\n", | |
| "    deduped_artist_keys.loc[k] = skey_to_artist.loc[true_k]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 67, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "deduped_artist_keys.sort_index().to_csv('/home/bmcfee/data/openmic20k/openmic-20k-dedupe-artistids.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3.5", | |
| "language": "python", | |
| "name": "python3.5" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.5" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment