Created
June 17, 2021 06:30
-
-
Save amqdn/a8b60cc5d29835de654bcbd463512a10 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"## Welcome to the Jigsaw Toxic Comment Classification Challenge","metadata":{}},{"cell_type":"code","source":"import numpy as np\nimport pandas as pd\nimport altair as alt","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2021-06-17T06:20:50.463156Z","iopub.execute_input":"2021-06-17T06:20:50.463656Z","iopub.status.idle":"2021-06-17T06:20:50.628071Z","shell.execute_reply.started":"2021-06-17T06:20:50.463557Z","shell.execute_reply":"2021-06-17T06:20:50.627068Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"import os\nos.chdir('../input/jigsaw-toxic-comment-classification-challenge/')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:50.629479Z","iopub.execute_input":"2021-06-17T06:20:50.629760Z","iopub.status.idle":"2021-06-17T06:20:50.633034Z","shell.execute_reply.started":"2021-06-17T06:20:50.629734Z","shell.execute_reply":"2021-06-17T06:20:50.632373Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"markdown","source":"We are provided `train.csv`, `test.csv`, and `test_labels.csv`. 
All of these files also happen to be `.zip`.","metadata":{}},{"cell_type":"code","source":"train = pd.read_csv('train.csv.zip')\ntrain.head()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:50.634656Z","iopub.execute_input":"2021-06-17T06:20:50.635053Z","iopub.status.idle":"2021-06-17T06:20:52.750224Z","shell.execute_reply.started":"2021-06-17T06:20:50.635023Z","shell.execute_reply":"2021-06-17T06:20:52.749325Z"},"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":" id comment_text toxic \\\n0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n\n severe_toxic obscene threat insult identity_hate \n0 0 0 0 0 0 \n1 0 0 0 0 0 \n2 0 0 0 0 0 \n3 0 0 0 0 0 \n4 0 0 0 0 0 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0000997932d777bf</td>\n <td>Explanation\\nWhy the edits made under my usern...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>000103f0d9cfb60f</td>\n <td>D'aww! 
He matches this background colour I'm s...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>000113f07ec002fd</td>\n <td>Hey man, I'm really not trying to edit war. It...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0001b41b1c6bb37e</td>\n <td>\"\\nMore\\nI can't make any real suggestions on ...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0001d958c54c6e35</td>\n <td>You, sir, are my hero. Any chance you remember...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train.info()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.751677Z","iopub.execute_input":"2021-06-17T06:20:52.752059Z","iopub.status.idle":"2021-06-17T06:20:52.808832Z","shell.execute_reply.started":"2021-06-17T06:20:52.752029Z","shell.execute_reply":"2021-06-17T06:20:52.808155Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 159571 entries, 0 to 159570\nData columns (total 8 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 id 159571 non-null object\n 1 comment_text 159571 non-null object\n 2 toxic 159571 non-null int64 \n 3 severe_toxic 159571 non-null int64 \n 4 obscene 159571 non-null int64 \n 5 threat 159571 non-null int64 \n 6 insult 159571 non-null int64 \n 7 identity_hate 159571 non-null int64 \ndtypes: int64(6), object(2)\nmemory usage: 9.7+ MB\n","output_type":"stream"}]},{"cell_type":"markdown","source":"We don't need the `id` column, so we'll drop that. Let's also sample some examples of each type of class. 
*Warning: It can get pretty bad.*","metadata":{}},{"cell_type":"code","source":"train = train.drop(columns=['id'])","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.809971Z","iopub.execute_input":"2021-06-17T06:20:52.810221Z","iopub.status.idle":"2021-06-17T06:20:52.824192Z","shell.execute_reply.started":"2021-06-17T06:20:52.810196Z","shell.execute_reply":"2021-06-17T06:20:52.823263Z"},"trusted":true},"execution_count":5,"outputs":[]},{"cell_type":"code","source":"for col in train.columns[1:]:\n print(f'CLASS: {col}')\n print(train.loc[train[col] == 1].sample().comment_text.values[0])\n print()\n print('----')\n print()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.825254Z","iopub.execute_input":"2021-06-17T06:20:52.825534Z","iopub.status.idle":"2021-06-17T06:20:52.867466Z","shell.execute_reply.started":"2021-06-17T06:20:52.825507Z","shell.execute_reply":"2021-06-17T06:20:52.866555Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"CLASS: toxic\nhurt? i'm alright and don't give a damn fool.\n\n----\n\nCLASS: severe_toxic\ndie \n\nplease why dont u do us all a favioe and kill yourslef\n\n----\n\nCLASS: obscene\nYou guys have a lot of balls whining about WP transgressions after the displays of lack of respect for these policies yourselves! Man WP displays it's ignorance and bias for a few childish editors in these examples. Compete hoax. The idiots pushing allthe rules cry realquick when the same techniques are spouted towards them.\n\n----\n\nCLASS: threat\nYOURE GONNA BE A DEAD FAGGOT 66.158.193.58\n\n----\n\nCLASS: insult\nfuck you\n\nfuck you majorly. suck my dick asshole.\n\n----\n\nCLASS: identity_hate\nWhy am I being blocked by an obvious anti-semite? Gareth Hughes, the so called anglican priest is a racist and an anti-semite. Look at everyone of his posts regarding Assyrians? He hates Assyrians, he is a jealous little twit with too much time on his hands and no real education... 
Anti-Semite!!!\n\n----\n\n","output_type":"stream"}]},{"cell_type":"markdown","source":"Ah, the internet... Let's plot the distribution of labels:","metadata":{}},{"cell_type":"code","source":"label_counts = train.iloc[:, 1:].sum().reset_index().rename(columns={'index': 'Label', 0: 'Count'})\nalt.Chart(label_counts).mark_bar().encode(\n x=alt.X('Label', axis=alt.Axis(labelAngle=-45)),\n y='Count'\n).properties(title='Train Set Label Distribution')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.868661Z","iopub.execute_input":"2021-06-17T06:20:52.868912Z","iopub.status.idle":"2021-06-17T06:20:52.906740Z","shell.execute_reply.started":"2021-06-17T06:20:52.868886Z","shell.execute_reply":"2021-06-17T06:20:52.905756Z"},"trusted":true},"execution_count":7,"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/html":"\n<div id=\"altair-viz-f9d85869fa82461a9ad261a73ee198b0\"></div>\n<script type=\"text/javascript\">\n (function(spec, embedOpt){\n let outputDiv = document.currentScript.previousElementSibling;\n if (outputDiv.id !== \"altair-viz-f9d85869fa82461a9ad261a73ee198b0\") {\n outputDiv = document.getElementById(\"altair-viz-f9d85869fa82461a9ad261a73ee198b0\");\n }\n const paths = {\n \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n \"vega-lite\": \"https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext\",\n \"vega-embed\": \"https://cdn.jsdelivr.net/npm//vega-embed@6?noext\",\n };\n\n function loadScript(lib) {\n return new Promise(function(resolve, reject) {\n var s = document.createElement('script');\n s.src = paths[lib];\n s.async = true;\n s.onload = () => resolve(paths[lib]);\n s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n document.getElementsByTagName(\"head\")[0].appendChild(s);\n });\n }\n\n function showError(err) {\n outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n throw err;\n }\n\n 
function displayChart(vegaEmbed) {\n vegaEmbed(outputDiv, spec, embedOpt)\n .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n }\n\n if(typeof define === \"function\" && define.amd) {\n requirejs.config({paths});\n require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n } else if (typeof vegaEmbed === \"function\") {\n displayChart(vegaEmbed);\n } else {\n loadScript(\"vega\")\n .then(() => loadScript(\"vega-lite\"))\n .then(() => loadScript(\"vega-embed\"))\n .catch(showError)\n .then(() => displayChart(vegaEmbed));\n }\n })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-c17690826a29f3a34e7037bd2199575d\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"type\": \"nominal\", \"axis\": {\"labelAngle\": -45}, \"field\": \"Label\"}, \"y\": {\"type\": \"quantitative\", \"field\": \"Count\"}}, \"title\": \"Train Set Label Distribution\", \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.8.1.json\", \"datasets\": {\"data-c17690826a29f3a34e7037bd2199575d\": [{\"Label\": \"toxic\", \"Count\": 15294}, {\"Label\": \"severe_toxic\", \"Count\": 1595}, {\"Label\": \"obscene\", \"Count\": 8449}, {\"Label\": \"threat\", \"Count\": 478}, {\"Label\": \"insult\", \"Count\": 7877}, {\"Label\": \"identity_hate\", \"Count\": 1405}]}}, {\"mode\": \"vega-lite\"});\n</script>","text/plain":"alt.Chart(...)"},"metadata":{}}]},{"cell_type":"code","source":"test = 
pd.read_csv('test.csv.zip')\ntest.head()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:52.909679Z","iopub.execute_input":"2021-06-17T06:20:52.910091Z","iopub.status.idle":"2021-06-17T06:20:54.614564Z","shell.execute_reply.started":"2021-06-17T06:20:52.910049Z","shell.execute_reply":"2021-06-17T06:20:54.613590Z"},"trusted":true},"execution_count":8,"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" id comment_text\n0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll...\n1 0000247867823ef7 == From RfC == \\n\\n The title is fine as it is...\n2 00013b17ad220c46 \" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...\n3 00017563c3f7919a :If you have a look back at the source, the in...\n4 00017695ad8997eb I don't anonymously edit articles at all.","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>comment_text</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>00001cee341fdb12</td>\n <td>Yo bitch Ja Rule is more succesful then you'll...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0000247867823ef7</td>\n <td>== From RfC == \\n\\n The title is fine as it is...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>00013b17ad220c46</td>\n <td>\" \\n\\n == Sources == \\n\\n * Zawe Ashton on Lap...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>00017563c3f7919a</td>\n <td>:If you have a look back at the source, the in...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>00017695ad8997eb</td>\n <td>I don't anonymously edit articles at all.</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test_labels = 
pd.read_csv('test_labels.csv.zip')\ntest_labels.head()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.616392Z","iopub.execute_input":"2021-06-17T06:20:54.616785Z","iopub.status.idle":"2021-06-17T06:20:54.841431Z","shell.execute_reply.started":"2021-06-17T06:20:54.616749Z","shell.execute_reply":"2021-06-17T06:20:54.840803Z"},"trusted":true},"execution_count":9,"outputs":[{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" id toxic severe_toxic obscene threat insult \\\n0 00001cee341fdb12 -1 -1 -1 -1 -1 \n1 0000247867823ef7 -1 -1 -1 -1 -1 \n2 00013b17ad220c46 -1 -1 -1 -1 -1 \n3 00017563c3f7919a -1 -1 -1 -1 -1 \n4 00017695ad8997eb -1 -1 -1 -1 -1 \n\n identity_hate \n0 -1 \n1 -1 \n2 -1 \n3 -1 \n4 -1 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>00001cee341fdb12</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0000247867823ef7</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>00013b17ad220c46</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>00017563c3f7919a</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>00017695ad8997eb</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n <td>-1</td>\n </tr>\n 
</tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"That's weird. Why are the test labels all `-1`?","metadata":{}},{"cell_type":"code","source":"test_labels.describe()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.842449Z","iopub.execute_input":"2021-06-17T06:20:54.842749Z","iopub.status.idle":"2021-06-17T06:20:54.905492Z","shell.execute_reply.started":"2021-06-17T06:20:54.842721Z","shell.execute_reply":"2021-06-17T06:20:54.904579Z"},"trusted":true},"execution_count":10,"outputs":[{"execution_count":10,"output_type":"execute_result","data":{"text/plain":" toxic severe_toxic obscene threat \\\ncount 153164.000000 153164.000000 153164.000000 153164.000000 \nmean -0.542530 -0.579895 -0.558193 -0.580913 \nstd 0.572465 0.498408 0.542966 0.496195 \nmin -1.000000 -1.000000 -1.000000 -1.000000 \n25% -1.000000 -1.000000 -1.000000 -1.000000 \n50% -1.000000 -1.000000 -1.000000 -1.000000 \n75% 0.000000 0.000000 0.000000 0.000000 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n insult identity_hate \ncount 153164.000000 153164.000000 \nmean -0.559916 -0.577642 \nstd 0.539594 0.503260 \nmin -1.000000 -1.000000 \n25% -1.000000 -1.000000 \n50% -1.000000 -1.000000 \n75% 0.000000 0.000000 \nmax 1.000000 1.000000 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>-0.542530</td>\n <td>-0.579895</td>\n 
<td>-0.558193</td>\n <td>-0.580913</td>\n <td>-0.559916</td>\n <td>-0.577642</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.572465</td>\n <td>0.498408</td>\n <td>0.542966</td>\n <td>0.496195</td>\n <td>0.539594</td>\n <td>0.503260</td>\n </tr>\n <tr>\n <th>min</th>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n <td>-1.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"train.describe()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.906820Z","iopub.execute_input":"2021-06-17T06:20:54.907165Z","iopub.status.idle":"2021-06-17T06:20:54.961386Z","shell.execute_reply.started":"2021-06-17T06:20:54.907126Z","shell.execute_reply":"2021-06-17T06:20:54.960265Z"},"trusted":true},"execution_count":11,"outputs":[{"execution_count":11,"output_type":"execute_result","data":{"text/plain":" toxic severe_toxic obscene threat \\\ncount 159571.000000 159571.000000 159571.000000 159571.000000 \nmean 0.095844 0.009996 0.052948 0.002996 \nstd 0.294379 0.099477 0.223931 0.054650 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.000000 0.000000 0.000000 0.000000 \n50% 0.000000 0.000000 0.000000 0.000000 \n75% 0.000000 0.000000 0.000000 0.000000 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n insult identity_hate \ncount 159571.000000 159571.000000 \nmean 0.049364 0.008805 
\nstd 0.216627 0.093420 \nmin 0.000000 0.000000 \n25% 0.000000 0.000000 \n50% 0.000000 0.000000 \n75% 0.000000 0.000000 \nmax 1.000000 1.000000 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>159571.000000</td>\n <td>159571.000000</td>\n <td>159571.000000</td>\n <td>159571.000000</td>\n <td>159571.000000</td>\n <td>159571.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>0.095844</td>\n <td>0.009996</td>\n <td>0.052948</td>\n <td>0.002996</td>\n <td>0.049364</td>\n <td>0.008805</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.294379</td>\n <td>0.099477</td>\n <td>0.223931</td>\n <td>0.054650</td>\n <td>0.216627</td>\n <td>0.093420</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"markdown","source":"This is another good reason to `.describe()` any df because weird 
values will likely not show up in `.head()`. Let's fix this now:","metadata":{}},{"cell_type":"code","source":"test_labels = test_labels.drop(columns=['id'])\nfor col in test_labels.columns:\n test_labels[col].loc[test_labels[col] == -1] = 0\ntest_labels.describe()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:54.963773Z","iopub.execute_input":"2021-06-17T06:20:54.964199Z","iopub.status.idle":"2021-06-17T06:20:55.033905Z","shell.execute_reply.started":"2021-06-17T06:20:54.964159Z","shell.execute_reply":"2021-06-17T06:20:55.029310Z"},"trusted":true},"execution_count":12,"outputs":[{"execution_count":12,"output_type":"execute_result","data":{"text/plain":" toxic severe_toxic obscene threat \\\ncount 153164.000000 153164.000000 153164.000000 153164.000000 \nmean 0.039761 0.002396 0.024098 0.001378 \nstd 0.195399 0.048892 0.153355 0.037091 \nmin 0.000000 0.000000 0.000000 0.000000 \n25% 0.000000 0.000000 0.000000 0.000000 \n50% 0.000000 0.000000 0.000000 0.000000 \n75% 0.000000 0.000000 0.000000 0.000000 \nmax 1.000000 1.000000 1.000000 1.000000 \n\n insult identity_hate \ncount 153164.000000 153164.000000 \nmean 0.022375 0.004649 \nstd 0.147899 0.068022 \nmin 0.000000 0.000000 \n25% 0.000000 0.000000 \n50% 0.000000 0.000000 \n75% 0.000000 0.000000 \nmax 1.000000 1.000000 ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>toxic</th>\n <th>severe_toxic</th>\n <th>obscene</th>\n <th>threat</th>\n <th>insult</th>\n <th>identity_hate</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n <td>153164.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n 
<td>0.039761</td>\n <td>0.002396</td>\n <td>0.024098</td>\n <td>0.001378</td>\n <td>0.022375</td>\n <td>0.004649</td>\n </tr>\n <tr>\n <th>std</th>\n <td>0.195399</td>\n <td>0.048892</td>\n <td>0.153355</td>\n <td>0.037091</td>\n <td>0.147899</td>\n <td>0.068022</td>\n </tr>\n <tr>\n <th>min</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n <td>1.000000</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"test.info()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.035358Z","iopub.execute_input":"2021-06-17T06:20:55.035773Z","iopub.status.idle":"2021-06-17T06:20:55.100754Z","shell.execute_reply.started":"2021-06-17T06:20:55.035730Z","shell.execute_reply":"2021-06-17T06:20:55.099622Z"},"trusted":true},"execution_count":13,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 153164 entries, 0 to 153163\nData columns (total 2 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 id 153164 non-null object\n 1 comment_text 153164 non-null object\ndtypes: object(2)\nmemory usage: 2.3+ 
MB\n","output_type":"stream"}]},{"cell_type":"code","source":"test_labels.info()","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.102308Z","iopub.execute_input":"2021-06-17T06:20:55.102719Z","iopub.status.idle":"2021-06-17T06:20:55.122570Z","shell.execute_reply.started":"2021-06-17T06:20:55.102678Z","shell.execute_reply":"2021-06-17T06:20:55.121521Z"},"trusted":true},"execution_count":14,"outputs":[{"name":"stdout","text":"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 153164 entries, 0 to 153163\nData columns (total 6 columns):\n # Column Non-Null Count Dtype\n--- ------ -------------- -----\n 0 toxic 153164 non-null int64\n 1 severe_toxic 153164 non-null int64\n 2 obscene 153164 non-null int64\n 3 threat 153164 non-null int64\n 4 insult 153164 non-null int64\n 5 identity_hate 153164 non-null int64\ndtypes: int64(6)\nmemory usage: 7.0 MB\n","output_type":"stream"}]},{"cell_type":"code","source":"label_counts = test_labels.sum().reset_index().rename(columns={'index': 'Label', 0: 'Count'})\nalt.Chart(label_counts).mark_bar().encode(\n x=alt.X('Label', axis=alt.Axis(labelAngle=-45)),\n y='Count'\n).properties(title='Dev Set Label Distribution')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.124194Z","iopub.execute_input":"2021-06-17T06:20:55.124622Z","iopub.status.idle":"2021-06-17T06:20:55.155742Z","shell.execute_reply.started":"2021-06-17T06:20:55.124572Z","shell.execute_reply":"2021-06-17T06:20:55.154728Z"},"trusted":true},"execution_count":15,"outputs":[{"execution_count":15,"output_type":"execute_result","data":{"text/html":"\n<div id=\"altair-viz-0695990f152a428b88b61dcd24bf9877\"></div>\n<script type=\"text/javascript\">\n (function(spec, embedOpt){\n let outputDiv = document.currentScript.previousElementSibling;\n if (outputDiv.id !== \"altair-viz-0695990f152a428b88b61dcd24bf9877\") {\n outputDiv = document.getElementById(\"altair-viz-0695990f152a428b88b61dcd24bf9877\");\n }\n const paths = {\n \"vega\": 
\"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n \"vega-lite\": \"https://cdn.jsdelivr.net/npm//vega-lite@4.8.1?noext\",\n \"vega-embed\": \"https://cdn.jsdelivr.net/npm//vega-embed@6?noext\",\n };\n\n function loadScript(lib) {\n return new Promise(function(resolve, reject) {\n var s = document.createElement('script');\n s.src = paths[lib];\n s.async = true;\n s.onload = () => resolve(paths[lib]);\n s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n document.getElementsByTagName(\"head\")[0].appendChild(s);\n });\n }\n\n function showError(err) {\n outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n throw err;\n }\n\n function displayChart(vegaEmbed) {\n vegaEmbed(outputDiv, spec, embedOpt)\n .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n }\n\n if(typeof define === \"function\" && define.amd) {\n requirejs.config({paths});\n require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n } else if (typeof vegaEmbed === \"function\") {\n displayChart(vegaEmbed);\n } else {\n loadScript(\"vega\")\n .then(() => loadScript(\"vega-lite\"))\n .then(() => loadScript(\"vega-embed\"))\n .catch(showError)\n .then(() => displayChart(vegaEmbed));\n }\n })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-99a28f472e6fe776ccd3eb7e0bc0aa85\"}, \"mark\": \"bar\", \"encoding\": {\"x\": {\"type\": \"nominal\", \"axis\": {\"labelAngle\": -45}, \"field\": \"Label\"}, \"y\": {\"type\": \"quantitative\", \"field\": \"Count\"}}, \"title\": \"Dev Set Label Distribution\", \"$schema\": \"https://vega.github.io/schema/vega-lite/v4.8.1.json\", \"datasets\": {\"data-99a28f472e6fe776ccd3eb7e0bc0aa85\": [{\"Label\": \"toxic\", \"Count\": 6090}, {\"Label\": 
\"severe_toxic\", \"Count\": 367}, {\"Label\": \"obscene\", \"Count\": 3691}, {\"Label\": \"threat\", \"Count\": 211}, {\"Label\": \"insult\", \"Count\": 3427}, {\"Label\": \"identity_hate\", \"Count\": 712}]}}, {\"mode\": \"vega-lite\"});\n</script>","text/plain":"alt.Chart(...)"},"metadata":{}}]},{"cell_type":"markdown","source":"The distribution of labels between the train and dev sets is about the same.","metadata":{}},{"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer\ntfidf = TfidfVectorizer()\nxt = tfidf.fit_transform(train.comment_text)\nxd = tfidf.transform(test.comment_text)","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:20:55.157054Z","iopub.execute_input":"2021-06-17T06:20:55.157324Z","iopub.status.idle":"2021-06-17T06:21:20.565989Z","shell.execute_reply.started":"2021-06-17T06:20:55.157298Z","shell.execute_reply":"2021-06-17T06:21:20.564925Z"},"trusted":true},"execution_count":16,"outputs":[]},{"cell_type":"markdown","source":"This is a multi-label classification problem, where each example can have any number of the given labels. So let's set targets for every single one and train a classifier for each target separately. 
","metadata":{}},{"cell_type":"code","source":"yt1, yd1 = train.toxic, test_labels.toxic\nyt2, yd2 = train.severe_toxic, test_labels.severe_toxic\nyt3, yd3 = train.obscene, test_labels.obscene\nyt4, yd4 = train.threat, test_labels.threat\nyt5, yd5 = train.insult, test_labels.insult\nyt6, yd6 = train.identity_hate, test_labels.identity_hate","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:21:20.567269Z","iopub.execute_input":"2021-06-17T06:21:20.567575Z","iopub.status.idle":"2021-06-17T06:21:20.572533Z","shell.execute_reply.started":"2021-06-17T06:21:20.567545Z","shell.execute_reply":"2021-06-17T06:21:20.571855Z"},"trusted":true},"execution_count":17,"outputs":[]},{"cell_type":"code","source":"from sklearn.linear_model import LogisticRegression\n\nlr1 = LogisticRegression(max_iter=200) # doesn't converge at 100 iters\nlr2 = LogisticRegression()\nlr3 = LogisticRegression()\nlr4 = LogisticRegression()\nlr5 = LogisticRegression()\nlr6 = LogisticRegression()\n\nlr1.fit(xt, yt1)\nlr2.fit(xt, yt2)\nlr3.fit(xt, yt3)\nlr4.fit(xt, yt4)\nlr5.fit(xt, yt5)\nlr6.fit(xt, yt6)","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:21:20.573592Z","iopub.execute_input":"2021-06-17T06:21:20.573956Z","iopub.status.idle":"2021-06-17T06:22:28.384752Z","shell.execute_reply.started":"2021-06-17T06:21:20.573919Z","shell.execute_reply":"2021-06-17T06:22:28.383712Z"},"trusted":true},"execution_count":18,"outputs":[{"execution_count":18,"output_type":"execute_result","data":{"text/plain":"LogisticRegression()"},"metadata":{}}]},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score\n\nprint(f'Acc on toxic: {accuracy_score(lr1.predict(xd), yd1)}')\nprint(f'Acc on severe_toxic: {accuracy_score(lr2.predict(xd), yd2)}')\nprint(f'Acc on obscene: {accuracy_score(lr3.predict(xd), yd3)}')\nprint(f'Acc on threat: {accuracy_score(lr4.predict(xd), yd4)}')\nprint(f'Acc on insult: {accuracy_score(lr5.predict(xd), yd5)}')\nprint(f'Acc on identity_hate: 
{accuracy_score(lr6.predict(xd), yd6)}')","metadata":{"execution":{"iopub.status.busy":"2021-06-17T06:22:28.386226Z","iopub.execute_input":"2021-06-17T06:22:28.386648Z","iopub.status.idle":"2021-06-17T06:22:28.626211Z","shell.execute_reply.started":"2021-06-17T06:22:28.386591Z","shell.execute_reply":"2021-06-17T06:22:28.625118Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Acc on toxic: 0.8568593141991591\nAcc on severe_toxic: 0.9915254237288136\nAcc on obscene: 0.9175328406152882\nAcc on threat: 0.9979172651536915\nAcc on insult: 0.9340184377529968\nAcc on identity_hate: 0.9916951764122117\n","output_type":"stream"}]},{"cell_type":"markdown","source":"We can guess that the `toxic` class is more difficult merely because the variance of terms is likely high. Not bad for TF-IDF.","metadata":{}}]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment