Created
February 25, 2018 02:50
-
-
Save JnBrymn-EB/ef4e20a5794443b4d981fb7f778fbe6f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#bin/elasticsearch -Dhttp.port=9201 -Dtransport.tcp.port=9301\n", | |
"from collections import Counter\n", | |
"\n", | |
"import pandas as pd\n", | |
"from elasticsearch import Elasticsearch, helpers\n", | |
"es = Elasticsearch('localhost:9201') # 9201!!!\n", | |
"es.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"sample_submission = pd.read_csv('/Users/johnb/Personal/data_science/kaggle/toxic-comment/data/sample_submission.csv')\n", | |
"test = pd.read_csv('/Users/johnb/Personal/data_science/kaggle/toxic-comment/data/test.csv')\n", | |
"train = pd.read_csv('/Users/johnb/Personal/data_science/kaggle/toxic-comment/data/train.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Index" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"es.indices.delete(index, ignore=404)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
index = 'toxic_comments'
doc_type = 'comment'

# The six target labels all share an identical boolean mapping, so generate
# those entries instead of spelling each one out by hand.
label_fields = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

properties = {
    # Raw text, standard-analyzed; copied into a second field for a stemmed view.
    'comment_text': {
        'type': 'string',
        'analyzer': 'standard',
        'copy_to': 'comment_text_english',
    },
    # English-analyzed (stemming + stopwords) copy of the comment text.
    'comment_text_english': {
        'type': 'string',
        'analyzer': 'english',
    },
}
properties.update({label: {'type': 'boolean'} for label in label_fields})

settings = {
    # Single shard, no replicas: keeps scoring statistics consistent on one node.
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 0,
    },
    'mappings': {
        doc_type: {
            '_all': {'enabled': False},
            'properties': properties,
        }
    },
}
es.indices.create(index, body=settings)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def make_action(docs, index, op_type='update', type='event'):\n", | |
" for doc in docs:\n", | |
" if op_type == 'update':\n", | |
" action = {\n", | |
" '_op_type': op_type,\n", | |
" '_index': index,\n", | |
" '_type': type,\n", | |
" 'doc': doc,\n", | |
" 'doc_as_upsert': True\n", | |
" }\n", | |
" elif op_type == 'create':\n", | |
" action = {\n", | |
" '_op_type': op_type,\n", | |
" '_index': index,\n", | |
" '_type': type,\n", | |
" '_source': doc\n", | |
" }\n", | |
" if 'id' in doc:\n", | |
" action['_id'] = doc['id']\n", | |
" yield action\n", | |
" \n", | |
" \n", | |
"def get_training_iterator():\n", | |
" for index, row in train.iterrows():\n", | |
" yield row.to_dict()\n", | |
"\n", | |
"docs = get_training_iterator()\n", | |
"actions = make_action(docs, index, 'create', 'comment')\n", | |
"\n", | |
"details = []\n", | |
"count = 0\n", | |
"for ok, detail in helpers.streaming_bulk(es, actions):\n", | |
" count += 1\n", | |
" if not count % 1000:\n", | |
" print(count)\n", | |
" if not ok:\n", | |
" details.append[detail]\n", | |
"\n", | |
"print(len(details))\n", | |
"\n", | |
"es.indices.forcemerge(index='toxic_comments', max_num_segments=1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Query" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"size = 25\n", | |
"\n", | |
"def make_query(row):\n", | |
" body = {\n", | |
" 'query': {\n", | |
" 'multi_match': {\n", | |
" 'query': row['comment_text'],\n", | |
" 'fields': ['comment_text', 'comment_text_english'],\n", | |
" 'type': 'best_fields',\n", | |
" }\n", | |
" },\n", | |
" 'size': size,\n", | |
" 'fields': ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],\n", | |
" }\n", | |
" return body\n", | |
"\n", | |
"def process_response(row, resp):\n", | |
" if 'hits' not in resp or resp['hits']['total'] == 0:\n", | |
" print('no hits for {}'.format(row['id']))\n", | |
" print(resp)\n", | |
" answer = dict(zip(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], [0,0,0,0,0,0]))\n", | |
" answer['id'] = row['id']\n", | |
" return answer\n", | |
" hits = [hit['fields'] for hit in resp['hits']['hits']]\n", | |
" counter = Counter()\n", | |
" for hit in hits:\n", | |
" for k,v in hit.items():\n", | |
" hit[k] = v[0]\n", | |
" counter.update(hit)\n", | |
" for k,v in counter.items():\n", | |
" counter[k] /= size\n", | |
" answer = dict(counter)\n", | |
" answer['id'] = row['id']\n", | |
" return answer\n", | |
"\n", | |
"def get_test_iterator():\n", | |
" for index, row in test.iterrows():\n", | |
" yield row.to_dict()\n", | |
" \n", | |
"def get_batch_iterator(row_iterator, batch_size):\n", | |
" items = []\n", | |
" for item in row_iterator:\n", | |
" items.append(item)\n", | |
" if len(items) == batch_size:\n", | |
" yield items\n", | |
" items = []\n", | |
" yield items\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"test_iterator = get_test_iterator()\n", | |
"\n", | |
"batch_size = 13*7\n", | |
"batch_count = 0\n", | |
"report_count = 1000\n", | |
"report_batch = int(report_count/batch_size)\n", | |
"\n", | |
"answers = []\n", | |
"for batch in get_batch_iterator(test_iterator, batch_size):\n", | |
" headers = [{'index': index, 'type': doc_type} for _ in range(batch_size)]\n", | |
" queries = [make_query(row) for row in batch]\n", | |
" msearch_request = []\n", | |
"\n", | |
" for header, query in zip(headers, queries):\n", | |
" msearch_request.append(header)\n", | |
" msearch_request.append(query)\n", | |
"\n", | |
" responses = es.msearch(msearch_request)['responses']\n", | |
"\n", | |
" answers.extend([process_response(row, resp) for row, resp in zip(batch, responses)])\n", | |
" \n", | |
" batch_count += 1\n", | |
" if not batch_count % report_batch:\n", | |
" print(batch_count*batch_size)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"answers = pd.DataFrame(answers).reindex_axis(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"answers.to_csv('../first_submission.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Parameters to explore\n", | |
"* k (in k-NN) and k might be different per target dimenstion\n", | |
"* boosting different fields\n", | |
"* different analysis chains - like keeping CAPS and dropping very uncommon words (which score highly)\n", | |
"* multimatch type\n", | |
"* do something about punctuation and about repeditivity of text" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment