-
-
Save dubeyji10/7ea33f67023c5813dbe1592fc9803e41 to your computer and use it in GitHub Desktop.
Elastic Search functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Purpose\n", | |
"We parse txt files of Gita slokas (e.g. the Bhagavadgita) and index them in Elasticsearch." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from elasticsearch import Elasticsearch\n", | |
"from pathlib import Path\n", | |
"import json\n", | |
"# Elasticsearch client shared by the helper functions and cells below.\n", | |
"# NOTE(review): the host address is hard-coded; consider making it configurable.\n", | |
"es = Elasticsearch(hosts=['65.1.165.22'])\n", | |
"# Name of the index that the helpers write to, delete from and count.\n", | |
"index_name = 'gita'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Helper Methods" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parse_file(file: Path):\n", | |
" # read file contents\n", | |
" with open(file.as_posix(),'r') as f:\n", | |
" lines = f.read()\n", | |
" \n", | |
" # parse each file\n", | |
" datas = []\n", | |
" data = {}\n", | |
" for line in lines.split('\\n\\n'):\n", | |
" data['text'] = line\n", | |
" words = line.split(\"॥\")\n", | |
" if len(words) > 1:\n", | |
" data['verse_num'] = words[1].strip() if words[1].strip() else ''\n", | |
" else:\n", | |
" data['verse_num'] = ''\n", | |
" if len(data['verse_num'].split('.')) > 1:\n", | |
" chapter_num = data['verse_num'].split('.')[0]\n", | |
" data['chapter_num'] = chapter_num.strip() if chapter_num else ''\n", | |
" datas.append(data.copy())\n", | |
" return datas\n", | |
"\n", | |
"def index_data(datas, document_name):\n", | |
" for idx, data in enumerate(datas):\n", | |
" data['book'] = document_name\n", | |
" res = es.index(index=index_name, document=data)\n", | |
"# if idx%50 == 0:\n", | |
"# print(f'indexed {idx} docs')\n", | |
" print(f'number of docs in index {index_name} - {es.count(index=index_name)[\"count\"]}')\n", | |
" \n", | |
"def delete_data(match_dict):\n", | |
" if not es.indices.exists(index=index_name):\n", | |
" return\n", | |
" query_body = {\n", | |
" \"query\": {\n", | |
" \"match\": match_dict\n", | |
" }\n", | |
" }\n", | |
" res = es.delete_by_query(index=index_name, body=query_body)\n", | |
" return res\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[PosixPath('/Users/achinta/github/gita-anthology/anugita.txt'), PosixPath('/Users/achinta/github/gita-anthology/bhagavadgita.txt')]\n", | |
"number of docs in index gita - 1772\n", | |
"number of docs in index gita - 1769\n" | |
] | |
} | |
], | |
"source": [ | |
"# Collect every .txt file directly inside the anthology directory under the home folder.\n", | |
"docs_dir = Path.home()/'github/gita-anthology'\n", | |
"files = [x for x in docs_dir.glob('*.txt') if x.is_file()]\n", | |
"print(files)\n", | |
"\n", | |
"# create the index before indexing into it\n", | |
"# NOTE(review): presumably ignore=400 tolerates the 'index already exists' response on re-runs - confirm\n", | |
"es.indices.create(index=index_name, ignore=400)\n", | |
"for file in files:\n", | |
" # parse the file into one record per stanza\n", | |
" datas = parse_file(file)\n", | |
"\n", | |
" # delete existing data for file (documents whose 'book' field matches the file stem)\n", | |
" delete_data({\"book\": file.stem})\n", | |
"\n", | |
" # index new data, tagged with the file stem as the book name\n", | |
" index_data(datas, file.stem)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Count number of records" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1772" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Number of documents the cluster currently reports for the index.\n", | |
"es.count(index=index_name)[\"count\"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'_index': 'gita', '_type': '_doc', '_id': '1WZ7i30BPKOFx3gmSAqb', '_score': 6.5367517, '_source': {'text': 'yogasthaḥ kuru karmāṇi saṅgaṃ tyaktvā dhanañjaya ।\\nsid\\u200cdhyasid\\u200cdhyoḥ samo bhūtvā samatvaṃ yoga ucyate ॥2.48॥', 'verse_num': '2.48', 'chapter_num': '2', 'book': 'bhagavadgita'}}\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/Users/achinta/miniconda3/envs/explorer/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n", | |
" \n" | |
] | |
} | |
], | |
"source": [ | |
"query_body = {\n", | |
" \"query\": {\n", | |
" \"match\": {\n", | |
" \"text\": \"yoga\"\n", | |
" }\n", | |
" }\n", | |
"}\n", | |
"res = es.search(index=index_name, body=query_body)\n", | |
"ids = [r['_id'] for r in res['hits']['hits']]\n", | |
"# print(f\"Got Hits: {len(res['hits']['hits'])}\")\n", | |
"for doc in res['hits']['hits']:\n", | |
" print(doc)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment