Skip to content

Instantly share code, notes, and snippets.

@dubeyji10
Forked from achinta/index_gita_in_es.ipynb
Created June 18, 2022 06:52
Show Gist options
  • Save dubeyji10/7ea33f67023c5813dbe1592fc9803e41 to your computer and use it in GitHub Desktop.
Save dubeyji10/7ea33f67023c5813dbe1592fc9803e41 to your computer and use it in GitHub Desktop.
Elastic Search functions
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Purpose\n",
"We parse plain-text files of Bhagavadgita slokas and index them in Elasticsearch."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"from elasticsearch import Elasticsearch\n",
"\n",
"# Elasticsearch endpoint. Set the ES_HOST environment variable to point the\n",
"# notebook at a different cluster without editing this cell; the default is\n",
"# the address the notebook was originally written against.\n",
"es = Elasticsearch(hosts=[os.environ.get('ES_HOST', '65.1.165.22')])\n",
"\n",
"# Name of the index every cell below reads from and writes to.\n",
"index_name = 'gita'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Helper Methods"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def parse_file(file: Path):\n",
"    \"\"\"Split a text file into one record per stanza.\n",
"\n",
"    Stanzas are the chunks of the file separated by a blank line. The verse\n",
"    number is the text that follows the first double danda (up to the next\n",
"    one), e.g. '2.48' in '...॥2.48॥'. The chapter number is the part of the\n",
"    verse number before its first dot.\n",
"\n",
"    Args:\n",
"        file: path of the text file; it is read as UTF-8.\n",
"\n",
"    Returns:\n",
"        A list of dicts with the keys 'text', 'verse_num' and 'chapter_num'.\n",
"        'verse_num' is '' when the stanza has no double danda, and\n",
"        'chapter_num' is '' when the verse number contains no dot.\n",
"    \"\"\"\n",
"    # Explicit encoding: the text contains dandas and diacritics, so relying on\n",
"    # the platform default would break wherever that default is not UTF-8.\n",
"    contents = file.read_text(encoding='utf-8')\n",
"\n",
"    datas = []\n",
"    for stanza in contents.split('\\n\\n'):\n",
"        # Skip blank chunks (e.g. after the final stanza) so that no empty\n",
"        # documents end up in the index.\n",
"        if not stanza.strip():\n",
"            continue\n",
"        parts = stanza.split('॥')\n",
"        verse_num = parts[1].strip() if len(parts) > 1 else ''\n",
"        chapter_num = verse_num.split('.')[0].strip() if '.' in verse_num else ''\n",
"        # A fresh dict per stanza: reusing one dict would let a stanza without\n",
"        # a verse number inherit the chapter of the stanza before it.\n",
"        datas.append({'text': stanza, 'verse_num': verse_num, 'chapter_num': chapter_num})\n",
"    return datas\n",
"\n",
"\n",
"def index_data(datas, document_name):\n",
"    \"\"\"Index every record into `index_name` and print the index size.\n",
"\n",
"    Args:\n",
"        datas: list of dict records, e.g. the output of parse_file. Each dict\n",
"            is modified in place: a 'book' key is added.\n",
"        document_name: value stored under each record's 'book' key.\n",
"    \"\"\"\n",
"    for data in datas:\n",
"        data['book'] = document_name\n",
"        es.index(index=index_name, document=data)\n",
"\n",
"    # Freshly indexed documents only become visible to count/search after a\n",
"    # refresh, so refresh first to report an up-to-date total.\n",
"    es.indices.refresh(index=index_name)\n",
"    print(f'number of docs in index {index_name} - {es.count(index=index_name)[\"count\"]}')\n",
"\n",
"\n",
"def delete_data(match_dict):\n",
"    \"\"\"Delete the documents in `index_name` that satisfy a match query.\n",
"\n",
"    Args:\n",
"        match_dict: field-to-value mapping used as the body of the `match`\n",
"            clause, e.g. {\"book\": \"bhagavadgita\"}.\n",
"\n",
"    Returns:\n",
"        The delete_by_query response, or None when the index does not exist.\n",
"    \"\"\"\n",
"    # Nothing to delete (and delete_by_query would fail) before the index exists.\n",
"    if not es.indices.exists(index=index_name):\n",
"        return None\n",
"    query_body = {\"query\": {\"match\": match_dict}}\n",
"    return es.delete_by_query(index=index_name, body=query_body)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[PosixPath('/Users/achinta/github/gita-anthology/anugita.txt'), PosixPath('/Users/achinta/github/gita-anthology/bhagavadgita.txt')]\n",
"number of docs in index gita - 1772\n",
"number of docs in index gita - 1769\n"
]
}
],
"source": [
"docs_dir = Path.home()/'github/gita-anthology'\n",
"# Sorted so the files are listed and processed in the same order on every run.\n",
"files = sorted(x for x in docs_dir.glob('*.txt') if x.is_file())\n",
"print(files)\n",
"\n",
"# Create the index on the first run only. An explicit existence check is used\n",
"# instead of ignore=400 so that unrelated HTTP 400 errors are not silenced.\n",
"if not es.indices.exists(index=index_name):\n",
"    es.indices.create(index=index_name)\n",
"\n",
"for file in files:\n",
"    # parse file\n",
"    datas = parse_file(file)\n",
"\n",
"    # delete existing data for file\n",
"    delete_data({\"book\": file.stem})\n",
"\n",
"    # index new data\n",
"    index_data(datas, file.stem)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Count number of records"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1772"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ask the cluster for the index size and display it as the cell result.\n",
"count_response = es.count(index=index_name)\n",
"count_response[\"count\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Search"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_index': 'gita', '_type': '_doc', '_id': '1WZ7i30BPKOFx3gmSAqb', '_score': 6.5367517, '_source': {'text': 'yogasthaḥ kuru karmāṇi saṅgaṃ tyaktvā dhanañjaya ।\\nsid\\u200cdhyasid\\u200cdhyoḥ samo bhūtvā samatvaṃ yoga ucyate ॥2.48॥', 'verse_num': '2.48', 'chapter_num': '2', 'book': 'bhagavadgita'}}\n"
]
}
],
"source": [
"# Full-text match on the verse text.\n",
"query = {\"match\": {\"text\": \"yoga\"}}\n",
"\n",
"# The query is passed as a keyword argument because the `body` parameter of\n",
"# `search` is deprecated in the client.\n",
"res = es.search(index=index_name, query=query)\n",
"hits = res['hits']['hits']\n",
"ids = [hit['_id'] for hit in hits]\n",
"for doc in hits:\n",
"    print(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment