dubeyji10 · June 18, 2022 06:52
diff --git a/index_gita_in_es.ipynb b/index_gita_in_es.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Purpose\n",
    "We parse text files of Bhagavadgita slokas and index them in Elasticsearch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "from pathlib import Path\n",
    "import json\n",
    "import os\n",
    "\n",
    "# Elasticsearch endpoint; set the ES_HOST environment variable to target another cluster.\n",
    "# The default is the host this notebook was originally written against.\n",
    "ES_HOST = os.environ.get('ES_HOST', '65.1.165.22')\n",
    "es = Elasticsearch(hosts=[ES_HOST])\n",
    "# name of the index that the cells below write to and query\n",
    "index_name = 'gita'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Helper Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_file(file: Path):\n",
    "    \"\"\"Split a verse file into one record per blank-line-separated block.\n",
    "\n",
    "    Args:\n",
    "        file: path of a UTF-8 text file in which blocks are separated by a blank line.\n",
    "\n",
    "    Returns:\n",
    "        A list of dicts, one per non-empty block, with the keys\n",
    "        'text' (the block as read), 'verse_num' (the stripped text after the\n",
    "        first '॥', up to the next '॥' if any; '' when the block has no '॥')\n",
    "        and 'chapter_num' (the stripped part of verse_num before its first\n",
    "        '.'; '' when verse_num has no '.').\n",
    "    \"\"\"\n",
    "    # the text is non-ASCII, so do not depend on the platform's default encoding\n",
    "    contents = Path(file).read_text(encoding='utf-8')\n",
    "\n",
    "    records = []\n",
    "    for block in contents.split('\\n\\n'):\n",
    "        # trailing or repeated blank lines yield empty blocks; they are not verses\n",
    "        if not block.strip():\n",
    "            continue\n",
    "        parts = block.split(\"॥\")\n",
    "        verse_num = parts[1].strip() if len(parts) > 1 else ''\n",
    "        chapter, dot, _ = verse_num.partition('.')\n",
    "        # a fresh dict per block: a block without a verse number must not\n",
    "        # inherit chapter_num from the block before it\n",
    "        records.append({\n",
    "            'text': block,\n",
    "            'verse_num': verse_num,\n",
    "            'chapter_num': chapter.strip() if dot else '',\n",
    "        })\n",
    "    return records\n",
    "\n",
    "def index_data(datas, document_name):\n",
    "    \"\"\"Index every record of one book into `index_name`, then print the index's doc count.\n",
    "\n",
    "    Args:\n",
    "        datas: iterable of dict records, e.g. the list returned by parse_file.\n",
    "        document_name: value stored in the 'book' field of each indexed document.\n",
    "    \"\"\"\n",
    "    for data in datas:\n",
    "        # index a copy that carries the book name, leaving the caller's records untouched\n",
    "        es.index(index=index_name, document={**data, 'book': document_name})\n",
    "    # documents indexed just now are not counted until the index is refreshed\n",
    "    es.indices.refresh(index=index_name)\n",
    "    print(f'number of docs in index {index_name} - {es.count(index=index_name)[\"count\"]}')\n",
    "    \n",
    "def delete_data(match_dict):\n",
    "    \"\"\"Delete the documents in `index_name` that match `match_dict`.\n",
    "\n",
    "    Args:\n",
    "        match_dict: body of an Elasticsearch `match` query, e.g. {\"book\": \"bhagavadgita\"}.\n",
    "\n",
    "    Returns:\n",
    "        The delete_by_query response, or None when the index does not exist.\n",
    "    \"\"\"\n",
    "    if not es.indices.exists(index=index_name):\n",
    "        return None\n",
    "    # The query is passed directly rather than through `body=`, which the client\n",
    "    # already reports as deprecated for `search`. refresh=True makes the deletions\n",
    "    # visible to the counts and searches that follow.\n",
    "    return es.delete_by_query(\n",
    "        index=index_name,\n",
    "        query={\"match\": match_dict},\n",
    "        refresh=True,\n",
    "    )\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[PosixPath('/Users/achinta/github/gita-anthology/anugita.txt'), PosixPath('/Users/achinta/github/gita-anthology/bhagavadgita.txt')]\n",
      "number of docs in index gita - 1772\n",
      "number of docs in index gita - 1769\n"
     ]
    }
   ],
   "source": [
    "docs_dir = Path.home()/'github/gita-anthology'\n",
    "# sorted so the books are processed in the same order on every run\n",
    "files = sorted(x for x in docs_dir.glob('*.txt') if x.is_file())\n",
    "print(files)\n",
    "\n",
    "# create the index only when it is missing, instead of always sending the\n",
    "# request and ignoring every HTTP 400 it may be answered with\n",
    "if not es.indices.exists(index=index_name):\n",
    "    es.indices.create(index=index_name)\n",
    "\n",
    "# re-index each book: parse it, drop its previously indexed docs, index the new ones\n",
    "for file in files:\n",
    "    datas = parse_file(file)\n",
    "    delete_data({\"book\": file.stem})\n",
    "    index_data(datas, file.stem)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count number of records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1772"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of documents Elasticsearch reports for the index\n",
    "es.count(index=index_name)[\"count\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_index': 'gita', '_type': '_doc', '_id': '1WZ7i30BPKOFx3gmSAqb', '_score': 6.5367517, '_source': {'text': 'yogasthaḥ kuru karmāṇi saṅgaṃ tyaktvā dhanañjaya ।\\nsid\\u200cdhyasid\\u200cdhyoḥ samo bhūtvā samatvaṃ yoga ucyate ॥2.48॥', 'verse_num': '2.48', 'chapter_num': '2', 'book': 'bhagavadgita'}}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/achinta/miniconda3/envs/explorer/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# full-text match on the verse text\n",
    "query_body = {\n",
    "  \"query\": {\n",
    "      \"match\": {\n",
    "          \"text\": \"yoga\"\n",
    "      }\n",
    "  }\n",
    "}\n",
    "# pass the query directly: `body=` is deprecated for the search API\n",
    "res = es.search(index=index_name, query=query_body[\"query\"])\n",
    "ids = [r['_id'] for r in res['hits']['hits']]\n",
    "for doc in res['hits']['hits']:\n",
    "    print(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Purpose\n",
	"We parse text files of Bhagavadgita slokas and index them in Elasticsearch."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"from elasticsearch import Elasticsearch\n",
	"from pathlib import Path\n",
	"import json\n",
	"import os\n",
	"\n",
	"# Elasticsearch endpoint; set the ES_HOST environment variable to target another cluster.\n",
	"# The default is the host this notebook was originally written against.\n",
	"ES_HOST = os.environ.get('ES_HOST', '65.1.165.22')\n",
	"es = Elasticsearch(hosts=[ES_HOST])\n",
	"# name of the index that the cells below write to and query\n",
	"index_name = 'gita'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Helper Methods"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [],
	"source": [
	"def parse_file(file: Path):\n",
	"    \"\"\"Split a verse file into one record per blank-line-separated block.\n",
	"\n",
	"    Args:\n",
	"        file: path of a UTF-8 text file in which blocks are separated by a blank line.\n",
	"\n",
	"    Returns:\n",
	"        A list of dicts, one per non-empty block, with the keys\n",
	"        'text' (the block as read), 'verse_num' (the stripped text after the\n",
	"        first '॥', up to the next '॥' if any; '' when the block has no '॥')\n",
	"        and 'chapter_num' (the stripped part of verse_num before its first\n",
	"        '.'; '' when verse_num has no '.').\n",
	"    \"\"\"\n",
	"    # the text is non-ASCII, so do not depend on the platform's default encoding\n",
	"    contents = Path(file).read_text(encoding='utf-8')\n",
	"\n",
	"    records = []\n",
	"    for block in contents.split('\\n\\n'):\n",
	"        # trailing or repeated blank lines yield empty blocks; they are not verses\n",
	"        if not block.strip():\n",
	"            continue\n",
	"        parts = block.split(\"॥\")\n",
	"        verse_num = parts[1].strip() if len(parts) > 1 else ''\n",
	"        chapter, dot, _ = verse_num.partition('.')\n",
	"        # a fresh dict per block: a block without a verse number must not\n",
	"        # inherit chapter_num from the block before it\n",
	"        records.append({\n",
	"            'text': block,\n",
	"            'verse_num': verse_num,\n",
	"            'chapter_num': chapter.strip() if dot else '',\n",
	"        })\n",
	"    return records\n",
	"\n",
	"def index_data(datas, document_name):\n",
	"    \"\"\"Index every record of one book into `index_name`, then print the index's doc count.\n",
	"\n",
	"    Args:\n",
	"        datas: iterable of dict records, e.g. the list returned by parse_file.\n",
	"        document_name: value stored in the 'book' field of each indexed document.\n",
	"    \"\"\"\n",
	"    for data in datas:\n",
	"        # index a copy that carries the book name, leaving the caller's records untouched\n",
	"        es.index(index=index_name, document={**data, 'book': document_name})\n",
	"    # documents indexed just now are not counted until the index is refreshed\n",
	"    es.indices.refresh(index=index_name)\n",
	"    print(f'number of docs in index {index_name} - {es.count(index=index_name)[\"count\"]}')\n",
	" \n",
	"def delete_data(match_dict):\n",
	"    \"\"\"Delete the documents in `index_name` that match `match_dict`.\n",
	"\n",
	"    Args:\n",
	"        match_dict: body of an Elasticsearch `match` query, e.g. {\"book\": \"bhagavadgita\"}.\n",
	"\n",
	"    Returns:\n",
	"        The delete_by_query response, or None when the index does not exist.\n",
	"    \"\"\"\n",
	"    if not es.indices.exists(index=index_name):\n",
	"        return None\n",
	"    # The query is passed directly rather than through `body=`, which the client\n",
	"    # already reports as deprecated for `search`. refresh=True makes the deletions\n",
	"    # visible to the counts and searches that follow.\n",
	"    return es.delete_by_query(\n",
	"        index=index_name,\n",
	"        query={\"match\": match_dict},\n",
	"        refresh=True,\n",
	"    )\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[PosixPath('/Users/achinta/github/gita-anthology/anugita.txt'), PosixPath('/Users/achinta/github/gita-anthology/bhagavadgita.txt')]\n",
	"number of docs in index gita - 1772\n",
	"number of docs in index gita - 1769\n"
	]
	}
	],
	"source": [
	"docs_dir = Path.home()/'github/gita-anthology'\n",
	"# sorted so the books are processed in the same order on every run\n",
	"files = sorted(x for x in docs_dir.glob('*.txt') if x.is_file())\n",
	"print(files)\n",
	"\n",
	"# create the index only when it is missing, instead of always sending the\n",
	"# request and ignoring every HTTP 400 it may be answered with\n",
	"if not es.indices.exists(index=index_name):\n",
	"    es.indices.create(index=index_name)\n",
	"\n",
	"# re-index each book: parse it, drop its previously indexed docs, index the new ones\n",
	"for file in files:\n",
	"    datas = parse_file(file)\n",
	"    delete_data({\"book\": file.stem})\n",
	"    index_data(datas, file.stem)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Count number of records"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1772"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# number of documents Elasticsearch reports for the index\n",
	"es.count(index=index_name)[\"count\"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Search"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'_index': 'gita', '_type': '_doc', '_id': '1WZ7i30BPKOFx3gmSAqb', '_score': 6.5367517, '_source': {'text': 'yogasthaḥ kuru karmāṇi saṅgaṃ tyaktvā dhanañjaya ।\\nsid\\u200cdhyasid\\u200cdhyoḥ samo bhūtvā samatvaṃ yoga ucyate ॥2.48॥', 'verse_num': '2.48', 'chapter_num': '2', 'book': 'bhagavadgita'}}\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/Users/achinta/miniconda3/envs/explorer/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: The 'body' parameter is deprecated for the 'search' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information\n",
	" \n"
	]
	}
	],
	"source": [
	"# full-text match on the verse text\n",
	"query_body = {\n",
	"  \"query\": {\n",
	"      \"match\": {\n",
	"          \"text\": \"yoga\"\n",
	"      }\n",
	"  }\n",
	"}\n",
	"# pass the query directly: `body=` is deprecated for the search API\n",
	"res = es.search(index=index_name, query=query_body[\"query\"])\n",
	"ids = [r['_id'] for r in res['hits']['hits']]\n",
	"for doc in res['hits']['hits']:\n",
	"    print(doc)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}