Created
June 24, 2022 07:50
-
-
Save kusal1990/c73e6672bc68db821457bbc4c9689de3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from elasticsearch import Elasticsearch | |
from elasticsearch.helpers import parallel_bulk | |
import pandas as pd | |
import numpy as np | |
import re | |
# Module-level Elasticsearch client pointed at a node on localhost:9200.
# index_batch() uses this global directly.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Read-only, UTF-8 text handle on the ARC corpus file; the script iterates
# its lines and indexes each one as a document.
# NOTE(review): this handle is opened at import time and is not closed here.
corpus2 = open('drive/MyDrive/ARC/ARC_Corpus.txt', mode='r', encoding='utf-8')
def create_index(es_client):
    """Create the ``corpus2`` index, replacing any existing index of that name.

    Args:
        es_client: Elasticsearch client (or compatible object) exposing
            ``indices.exists``, ``indices.delete`` and ``indices.create``.

    Returns:
        bool: ``True`` when the index was created; ``False`` when any
        ``Exception`` was raised along the way (its message is printed).

    Exceptions that are not ``Exception`` subclasses (for example
    ``KeyboardInterrupt``) propagate to the caller instead of being
    silently turned into a ``False`` return value.
    """
    # Index settings: two primary shards, one replica, and a single
    # full-text `body` field with the original source retained.
    index_settings = {
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1,
        },
        "mappings": {
            "dynamic": "true",
            "_source": {
                "enabled": "true",
            },
            "properties": {
                "body": {
                    "type": "text",
                },
            },
        },
    }

    print('Creating `corpus2` index...')
    try:
        # Pass the index name by keyword: newer elasticsearch-py clients
        # reject positional arguments, and older ones accept the keyword.
        if es_client.indices.exists(index='corpus2'):
            # ignore=[404] tolerates the index vanishing between the
            # existence check and the delete.
            es_client.indices.delete(index='corpus2', ignore=[404])
        es_client.indices.create(index='corpus2', body=index_settings)
    except Exception as ex:
        # Best effort: report the failure and tell the caller via the result.
        print(str(ex))
        return False

    print('index `corpus2` created successfully.')
    return True
from elasticsearch.helpers import bulk | |
def index_batch(docs):
    """Index a batch of documents into the ``corpus2`` index in one bulk call.

    Each document is sent as an ``index`` action through the ``bulk`` helper,
    using the module-level ``es`` client. The input dicts are copied rather
    than modified, so the caller's documents do not gain ``_op_type`` or
    ``_index`` keys.

    Args:
        docs: Iterable of dicts, each containing at least a ``'body'`` key.
            Any other keys are forwarded unchanged as part of the action.

    Raises:
        KeyError: If a document has no ``'body'`` key.
    """
    actions = [
        {
            **doc,
            "_op_type": "index",
            "_index": 'corpus2',
            # Explicit lookup so a document without 'body' fails fast here.
            "body": doc['body'],
        }
        for doc in docs
    ]
    bulk(es, actions)
def index_data(es_client, data, BATCH_SIZE=100000):
    """Index every item of ``data`` into ``corpus2`` in fixed-size batches.

    Each item becomes a document of the form ``{'body': item}``. Documents
    are handed to ``index_batch`` whenever ``BATCH_SIZE`` of them have been
    collected, and once more at the end for any remainder. The index is then
    refreshed through ``es_client``.

    Args:
        es_client: Client used for the final ``indices.refresh`` call.
        data: Iterable of items to index (one document per item).
        BATCH_SIZE: Number of documents per bulk request.

    NOTE(review): the batches themselves are sent by ``index_batch``, which
    does not receive ``es_client`` — confirm both refer to the same cluster.
    """
    pending = []
    total = 0
    for total, text in enumerate(data, start=1):
        pending.append({'body': text})
        if total % BATCH_SIZE != 0:
            continue
        # A full batch has accumulated: flush it and start a new one.
        index_batch(pending)
        pending = []
        print('Indexed {} documents.'.format(total))

    # Flush whatever is left over after the last full batch.
    if pending:
        index_batch(pending)
        print('Indexed {} documents.'.format(total))

    es_client.indices.refresh(index='corpus2')
    print("Done indexing.")
# Index the corpus one line per document. The file handle is passed directly
# so lines are streamed instead of being loaded all at once with readlines(),
# and the handle is closed whether or not indexing succeeds.
# NOTE(review): create_index(es) is not invoked in the visible code — confirm
# the `corpus2` index is set up elsewhere before this runs.
try:
    index_data(es, corpus2, BATCH_SIZE=100000)
finally:
    corpus2.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ok