Skip to content

Instantly share code, notes, and snippets.

@kusal1990
Created June 24, 2022 07:50
Show Gist options
  • Save kusal1990/c73e6672bc68db821457bbc4c9689de3 to your computer and use it in GitHub Desktop.
Save kusal1990/c73e6672bc68db821457bbc4c9689de3 to your computer and use it in GitHub Desktop.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import pandas as pd
import numpy as np
import re
# Module-level Elasticsearch client pointed at localhost:9200; index_batch()
# reads this global when it issues its bulk request.
es = Elasticsearch([{'host':'localhost', 'port':9200}])
# Open handle on the corpus text file (decoded as UTF-8); it is consumed by the
# index_data() call at the bottom of the script.
corpus2 = open('drive/MyDrive/ARC/ARC_Corpus.txt', 'r', encoding='utf-8')
def create_index(es_client):
    """Create (or recreate) the `corpus2` Elasticsearch index.

    If an index named `corpus2` already exists it is deleted first, then a
    fresh one is created with 2 shards, 1 replica and a single `body` text
    field (dynamic mapping and `_source` enabled).

    Args:
        es_client: Client object exposing `indices.exists`, `indices.delete`
            and `indices.create`.

    Returns:
        True if the index was created, False if any step raised an
        `Exception` (the error message is printed).
    """
    is_created = False
    # Index settings
    settings = {
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        "mappings": {
            "dynamic": "true",
            "_source": {
                "enabled": "true"
            },
            "properties": {
                "body": {
                    "type": "text"
                }
            }
        }
    }
    print('Creating `corpus2` index...')
    try:
        # Pass the index name by keyword: newer clients reject positional use.
        if es_client.indices.exists(index='corpus2'):
            es_client.indices.delete(index='corpus2', ignore=[404])
        es_client.indices.create(index='corpus2', body=settings)
        is_created = True
        print('index `corpus2` created successfully.')
    except Exception as ex:
        print(str(ex))
    # No `return` inside a `finally` clause: that would silently discard
    # KeyboardInterrupt / SystemExit raised while talking to the cluster.
    return is_created
from elasticsearch.helpers import bulk
def index_batch(docs):
    """Index one batch of documents into `corpus2` with a single bulk request.

    Each document is copied into a new action dict carrying the bulk metadata
    (`_op_type` = "index", `_index` = 'corpus2'), so the caller's dicts are
    left unmodified.

    Args:
        docs: Iterable of dicts holding the document fields (the caller in
            this file supplies a single 'body' field per document).
    """
    # Build fresh action dicts rather than aliasing and mutating the input.
    actions = [
        {**doc, "_op_type": "index", "_index": 'corpus2'}
        for doc in docs
    ]
    # Uses the module-level client `es`.
    bulk(es, actions)
def index_data(es_client, data, BATCH_SIZE=100000):
    """Index every item of `data` into `corpus2`, one document per item.

    Items are wrapped as {'body': item} and handed to index_batch() in groups
    of BATCH_SIZE; a running total is printed after each group. Any leftover
    partial group is flushed at the end, then the index is refreshed.

    Args:
        es_client: Client used for the final `indices.refresh` call.
        data: Iterable of items to index (one document each).
        BATCH_SIZE: Number of documents sent per index_batch() call.
    """
    pending = []
    total = 0
    for total, text in enumerate(data, start=1):
        pending.append({'body': text})
        if total % BATCH_SIZE:
            # Batch not full yet; keep accumulating.
            continue
        index_batch(pending)
        pending = []
        print('Indexed {} documents.'.format(total))
    # Flush the final, partially filled batch.
    if pending:
        index_batch(pending)
        print('Indexed {} documents.'.format(total))
    es_client.indices.refresh(index='corpus2')
    print("Done indexing.")
# Stream the corpus line by line instead of loading it all with readlines()
# (iteration yields the same lines), and close the file handle once indexing
# finishes or fails.
with corpus2:
    index_data(es, corpus2, BATCH_SIZE=100000)
@kusal1990
Copy link
Author

ok

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment