Skip to content

Instantly share code, notes, and snippets.

@alexklibisz
Created May 3, 2018 17:01
Show Gist options
  • Save alexklibisz/f69a084b31408c3898714d446c22e28c to your computer and use it in GitHub Desktop.
Save alexklibisz/f69a084b31408c3898714d446c22e28c to your computer and use it in GitHub Desktop.
elasticsearch index JSON documents
"""Read Amazon review JSON documents (one per line) from disk and insert
them into a local elasticsearch instance.
Json downloads taken from here: http://jmcauley.ucsd.edu/data/amazon/
"""
from tqdm import tqdm
from elasticsearch import Elasticsearch, helpers
from pprint import pprint
from time import time
import json
import pdb
import sys
INDEX_NAME = "amazon_reviews"
DOC_TYPE = "review"

# Index definition sent to Elasticsearch when the index is created.
INDEX_BODY = {
    "mappings": {
        DOC_TYPE: {
            "properties": {
                "reviewerID": {"type": "keyword"},
                "reviewerName": {"type": "keyword"},
                # The source field (unixReviewTime) is a Unix timestamp in
                # seconds. Without an explicit format, a bare number sent to
                # a `date` field is parsed as epoch *milliseconds*, which
                # would place every review in January 1970.
                "timestamp": {"type": "date", "format": "epoch_second"},
                "asin": {"type": "keyword"},
                "score": {"type": "half_float"},
                "reviewText": {"type": "text"},
                # Stored in _source but not searchable.
                "reviewSummary": {"type": "text", "index": False},
            }
        }
    }
}


def review_to_action(review):
    """Convert one parsed review record into a bulk-index action.

    Args:
        review: dict decoded from one line of the input file. Must contain
            the keys ``asin``, ``unixReviewTime``, ``reviewerID``,
            ``summary``, ``reviewText`` and ``overall``; ``reviewerName``
            is optional.

    Returns:
        A dict in the shape expected by ``elasticsearch.helpers.bulk``.

    Raises:
        KeyError: if a required key is missing from ``review``.
    """
    return {
        "_index": INDEX_NAME,
        "_type": DOC_TYPE,
        "_source": {
            "asin": review["asin"],
            "timestamp": review["unixReviewTime"],
            "reviewerID": review["reviewerID"],
            # Only reviewerName is treated as optional; it becomes null.
            "reviewerName": review.get("reviewerName"),
            "reviewSummary": review["summary"],
            "reviewText": review["reviewText"],
            "score": review["overall"],
        },
    }


def index_reviews(es, data_path, batch_size=100000):
    """Stream reviews from ``data_path`` into Elasticsearch in batches.

    Args:
        es: Elasticsearch client used for the bulk requests.
        data_path: path to a file containing one JSON review per line.
        batch_size: number of actions buffered before each bulk request.
    """
    actions = []
    # The context manager guarantees the file is closed even if a line
    # fails to parse or a bulk request raises.
    with open(data_path, encoding="utf-8") as lines:
        for i, line in tqdm(enumerate(lines)):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            actions.append(review_to_action(json.loads(line)))
            if len(actions) >= batch_size:
                t0 = time()
                helpers.bulk(es, actions)
                actions = []
                print("%d: inserted in %d seconds" % (i, time() - t0))
    # Flush whatever is left over after the last full batch.
    if actions:
        helpers.bulk(es, actions)


if __name__ == "__main__":
    es = Elasticsearch()
    data_path = sys.argv[1]
    es.indices.create(index=INDEX_NAME, body=INDEX_BODY)
    index_reviews(es, data_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment