Skip to content

Instantly share code, notes, and snippets.

@mbonaci
Created April 12, 2016 19:52
Show Gist options
  • Save mbonaci/1de414db1d0a783feb36d2f07e37f80d to your computer and use it in GitHub Desktop.
Save mbonaci/1de414db1d0a783feb36d2f07e37f80d to your computer and use it in GitHub Desktop.
Elasticsearch export by time range using Python scroll API
#!/usr/bin/env python
# encoding: utf-8
'''
Elasticsearch export using Python ES client's scroll feature
@author: mbonaci
@copyright: 2016 Sematext. All rights reserved.
@license: Apache 2.0
@contact: [email protected]
'''
import sys
import os
import json
from optparse import OptionParser
from elasticsearch import Elasticsearch
# Public API of this module: empty, so a star-import exports nothing.
__all__ = []
# Module version. NOTE(review): stored as a float, while main() builds its own
# "v0.1" version string and does not read this value.
__version__ = 0.1
# Creation date and last-update date of the script; main() embeds __updated__
# in the string it passes to OptionParser as the program version.
__date__ = '2016-04-07'
__updated__ = '2016-04-07'
# Development switches consulted by the `if __name__ == "__main__"` guard.
# DEBUG is only referenced from commented-out code in that guard.
DEBUG = 0
# Truthy: run doctest.testmod() before main().
TESTRUN = 0
# Truthy: run main() under cProfile, write the stats to profile_stats.txt and exit.
PROFILE = 0
def _build_option_parser(version_string, description, epilog):
    '''Create the command-line parser for the export tool.

    The three arguments are handed to OptionParser as ``version``,
    ``description`` and ``epilog``. The returned parser has every export
    option registered and its defaults set.
    '''
    parser = OptionParser(version=version_string, epilog=epilog, description=description)
    parser.add_option("--host", dest="host", help="set ES host name [default: %default]")
    # type="int" makes optparse reject a non-numeric port and keeps a value
    # given on the command line the same type as the integer default below.
    parser.add_option("-p", "--port", dest="port", type="int", help="set ES port [default: %default]")
    parser.add_option("-t", "--token", dest="token", help="set Logsene app token")
    parser.add_option("-s", "--start", dest="start", help="set starting timestamp (format: yyyy-MM-ddTHH:mm:ss[Z+03:00])")
    parser.add_option("-e", "--end", dest="end", help="set ending timestamp (format: yyyy-MM-ddTHH:mm:ss[Z+03:00])")
    parser.add_option("-o", "--out", dest="outfile", help="set output path [default: %default]", metavar="FILE")
    parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")
    parser.set_defaults(outfile="./out.txt", verbose=1, host="es.api.hostname.or.ip.address", port=443)
    return parser


def _export_hits(es, opts):
    '''Scroll through all matching documents and dump every page to opts.outfile.

    Searches the indices matching ``<token>_2*`` for documents whose
    ``@timestamp`` is >= opts.start and < opts.end, 200 hits per page with a
    one-minute scroll keep-alive, and writes each page's ``hits.hits`` list
    with json.dump.

    NOTE: the output format is unchanged from the original script: the pages
    are written back to back as separate JSON arrays (ending with the empty
    page that terminates the scroll), so the file as a whole is not a single
    JSON document.
    '''
    keep_alive = '1m'
    query = {
        "query": {
            "bool": {
                "filter": {
                    "range": {
                        "@timestamp": {
                            "gte": opts.start,
                            "lt": opts.end,
                        }
                    }
                },
                "must": {
                    "match_all": {}
                },
            }
        }
    }
    # Initialize the scroll
    page = es.search(index=opts.token + '_2*', scroll=keep_alive, size=200, body=query)
    sid = page['_scroll_id']
    # Drive the loop from the hits actually returned instead of hits.total,
    # which is not a plain integer on newer Elasticsearch versions.
    hits = page['hits']['hits']
    # Text mode: json.dump writes str, which a binary-mode file rejects on
    # Python 3. The with-block closes the file even when a scroll call fails.
    with open(opts.outfile, 'w') as out:
        json.dump(hits, out)
        while hits:
            print("Scrolling...")
            page = es.scroll(scroll=keep_alive, body={"scroll": keep_alive, "scroll_id": sid})
            # Always continue with the most recently returned scroll id; the
            # server may hand out a new one with every response.
            sid = page.get('_scroll_id', sid)
            hits = page['hits']['hits']
            print("scroll size: " + str(len(hits)) + "\n")
            json.dump(hits, out)


def main(argv=None):
    '''Parse the command line and export the selected time range from Elasticsearch.

    argv: list of command-line arguments without the program name; defaults
        to sys.argv[1:] when None.

    Returns 0 on success and 2 when any exception is raised while parsing
    options or exporting (the error is reported on stderr). Invalid options
    and --help/--version still leave through optparse's own SystemExit.
    '''
    program_name = os.path.basename(sys.argv[0])
    program_version = "v0.1"
    program_build_date = "%s" % __updated__
    program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
    program_longdesc = '''Elasticsearch export using Python ES client's scroll feature'''
    program_license = (
        "Copyright 2016 mbonaci (Sematext) "
        "Licensed under the Apache License 2.0\n"
        "http://www.apache.org/licenses/LICENSE-2.0"
    )
    if argv is None:
        argv = sys.argv[1:]
    try:
        parser = _build_option_parser(program_version_string, program_license, program_longdesc)
        (opts, args) = parser.parse_args(argv)
        print(args)
        # Echo every option that has a truthy value.
        for name in ("host", "port", "token", "start", "end", "outfile"):
            value = getattr(opts, name)
            if value:
                print("%s = %s" % (name, value))
        if opts.verbose > 0:
            print("verbosity level = %d" % opts.verbose)
        # The token is the index-name prefix; without it the search below
        # would fail with an opaque TypeError (None + str).
        if not opts.token:
            raise ValueError("missing required option: -t/--token")
        es = Elasticsearch([
            {'host': opts.host, 'port': opts.port, 'use_ssl': True}
        ])
        _export_hits(es, opts)
    except Exception as e:
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + " for help use --help\n")
        return 2
    return 0
# Script entry point: optionally run doctests and/or profile main(), then
# exit with main()'s return value as the process status.
if __name__ == "__main__":
    #if DEBUG:
    #    sys.argv.append("-h")
    if TESTRUN:
        import doctest
        doctest.testmod()
    if PROFILE:
        import cProfile
        import pstats
        profile_filename = '_profile.txt'
        # Raw profiler data goes to profile_filename; the readable report
        # built from it goes to profile_stats.txt.
        cProfile.run('main()', profile_filename)
        # Text mode: pstats prints str to the stream, which a binary-mode file
        # rejects on Python 3. The with-block closes the file even if
        # building or printing the report raises.
        with open("profile_stats.txt", "w") as statsfile:
            p = pstats.Stats(profile_filename, stream=statsfile)
            stats = p.strip_dirs().sort_stats('cumulative')
            stats.print_stats()
        sys.exit(0)
    sys.exit(main())
@mbonaci
Copy link
Author

mbonaci commented Apr 12, 2016

Prerequisite: install the Elasticsearch Python client with `pip install elasticsearch`.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment