Last active
April 18, 2019 22:44
-
-
Save hobbes3/ad162289be8e6368efd94f751eabb212 to your computer and use it in GitHub Desktop.
irs 990 add oneshot onboard multithread multithreading concurrent splunk4good nom on
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# hobbes3
# A way to handle indexing 2+ million XML files in a single directory (synced from an S3 bucket)
import glob
import time
import logging
import logging.handlers
import subprocess
import os
from multiprocessing.dummy import Pool as ThreadPool

# Wall-clock start, used for the elapsed-time report printed at the end of the run.
start_time = time.time()

# Size of the worker thread pool.
THREADS = 16
# Glob pattern selecting the XML files to index.
FILES = "/home/splunk/irs_990/data/*.xml"

# Shared write-only sink for child-process output; stays open for the life of the script.
FNULL = open(os.devnull, 'w')

logger = logging.getLogger('logger_debug')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
# The thread name is part of the format so interleaved lines can be attributed to a worker.
ch.setFormatter(logging.Formatter("[%(levelname)s] (%(threadName)-10s) %(message)s"))
logger.addHandler(ch)

# Expand the pattern once, up front; each matching path becomes one unit of work.
forms = glob.glob(FILES)
def nom_on(form):
    """Index one file into Splunk through the CLI and wait for it to finish.

    ``form`` is the path of the file to index. It is passed to the CLI as a
    single argument of its own, so a path containing whitespace reaches the
    CLI intact instead of being split into several arguments. The child's
    stdout and stderr are both sent to the module-level ``FNULL`` sink.

    Returns the exit status of the ``splunk`` process; a non-zero status is
    also logged as a warning.
    """
    logger.info("Indexing %s", form)
    # NOTE(review): "nom on" is presumably a CLI alias for "add oneshot" -- confirm.
    command = [
        "/opt/splunk/bin/splunk", "nom", "on", form,
        "-index", "irs_990",
        "-sourcetype", "irs_990",
    ]
    process = subprocess.Popen(command, stdout=FNULL, stderr=subprocess.STDOUT)
    # wait() blocks this worker thread until the CLI exits and yields its exit status.
    returncode = process.wait()
    if returncode != 0:
        logger.warning("Indexing %s failed with exit status %s", form, returncode)
    return returncode
# Thread-pool pattern from http://stackoverflow.com/a/28463266/1150923
pool = ThreadPool(THREADS)
try:
    # map() blocks until nom_on has been called for every path in forms.
    results = pool.map(nom_on, forms)
finally:
    # Stop accepting work and wait for the workers, even if map() raised.
    pool.close()
    pool.join()

# The parenthesised single-argument form prints the same text on Python 2 and 3.
print("--- %s seconds ---" % (time.time() - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment