Last active
August 29, 2015 14:10
-
-
Save r-wheeler/0af3622e9e22b4175d4e to your computer and use it in GitHub Desktop.
Python submit Hive Query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import pandas as pd | |
import itertools | |
def query_driver(job, fname=False):
    """Run a Hive query over ssh and return its output as a DataFrame.

    The command executed is ``ssh hadoop hive (-e <query> | -f <script>)``.
    Per the original author's note, you still need to log in and enter your
    kerberos credentials in another ssh window before calling this.

    Args:
        job (str): Either a raw query string to hand to ``hive -e``, or the
            path of a Hive script to hand to ``hive -f``.
        fname (bool): Set to True when ``job`` is a path/to/script.hive.

    Returns:
        pandas.DataFrame: One row per tab-separated output line. The first
        output line is always used as the column names, so the query should
        enable ``hive.cli.print.header``. All values are strings. An empty
        frame is returned when the command prints nothing to stdout.

    Notes:
        * Every result row is read into memory; be careful with how much
          data your query pulls back.
        * The remote command's stderr is written to ``logfile`` in the
          current working directory (overwritten on each call). A non-zero
          exit status is not raised; inspect ``logfile`` if the frame is
          unexpectedly empty.
    """
    # ssh joins its arguments into one string that the *remote* shell parses
    # again, so the query must be shell-quoted. repr() is not shell quoting:
    # it breaks as soon as the query contains a single quote.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    hive_param = "-f" if fname else "-e"
    command = ["ssh", "hadoop", "hive", hive_param, quote(job)]

    with open('logfile', 'w') as logfile:
        proc = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=logfile,
            # Text mode: stdout yields str (not bytes) on Python 3 as well.
            universal_newlines=True,
        )
        try:
            # Nothing is ever sent to the remote side; signal EOF right away.
            proc.stdin.close()
            # Drop the line terminator so the last field is not "value\n".
            rows = [line.rstrip('\n').split('\t') for line in proc.stdout]
        finally:
            proc.stdout.close()
            # Reap the child so it does not linger as a zombie.
            proc.wait()

    header = rows[0] if rows else []
    return pd.DataFrame(rows[1:], columns=header)
# Usage examples
# --------------
# 1) Submit a Hive script stored in a file (fname=True selects `hive -f`).
t_frame = query_driver(job='/home/nmrw48/test.hive', fname=True)

# 2) Submit a raw query string (the default, `hive -e`). The header setting
#    makes Hive print column names as the first output line.
q = (
    'set hive.cli.print.header=true;'
    'describe table1;'
)
q_frame = query_driver(job=q)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment