Helps debug connecting PyArrow to Kerberized HDFS. It took a bit of doing to get this working, and the guidance found on the web isn't always helpful: useful error messages don't always bubble up from the driver. This script lets you experiment with drivers, LIBJVM_PATH, LD_LIBRARY_PATH, CLASSPATH, and HADOOP_HOME.
import os
from pprint import pprint

import pyarrow
import sh

# Get an obscure error without this: pyarrow.lib.ArrowIOError: HDFS list directory failed, errno: 2 (No such file or directory)
os.environ['CLASSPATH'] = str(sh.hadoop('classpath', '--glob'))

# Not needed
#os.environ['HADOOP_HOME'] = '/opt/cloudera/parcels/CDH-<your version>/'

DRIVER_PATH = '/opt/cloudera/parcels/CDH-<your version>/lib64'
DRIVER = 'libhdfs'
os.environ['ARROW_LIBHDFS_DIR'] = DRIVER_PATH

USER = 'myuser'

# Not needed
#LIBJVM_PATH = '/usr/java/jdk1.8.0_121/jre/lib/amd64/server'
#os.environ['LD_LIBRARY_PATH'] = ':'.join(filter(None, [os.getenv('LD_LIBRARY_PATH'), LIBJVM_PATH, '/opt/cloudera/parcels/CDH-<your version>/lib64/']))

# Way to test if the lib is accessible
#import ctypes
#ctypes.CDLL('/'.join([LIBJVM_PATH, 'libjvm.so']))

# Suggest you do a kinit just to be sure your ticket is GTG
KERB_TICKET = os.getenv('KRB5CCNAME')
#KERB_TICKET = '/tmp/krb5cc_<specific cache>'
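# Quick sanity check that the ticket cache is actually visible (an assumption
# here: klist is on PATH; uncomment to use):
#print(sh.klist())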
args = {
    'host': 'default',
    'user': USER,
    'kerb_ticket': KERB_TICKET,
    'port': 8020,
    'driver': DRIVER
}

print('ARROW_LIBHDFS_DIR', os.getenv('ARROW_LIBHDFS_DIR'))
# print('HADOOP_HOME', os.getenv('HADOOP_HOME'))
# print('LD_LIBRARY_PATH', os.getenv('LD_LIBRARY_PATH'))
pprint(args)

fs = pyarrow.hdfs.connect(**args)
pprint(fs.ls('/user/myuser'))
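Once the connection succeeds, the same fs handle plugs into the rest of PyArrow. A minimal sketch (the Parquet path below is a made-up example):

import pyarrow.parquet as pq

# Read a Parquet file through the Kerberized HDFS connection
table = pq.read_table('/user/myuser/some_table.parquet', filesystem=fs)
print(table.num_rows)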
Note that if you are trying to get this working with PySpark, make sure you also pass the needed env variables along to the executors... like this:
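A minimal sketch of what that can look like (spark.executorEnv.<name> is Spark's standard mechanism for setting executor environment variables; reusing DRIVER_PATH and the CLASSPATH value from the script above is an assumption):

import os
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName('pyarrow-kerberized-hdfs')
         # Executors need the same native driver location and Hadoop classpath as the driver process
         .config('spark.executorEnv.ARROW_LIBHDFS_DIR', DRIVER_PATH)
         .config('spark.executorEnv.CLASSPATH', os.environ['CLASSPATH'])
         .getOrCreate())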
If you end up needing this you owe me a beer because this was a real pain to figure out ;-)