Python HDFS + Parquet (hdfs3, PyArrow + libhdfs, HdfsCLI + Knox)
##################################################################
## Native HDFS access via hdfs3 (only works on the cluster)
# conda install -c conda-forge libhdfs3=2.3.0=1 hdfs3 --yes
import hdfs3
import pandas as pd
import pyarrow.parquet as pq

nameNodeHost = 'hadoopnn1.localdomain'
nameNodeIPCPort = 8020
hdfs = hdfs3.HDFileSystem(nameNodeHost, port=nameNodeIPCPort)
# list a remote folder
hdfs.ls('/user/hdfs', detail=False)
# read a CSV into pandas through a file-like object
with hdfs.open('/user/hdfs/user_stats.log') as f:
    df = pd.read_csv(f, nrows=1000)
# read a Parquet file the same way
with hdfs.open('/user/hdfs/user_loc_data.parquet') as f:
    table = pq.read_table(f)
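# A hedged sketch of the reverse direction with hdfs3: writing a
# DataFrame back to HDFS through a writable file object. The target
# path user_stats_sample.csv is hypothetical.
with hdfs.open('/user/hdfs/user_stats_sample.csv', 'wb') as f:
    f.write(df.to_csv(index=False).encode('utf-8'))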
##################################################################
## PyArrow + libhdfs (JNI-based access)
import os
import subprocess

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# find libhdfs.so and point Arrow at its directory
cmd = ["locate", "-l", "1", "libhdfs.so"]
libhdfsso_path = subprocess.check_output(cmd).strip()
os.environ["ARROW_LIBHDFS_DIR"] = os.path.dirname(libhdfsso_path.decode("utf-8"))
# build the Hadoop classpath that libhdfs needs at runtime
cmd = ["/usr/bin/hdfs", "classpath", "--glob"]
hadoop_cp = subprocess.check_output(cmd).strip()
os.environ["CLASSPATH"] = hadoop_cp.decode("utf-8")
nameNodeHost = 'hadoopnn1.localdomain'
nameNodeIPCPort = 8020
fs = pa.hdfs.connect(nameNodeHost, nameNodeIPCPort, user='hdfs')
fs.ls('/user/hdfs/')
# Either use a Python file object as with hdfs3, or define a
# PyArrow dataset (much faster, multi-threaded access)
dataset = pq.ParquetDataset('/user/hdfs/user_loc_data.parquet', filesystem=fs)
table = dataset.read(use_threads=True)
# print some info about the table
table.num_columns
table.num_rows
table.schema
# convert the PyArrow Table to a pandas DataFrame
df = table.to_pandas()
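# Sketch of the write path with the same fs handle: pq.write_table
# accepts a writable file object, so the table can go back to HDFS
# as Parquet. The output path is hypothetical.
with fs.open('/user/hdfs/user_loc_data_copy.parquet', 'wb') as f:
    pq.write_table(table, f)
# Reading a column subset is much cheaper than a full scan; these
# column names are made up for illustration.
# subset = dataset.read(columns=['user_id', 'latitude'], use_threads=True)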
##################################################################
## WebHDFS access with HdfsCLI (the same client also works through
## a Knox gateway if webhdfsUrl points at the gateway instead)
# conda install -c conda-forge python-hdfs
from hdfs import InsecureClient, HdfsError
import pandas as pd

nameNodeHost = 'hadoopnn1.localdomain'
nameNodeHttpPort = 50070
webhdfsUrl = f'http://{nameNodeHost}:{nameNodeHttpPort}'
client = InsecureClient(webhdfsUrl, user='hdfs')
with client.read('/user/hdfs/user_stats.log', encoding='utf-8') as f:
    df = pd.read_csv(f, nrows=1000)
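# Sketch of writing over WebHDFS: with no data argument, client.write
# returns a context manager yielding a writable file object. The
# destination path is hypothetical.
with client.write('/user/hdfs/user_stats_head.csv', encoding='utf-8', overwrite=True) as writer:
    df.to_csv(writer, index=False)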