ryanmiville · August 2, 2023 02:06
diff --git a/pyspark_utils.py b/pyspark_utils.py
 from pyspark.sql import SparkSession

 def most_recent_path(path: str, spark: SparkSession) -> str:
 	"""Return the most recent entry of a directory using the Hadoop file system.

 	"Most recent" is taken to mean the entry whose full path sorts last
 	lexicographically. In practice, this is used to get the most recent
 	partition in a table (presumably partition names sort chronologically,
 	e.g. ISO dates; verify against the table layout).

 	Args:
 		path: URI of the directory to list.
 		spark: Active session whose JVM gateway is used to reach Hadoop.

 	Returns:
 		The string form of the greatest child path under ``path``.

 	Raises:
 		IndexError: If the directory has no entries.
 	"""
 	sc = spark.sparkContext

 	URI = sc._jvm.java.net.URI
 	Path = sc._jvm.org.apache.hadoop.fs.Path
 	FileSystem = sc._jvm.org.apache.hadoop.fs.FileSystem
 	Configuration = sc._jvm.org.apache.hadoop.conf.Configuration

 	# NOTE(review): this builds a default Hadoop Configuration rather than
 	# reusing the session's Hadoop settings; confirm that is intended.
 	fs = FileSystem.get(URI(path), Configuration())
 	statuses = fs.listStatus(Path(f"{path}/"))

 	# listStatus does not guarantee any ordering, so taking the last element
 	# of the listing is not reliable; pick the greatest path explicitly.
 	children = [status.getPath().toString() for status in statuses]
 	if not children:
 		raise IndexError(f"no entries found under {path!r}")
 	return max(children)
	from pyspark.sql import SparkSession

	def most_recent_path(path: str, spark: SparkSession) -> str:
		"""Return the most recent entry of a directory using the Hadoop file system.

		"Most recent" is taken to mean the entry whose full path sorts last
		lexicographically. In practice, this is used to get the most recent
		partition in a table (presumably partition names sort chronologically,
		e.g. ISO dates; verify against the table layout).

		Args:
			path: URI of the directory to list.
			spark: Active session whose JVM gateway is used to reach Hadoop.

		Returns:
			The string form of the greatest child path under ``path``.

		Raises:
			IndexError: If the directory has no entries.
		"""
		sc = spark.sparkContext

		URI = sc._jvm.java.net.URI
		Path = sc._jvm.org.apache.hadoop.fs.Path
		FileSystem = sc._jvm.org.apache.hadoop.fs.FileSystem
		Configuration = sc._jvm.org.apache.hadoop.conf.Configuration

		# NOTE(review): this builds a default Hadoop Configuration rather than
		# reusing the session's Hadoop settings; confirm that is intended.
		fs = FileSystem.get(URI(path), Configuration())
		statuses = fs.listStatus(Path(f"{path}/"))

		# listStatus does not guarantee any ordering, so taking the last element
		# of the listing is not reliable; pick the greatest path explicitly.
		children = [status.getPath().toString() for status in statuses]
		if not children:
			raise IndexError(f"no entries found under {path!r}")
		return max(children)