Last active
August 2, 2023 02:06
-
-
Save ryanmiville/792289d9ebe632d76f4cf3b794cac727 to your computer and use it in GitHub Desktop.
pyspark utilities
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession
def most_recent_path(path: str, spark: SparkSession) -> str:
    """Return the greatest child path of a directory on a Hadoop file system.

    The directory is listed through the Hadoop ``FileSystem`` API reached via
    the Spark JVM gateway, and the entry whose full path string sorts last is
    returned. In practice, this is used to get the most recent partition in a
    table.

    NOTE(review): "most recent" here means "sorts last by name", so it assumes
    entry names sort chronologically (e.g. ``date=2023-08-02``) -- confirm
    against the table layouts this is used with.

    Args:
        path: URI of the directory to list.
        spark: Session whose JVM gateway and Hadoop configuration are used.

    Returns:
        The path of the last entry, as rendered by Hadoop's ``Path.toString()``.

    Raises:
        IndexError: If the directory contains no entries.
    """
    sc = spark.sparkContext
    jvm = sc._jvm
    hadoop_fs = jvm.org.apache.hadoop.fs

    # Use the SparkContext's Hadoop configuration rather than a fresh
    # Configuration(), so that settings passed through Spark (spark.hadoop.*)
    # are applied to the file system as well.
    fs = hadoop_fs.FileSystem.get(
        jvm.java.net.URI(path), sc._jsc.hadoopConfiguration()
    )
    statuses = fs.listStatus(hadoop_fs.Path(f"{path}/"))

    children = [status.getPath().toString() for status in statuses]
    if not children:
        raise IndexError(f"no entries found under {path!r}")

    # listStatus() does not promise a sorted result, so select the greatest
    # path explicitly instead of relying on the position of the last element.
    return max(children)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment