Recursively listing Data Lake files, with `display` (pretty printing) support
def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in the specified path and
    its subfolders within the maximum recursion depth.
    """
    # List all entries (files and folders) in the path
    li = mssparkutils.fs.ls(path)

    # Return all files
    for x in li:
        if x.size != 0:
            yield x

    # If the max_depth has not been reached, start
    # listing files and folders in subdirectories
    if max_depth > 1:
        for x in li:
            if x.size != 0:
                continue
            for y in deep_ls(x.path, max_depth - 1):
                yield y

    # If max_depth has been reached,
    # return the folders
    else:
        for x in li:
            if x.size == 0:
                yield x


def convertfiles2df(files):
    """
    Converts FileInfo objects into a Pandas DataFrame to enable display
    """
    # Disable Arrow-based transfers since the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    schema = ['path', 'name', 'size']
    df = pd.DataFrame([[getattr(i, j) for j in schema] for i in files], columns=schema).sort_values('path')

    return df


# Example Implementation
# ----------------------
import pandas as pd
from notebookutils import mssparkutils

# Azure storage access info
adls_account_name = 'your-account-name'
adls_container_name = 'your-container-name'
linked_service_name = 'adls-linked-service-name-in-synapse'

# Grab SAS token
adls_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Configure Spark to access from DFS endpoint
root = 'abfss://%s@%s.dfs.core.windows.net/' % (adls_container_name, adls_account_name)
spark.conf.set('fs.azure.sas.%s.%s.dfs.core.windows.net' % (adls_container_name, adls_account_name), adls_sas_token)

# Get files
files = list(deep_ls(root, max_depth=20))

# Display with Pretty Printing
display(convertfiles2df(files))

# Pretty Printing works with default ls as well
display(convertfiles2df(mssparkutils.fs.ls(root)))
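A possible follow-on usage note: because deep_ls yields plain FileInfo objects, the listing can be filtered or aggregated before display. A minimal sketch, assuming the same notebook session and the files list produced above (the .parquet filter is only an illustrative example, not part of the original gist):

# Filter the listing before pretty printing, e.g. to Parquet files only
# (hypothetical filter; any predicate over FileInfo path/name/size works)
parquet_files = [f for f in files if f.name.endswith('.parquet')]
display(convertfiles2df(parquet_files))

# Or sum up the total size of everything that was listed, in bytes
total_bytes = sum(f.size for f in files)
print('Total size listed: %d bytes' % total_bytes)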
@mdrakiburrahman I noticed in the article where you linked this code that it is not a good idea to run recursion on a production Data Lake with a large number of small files. Do you have a recommendation for an alternative approach? I am trying to accomplish what your code does, but at a much larger scale. Any help is greatly appreciated!
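For readers with the same concern, one common alternative is to replace the recursion with an explicit queue and a cap on how many entries are collected, so the traversal is breadth-first, iterative, and can stop early. This is only a sketch under those assumptions, not a recommendation from the gist author; it reuses the gist's size == 0 folder heuristic and the same mssparkutils environment, and the iter_ls name, max_entries cap, and breadth-first ordering are all additions (it also yields folders at every level, unlike deep_ls):

from collections import deque

def iter_ls(path, max_depth=20, max_entries=None):
    """
    Breadth-first listing with an explicit queue instead of recursion.
    Optionally stops after max_entries results to bound work on large lakes.
    """
    queue = deque([(path, 0)])
    count = 0
    while queue:
        current, depth = queue.popleft()
        for entry in mssparkutils.fs.ls(current):
            yield entry
            count += 1
            if max_entries is not None and count >= max_entries:
                return
            # size == 0 is used as the folder marker, matching deep_ls above
            if entry.size == 0 and depth + 1 < max_depth:
                queue.append((entry.path, depth + 1))

# e.g. preview only the first 1,000 entries instead of walking the whole lake
display(convertfiles2df(list(iter_ls(root, max_depth=20, max_entries=1000))))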