```python
def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in the specified path and
    its subfolders, up to the maximum recursion depth.
    """
    # List everything in the current path
    li = mssparkutils.fs.ls(path)

    # Yield all files (folders report a size of 0)
    for x in li:
        if x.size != 0:
            yield x

    # If the max_depth has not been reached, recurse into
    # each subdirectory and yield its contents
    if max_depth > 1:
        for x in li:
            if x.size != 0:
                continue
            for y in deep_ls(x.path, max_depth - 1):
                yield y
    # If max_depth has been reached, yield the folders themselves
    else:
        for x in li:
            if x.size == 0:
                yield x


def convertfiles2df(files):
    """
    Converts FileInfo objects into a Pandas DataFrame to enable display.
    """
    # Disable Arrow-based transfers since the Pandas DataFrame is tiny
    spark.conf.set("spark.sql.execution.arrow.enabled", "false")

    schema = ['path', 'name', 'size']
    df = pd.DataFrame([[getattr(i, j) for j in schema] for i in files],
                      columns=schema).sort_values('path')
    return df


# Example Implementation
# ----------------------
import pandas as pd
from notebookutils import mssparkutils

# Azure storage access info
adls_account_name = 'your-account-name'
adls_container_name = 'your-container-name'
linked_service_name = 'adls-linked-service-name-in-synapse'

# Grab SAS token via the Synapse linked service
adls_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Configure Spark to access the storage account from the DFS endpoint
root = 'abfss://%s@%s.dfs.core.windows.net/' % (adls_container_name, adls_account_name)
spark.conf.set('fs.azure.sas.%s.%s.dfs.core.windows.net' % (adls_container_name, adls_account_name), adls_sas_token)

# Get files
files = list(deep_ls(root, max_depth=20))

# Display with pretty printing
display(convertfiles2df(files))

# Pretty printing works with the default ls as well
display(convertfiles2df(mssparkutils.fs.ls(root)))
```
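Since convertfiles2df returns an ordinary Pandas DataFrame, the listing can be filtered like any other frame. A small hypothetical extension, reusing the `files` variable from above, that narrows the output to Parquet files:

```python
# Filter the listing down to Parquet files (hypothetical extension,
# reusing 'files' and convertfiles2df from the example above)
df = convertfiles2df(files)
parquet_df = df[df['name'].str.endswith('.parquet')]
display(parquet_df)
```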
This is brilliant! Something I've been struggling with.
+1
Glad it helped!
This is perfect. Thank you!
Thank you! This helped.
Are the 'if x.size == 0' lines solely for differentiating between files and folders? If so, each FileInfo object in the list returned by mssparkutils.fs.ls() has a bunch of "hidden" attributes (i.e. not revealed by print(FileInfo)), including 'isDir' (boolean) which indicates whether or not the item is a directory. If it's possible to have files with sizes of zero, then this would be more reliable. Discovered that here.
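For anyone who wants to try that, here is a minimal sketch of deep_ls keyed off isDir instead of size. Note that isDir is an undocumented FileInfo attribute, so confirm it exists in your runtime before relying on it; the depth-limit behavior mirrors the original:

```python
def deep_ls_isdir(path: str, max_depth=1):
    """
    Variant of deep_ls that uses the (undocumented) FileInfo.isDir
    attribute to tell files from folders, so zero-byte files are
    still treated as files.
    """
    li = mssparkutils.fs.ls(path)
    # Always yield files, even empty ones
    for x in li:
        if not x.isDir:
            yield x
    for x in li:
        if x.isDir:
            if max_depth > 1:
                # Recurse into subfolders until max_depth is exhausted
                yield from deep_ls_isdir(x.path, max_depth - 1)
            else:
                # At the depth limit, yield the folder itself
                yield x
```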
@mdrakiburrahman I noticed in the article where you linked this code that it is not a good idea to run recursion on a production data lake with a large number of small files. Do you have a recommendation for an alternative approach? I am trying to accomplish what your code does, but at a larger scale. Any help is greatly appreciated!
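In case it helps, one direction to try (a sketch, not the author's recommendation) is a queue-based rewrite. It avoids deep Python recursion on heavily nested trees, but it still issues one mssparkutils.fs.ls() call per folder, so the number of storage requests, which is the actual bottleneck on lakes with many small files, is unchanged:

```python
from collections import deque

def iterative_ls(path: str, max_depth=20):
    """
    Breadth-first, queue-based listing with the same file/folder
    semantics as deep_ls (folders identified by size == 0).
    """
    queue = deque([(path, 1)])
    while queue:
        current, depth = queue.popleft()
        for x in mssparkutils.fs.ls(current):
            if x.size == 0 and depth < max_depth:
                # Folder below the depth limit: descend into it later
                queue.append((x.path, depth + 1))
            else:
                # File, or folder at the depth limit
                yield x
```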