Last active
October 28, 2020 21:45
-
-
Save vaclavdekanovsky/8867d276227b50244050f58f874075a9 to your computer and use it in GitHub Desktop.
Unzip a group of zip files in a folder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.716281Z", | |
| "start_time": "2020-10-28T21:45:03.706272Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import zipfile" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.731240Z", | |
| "start_time": "2020-10-28T21:45:03.718241Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "folder = \"to_process\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.763126Z", | |
| "start_time": "2020-10-28T21:45:03.733200Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['a.xml', 'Test_001_20201027.zip', 'xyz_001_20201029.zip']" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# os.listdir returns a list with all files and folders contained in a folder\n", | |
| "os.listdir(folder)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.779079Z", | |
| "start_time": "2020-10-28T21:45:03.767110Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['Test_001_20201027.zip', 'xyz_001_20201029.zip']" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# to get only .zips let's use a list comprehension and the endswith method\n", | |
| "[f for f in os.listdir(folder) if f.endswith(\".zip\")]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.795061Z", | |
| "start_time": "2020-10-28T21:45:03.781074Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['Test_001_20201027.zip', 'xyz_001_20201029.zip']" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# alternatively you can use os.path's splitext\n", | |
| "[f for f in os.listdir(folder) if os.path.splitext(f)[1] == \".zip\"]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Break down the list comprehension" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.810998Z", | |
| "start_time": "2020-10-28T21:45:03.797054Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['Test_001_20201027.zip', 'xyz_001_20201029.zip']" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# a list comprehension is a concise way to build a list out of a for loop\n", | |
| "\n", | |
| "# initialize an empty list\n", | |
| "output = []\n", | |
| "\n", | |
| "# iterate over the files/subfolders in the folder\n", | |
| "for f in os.listdir(folder):\n", | |
| " \n", | |
| " # if the file/folder ends with a string \".zip\"\n", | |
| " if f.endswith(\".zip\"):\n", | |
| " \n", | |
| " # append it to the output\n", | |
| " output.append(f)\n", | |
| " \n", | |
| "output" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Unzip all zips in the folder" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.842908Z", | |
| "start_time": "2020-10-28T21:45:03.820968Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# iterate over the identified zipfiles and unzip them\n", | |
| "for zip_file in [f for f in os.listdir(folder) if f.endswith(\".zip\")]:\n", | |
| " with zipfile.ZipFile(os.path.join(folder,zip_file), 'r') as zip_ref:\n", | |
| " zip_ref.extractall(\"temp\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "You can wrap the unzipping in a function, which squeezes the final code into a one-liner" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.858896Z", | |
| "start_time": "2020-10-28T21:45:03.845900Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def unzip(folder: str, file: str, folder_to_extract: str) -> list:\n", | |
| " \"\"\"unzips a file in a folder into folder_to_extract\n", | |
| " returns a list of files in the zip archive\"\"\"\n", | |
| " with zipfile.ZipFile(os.path.join(folder,file), 'r') as zip_ref:\n", | |
| " zip_ref.extractall(folder_to_extract)\n", | |
| " return zip_ref.namelist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2020-10-28T21:45:03.874821Z", | |
| "start_time": "2020-10-28T21:45:03.860859Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[['a.xml'], ['a.xml']]" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# applying a function to the output can be squeezed into the list comprehension\n", | |
| "[unzip(folder, f, \"temp\") for f in os.listdir(folder) if f.endswith(\".zip\")]" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.4" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| }, | |
| "varInspector": { | |
| "cols": { | |
| "lenName": 16, | |
| "lenType": 16, | |
| "lenVar": 40 | |
| }, | |
| "kernels_config": { | |
| "python": { | |
| "delete_cmd_postfix": "", | |
| "delete_cmd_prefix": "del ", | |
| "library": "var_list.py", | |
| "varRefreshCmd": "print(var_dic_list())" | |
| }, | |
| "r": { | |
| "delete_cmd_postfix": ") ", | |
| "delete_cmd_prefix": "rm(", | |
| "library": "var_list.r", | |
| "varRefreshCmd": "cat(var_dic_list()) " | |
| } | |
| }, | |
| "types_to_exclude": [ | |
| "module", | |
| "function", | |
| "builtin_function_or_method", | |
| "instance", | |
| "_Feature" | |
| ], | |
| "window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment