diff --git a/Unzip/XML/Unzip_Update_Zip_full.ipynb b/Unzip/XML/Unzip_Update_Zip_full.ipynb new file mode 100644 index 0000000..c9880ec --- /dev/null +++ b/Unzip/XML/Unzip_Update_Zip_full.ipynb @@ -0,0 +1,1097 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.611224Z", + "start_time": "2020-11-02T23:03:02.587256Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "from pathlib import Path\n", + "import zipfile\n", + "from xml.etree import ElementTree as ET\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Intro into unzipping\n", + "## With and Without with notation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.643107Z", + "start_time": "2020-11-02T23:03:02.613187Z" + } + }, + "outputs": [], + "source": [ + "# open a zip file for read\n", + "with zipfile.ZipFile(r\"to_process/Test_001_20201027.zip\", 'r') as zip_ref:\n", + " # unzip its content to the temp folder\n", + " zip_ref.extractall(\"temp\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.659102Z", + "start_time": "2020-11-02T23:03:02.646101Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# using with notation ensure that the file is closed after the processing\n", + "zip_ref" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.675054Z", + "start_time": "2020-11-02T23:03:02.661059Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zip_ref = zipfile.ZipFile(r\"to_process/Test_001_20201027.zip\", 'r')\n", + "a = zip_ref.extractall(\"temp\")\n", + "\n", + "# if you only assign the file to the variable, it remains in the memory after processing\n", + "zip_ref" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.690978Z", + "start_time": "2020-11-02T23:03:02.676021Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# unless you intentionally close it\n", + "zip_ref.close()\n", + "zip_ref" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.706940Z", + "start_time": "2020-11-02T23:03:02.691976Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['a.xml']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# no matter the reference is closed, you can see the content of the zip file:\n", + "zip_ref.namelist()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.722895Z", + "start_time": "2020-11-02T23:03:02.712921Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['efg.txt', 'folder_A/a.xml']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# archive containing a file in the folder and a second file\n", + "zipfile.ZipFile(r\"to_process/Folder_001_20201101.zip\", 'r').namelist()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unzip multiple archives in a folder" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.737879Z", + "start_time": "2020-11-02T23:03:02.726883Z" + } + }, + "outputs": [], + "source": [ + "folder = \"to_process\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.752838Z", + "start_time": "2020-11-02T23:03:02.739847Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Folder_001_20201101.zip',\n", + " 'Test_001_20201027.zip',\n", + " 'twoXMLs_001_20201029.zip']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# os.listdir returns a list with all files and folders contained in a folder\n", + "os.listdir(folder)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.768801Z", + "start_time": "2020-11-02T23:03:02.754808Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Folder_001_20201101.zip',\n", + " 'Test_001_20201027.zip',\n", + " 'twoXMLs_001_20201029.zip']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# to get only .zips let's use the list notation and endswith function\n", + "[f for f in os.listdir(folder) if f.endswith(\".zip\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.784761Z", + "start_time": "2020-11-02T23:03:02.769768Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Folder_001_20201101.zip',\n", + " 'Test_001_20201027.zip',\n", + " 'twoXMLs_001_20201029.zip']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# alternativelly you can use os.path's splitext\n", + "[f for f in os.listdir(folder) if os.path.splitext(f)[1] == \".zip\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T22:11:28.156628Z", + "start_time": "2020-11-02T22:11:28.143694Z" + } + }, + "source": [ + "## break down the list comprehension" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.799688Z", + "start_time": "2020-11-02T23:03:02.785726Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Folder_001_20201101.zip',\n", + " 'Test_001_20201027.zip',\n", + " 'twoXMLs_001_20201029.zip']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the list notations is a comprehensive way how to return a list out of a for loop\n", + "\n", + "# initiate an empty list\n", + "output = []\n", + "\n", + "# iterate over the files/subfolders in the folder\n", + "for f in os.listdir(folder):\n", + " \n", + " # if the file/folder ends with a string \".zip\"\n", + " if f.endswith(\".zip\"):\n", + " \n", + " # append it to the output\n", + " output.append(f)\n", + " \n", + "output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unzip all zips in the folder" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.831609Z", + "start_time": "2020-11-02T23:03:02.801682Z" + } + }, + "outputs": [], + "source": [ + "# iterate over the identified zipfiles and unzip them to the temp folder\n", + "for zip_file in [f for f in os.listdir(folder) if f.endswith(\".zip\")]:\n", + " with zipfile.ZipFile(os.path.join(folder,zip_file), 'r') as zip_ref:\n", + " zip_ref.extractall(\"temp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can wrap the unzipping into a function which would squeeze the final code into a one-liner" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.847594Z", + "start_time": "2020-11-02T23:03:02.834597Z" + } + }, + "outputs": [], + "source": [ + "def unzip(folder: str, file: str, folder_to_extract: str) -> list:\n", + " \"\"\"unzips a file in a folder into folder_to_extract\n", + " returns a list of files in the zip archive\"\"\"\n", + " with zipfile.ZipFile(os.path.join(folder,file), 'r') as zip_ref:\n", + " zip_ref.extractall(folder_to_extract)\n", + " return zip_ref.namelist()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.863517Z", + "start_time": "2020-11-02T23:03:02.849555Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[['efg.txt', 'folder_A/a.xml'], ['a.xml'], ['a.xml', 'b.xml']]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# applying a function to the output can be squeezed into the list comprehension\n", + "[unzip(folder, f, \"temp\") for f in os.listdir(folder) if f.endswith(\".zip\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T22:13:51.160353Z", + "start_time": "2020-11-02T22:13:51.149383Z" + } + }, + "source": [ + "# Update unzipped XML" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.879476Z", + "start_time": "2020-11-02T23:03:02.864514Z" + } + }, + "outputs": [], + "source": [ + "folder = \"temp\"\n", + "file = \"a.xml\"\n", + "\n", + "# values to which we update\n", + "new_prefix = \"updated\"\n", + "new_version = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.895431Z", + "start_time": "2020-11-02T23:03:02.881470Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# path to the unzipped xml in the temp folder\n", + "path = os.path.join(folder, file)\n", + "\n", + "# load the xml\n", + "tree = ET.parse(path)\n", + "root = tree.getroot()\n", + "\n", + "# the wrapping tag is the root element of our loaded xml\n", + "root" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.911402Z", + "start_time": "2020-11-02T23:03:02.896429Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'xyz_001_20201029'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# in the find the ... node and show its content (.text)\n", + "id = root.find(\"id\").text\n", + "id" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.927347Z", + "start_time": "2020-11-02T23:03:02.918372Z" + } + }, + "outputs": [], + "source": [ + "# split the id by \"_\" underscore\n", + "split_id = id.split(\"_\")\n", + "\n", + "# update the values\n", + "split_id[0] = new_prefix\n", + "split_id[1] = \"{:03d}\".format(new_version) # prefix with leading zeros up to 3 position if needed\n", + "\n", + "# update the xml content\n", + "root.find(\"id\").text = \"_\".join(split_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.943304Z", + "start_time": "2020-11-02T23:03:02.931337Z" + } + }, + "outputs": [], + "source": [ + "# any variable derived from root contains a reference to the original xml\n", + "# so using tree.write writes the updated contant to the path\n", + "tree.write(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wrap to function" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.958298Z", + "start_time": "2020-11-02T23:03:02.944302Z" + } + }, + "outputs": [], + "source": [ + "def update_id(id: str, new_prefix: str, new_version: int) -> str:\n", + " # split the id by \"_\" underscore\n", + " split_id = id.split(\"_\")\n", + "\n", + " # update the values\n", + " split_id[0] = new_prefix\n", + " split_id[1] = \"{:03d}\".format(new_version) # prefix with leading zeros up to 3 position if needed\n", + " \n", + " return \"_\".join(split_id)\n", + "\n", + "def update_xml(path: str, new_prefix: str, new_version: int) -> None:\n", + " # load the xml\n", + " tree = ET.parse(path)\n", + " root = tree.getroot()\n", + " \n", + " # in the find the ... node and show its content (.text)\n", + " id = root.find(\"id\").text\n", + "\n", + " # update the xml content\n", + " root.find(\"id\").text = update_id(id, new_prefix, new_version)\n", + " \n", + " # any variable derived from root contains a reference to the original xml\n", + " # so using tree.write writes the updated contant to the path\n", + " tree.write(path)\n", + " \n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.973223Z", + "start_time": "2020-11-02T23:03:02.960258Z" + } + }, + "outputs": [], + "source": [ + "update_xml(path, new_prefix, new_version)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Zip to the output folder" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:02.988218Z", + "start_time": "2020-11-02T23:03:02.977215Z" + } + }, + "outputs": [], + "source": [ + "temp_folder = \"temp\"\n", + "output_folder = \"processed\"\n", + "temp_file = \"a.xml\"\n", + "output_file_name= \"updated_003_20201026.zip\"" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.019103Z", + "start_time": "2020-11-02T23:03:03.009132Z" + } + }, + "outputs": [], + "source": [ + "processed_xml = os.path.join(temp_folder, temp_file)\n", + "output_path = os.path.join(output_folder, output_file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.035059Z", + "start_time": "2020-11-02T23:03:03.021098Z" + } + }, + "outputs": [], + "source": [ + "# create the output folder if it doesn't exists\n", + "Path(output_folder).mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.051016Z", + "start_time": "2020-11-02T23:03:03.037054Z" + } + }, + "outputs": [], + "source": [ + "# open archive for writing\n", + "with zipfile.ZipFile(output_path, 'w') as myzip:\n", + " # write our processed xml to it, under it's file name only (not the full path)\n", + " myzip.write(processed_xml, os.path.basename(processed_xml))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.066973Z", + "start_time": "2020-11-02T23:03:03.052013Z" + } + }, + "outputs": [], + "source": [ + "with zipfile.ZipFile(output_path, 'w') as myzip:\n", + " myzip.write(\"temp/a.xml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T22:21:31.028138Z", + "start_time": "2020-11-02T22:21:31.015174Z" + } + }, + "source": [ + "# Delete files in the temp folder" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.082930Z", + "start_time": "2020-11-02T23:03:03.067970Z" + } + }, + "outputs": [], + "source": [ + "# remove files and folders (including content) in the directory \n", + "# https://stackoverflow.com/questions/185936/how-to-delete-the-contents-of-a-folder\n", + "def remove_all(folder):\n", + " \"\"\"Checks content of the folder and if it's a files remove it with os.unlink, \n", + " if it's a folder it's deleted using shutil.rmtree including all content\"\"\"\n", + " for filename in os.listdir(folder):\n", + " file_path = os.path.join(folder, filename)\n", + " try:\n", + " if os.path.isfile(file_path) or os.path.islink(file_path):\n", + " os.unlink(file_path)\n", + " elif os.path.isdir(file_path):\n", + " shutil.rmtree(file_path)\n", + " except Exception as e:\n", + " print('Failed to delete %s. Reason: %s' % (file_path, e))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.098892Z", + "start_time": "2020-11-02T23:03:03.083927Z" + } + }, + "outputs": [], + "source": [ + "remove_all(\"temp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Handle zipped folders\n", + "so far our zip archive contained only the single file. But it can also contain a folder" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.130802Z", + "start_time": "2020-11-02T23:03:03.105909Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['efg.txt', 'folder_A/a.xml']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# open a zip file for read\n", + "archive = \"Folder_001_20201101.zip\"\n", + "unzipped = unzip(\"to_process\", archive, temp_folder)\n", + "unzipped" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.146760Z", + "start_time": "2020-11-02T23:03:03.131812Z" + } + }, + "outputs": [], + "source": [ + "for file in unzipped:\n", + " if file.endswith(\".xml\"):\n", + " path = os.path.join(temp_folder, file)\n", + " update_id(path, new_prefix, new_version)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.162723Z", + "start_time": "2020-11-02T23:03:03.148755Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'updated_003_20201101.zip'" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create a new id based on original archive name\n", + "new_archive_name = update_id(archive, new_prefix, new_version)\n", + "new_archive_name" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.192638Z", + "start_time": "2020-11-02T23:03:03.173692Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "temp\\efg.txt\n", + "temp\\folder_A\\a.xml\n" + ] + } + ], + "source": [ + "# list all files including the subdirectories\n", + "# https://stackoverflow.com/questions/2909975/python-list-directory-subdirectory-and-files\n", + "for path, subdirs, files in os.walk(temp_folder):\n", + " for name in files:\n", + " print(os.path.join(path, name))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.208598Z", + "start_time": "2020-11-02T23:03:03.194632Z" + } + }, + "outputs": [], + "source": [ + "def zipdir(path, output_archive_path):\n", + " with zipfile.ZipFile(output_archive_path, 'w') as ziph:\n", + " # ziph is zipfile handle\n", + " for root, dirs, files in os.walk(path):\n", + " for file in files:\n", + " ziph.write(os.path.join(root, file))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.224555Z", + "start_time": "2020-11-02T23:03:03.209595Z" + } + }, + "outputs": [], + "source": [ + "# unfortunatelly this zips the temp folder as a root directory\n", + "zipdir(temp_folder, os.path.join(output_folder, new_archive_name))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.240513Z", + "start_time": "2020-11-02T23:03:03.225554Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\v.dekanovsky\\\\Python Queries\\\\Medium\\\\Unzip\\\\processed\\\\updated_003_20201101.zip'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# to zip only the content of the temp folder, use the shutil.make_archive\n", + "shutil.make_archive(os.path.join(output_folder, new_archive_name.replace(\".zip\",\"\")), 'zip', temp_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Put it all together\n", + "## Unzip, update and zip XML in a folder using python" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.256472Z", + "start_time": "2020-11-02T23:03:03.241510Z" + } + }, + "outputs": [], + "source": [ + "source_folder = \"to_process\"\n", + "temp_folder = \"temp\"\n", + "output_folder = \"processed\"\n", + "\n", + "# values to which we update\n", + "new_prefix = \"updated\"\n", + "new_version = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.272428Z", + "start_time": "2020-11-02T23:03:03.259463Z" + } + }, + "outputs": [], + "source": [ + "# create the output folder if it doesn't exists\n", + "Path(output_folder).mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.288384Z", + "start_time": "2020-11-02T23:03:03.274442Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Folder_001_20201101.zip',\n", + " 'Test_001_20201027.zip',\n", + " 'twoXMLs_001_20201029.zip']" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "archives_to_process = [f for f in os.listdir(source_folder) if f.endswith(\".zip\")]\n", + "archives_to_process" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-02T23:03:03.349223Z", + "start_time": "2020-11-02T23:03:03.291377Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Folder_001_20201101.zip\n", + "efg.txt\n", + "folder_A/a.xml\n", + "Test_001_20201027.zip\n", + "a.xml\n", + "twoXMLs_001_20201029.zip\n", + "a.xml\n", + "b.xml\n" + ] + } + ], + "source": [ + "for archive in archives_to_process:\n", + " print(archive)\n", + " new_archive_name = update_id(archive, new_prefix, new_version)\n", + " \n", + " archive_name_wihtout_extension = os.path.splitext(archive)[0]\n", + " \n", + " # unzip to temp folder\n", + " extracted = unzip(source_folder, archive, temp_folder)\n", + " \n", + " # iterate over the extracted files\n", + " for extracted_file in extracted:\n", + " print(extracted_file)\n", + " # cover the option that id was not found, because we use the id in naming the output zip\n", + " new_id = update_id(archive, new_prefix, new_version)\n", + " \n", + " # if the extracted file is a xml\n", + " if extracted_file.endswith(\".xml\"):\n", + " update_xml(os.path.join(temp_folder, extracted_file), new_prefix, new_version)\n", + " \n", + " # zip again into the new_id.zip\n", + " shutil.make_archive(os.path.join(output_folder, new_archive_name.replace(\".zip\",\"\")), 'zip', temp_folder)\n", + " \n", + " # remove the extracted files, so that they are not packed to the following archives\n", + " remove_all(temp_folder)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Unzip/XML/to_process/Folder_001_20201101.zip b/Unzip/XML/to_process/Folder_001_20201101.zip new file mode 100644 index 0000000..b676791 Binary files /dev/null and b/Unzip/XML/to_process/Folder_001_20201101.zip differ diff --git a/Unzip/XML/to_process/Test_001_20201027.zip b/Unzip/XML/to_process/Test_001_20201027.zip new file mode 100644 index 0000000..85b37bb Binary files /dev/null and b/Unzip/XML/to_process/Test_001_20201027.zip differ diff --git a/Unzip/XML/to_process/twoXMLs_001_20201029.zip b/Unzip/XML/to_process/twoXMLs_001_20201029.zip new file mode 100644 index 0000000..32d00d0 Binary files /dev/null and b/Unzip/XML/to_process/twoXMLs_001_20201029.zip differ