Modder4869 · June 21, 2023 20:55 · Matsko3 · Jun 21, 2023 · Modder4869 · Jun 21, 2023
diff --git a/zip_file_download.py b/zip_file_download.py
 import requests
 import io
 import struct
 import zipfile
 import binascii
 import re
 import sys
 import os
 from tqdm import tqdm
 #copied from https://betterprogramming.pub/how-to-know-zip-content-without-downloading-it-87a5b30be20a
 EOCD_RECORD_SIZE = 22
 ZIP64_EOCD_RECORD_SIZE = 56
 ZIP64_EOCD_LOCATOR_SIZE = 20

 MAX_STANDARD_ZIP_SIZE = 4_294_967_295


 def retrieve_zip_content(url, regex_pattern, output_folder):
     """Download the members of a remote zip whose names match a pattern.

     Reads the archive's directory records from ``url`` and then downloads
     each member whose name matches ``regex_pattern``.

     Args:
         url: Location of the remote zip archive.
         regex_pattern: Regular expression searched case-insensitively in
             each member name.
         output_folder: Directory that receives the downloaded members.
     """
     # extract_matching_files() looks the download URL up in the module-level
     # name ``zipUrl``, which is otherwise bound only when this file runs as a
     # script.  Binding it here makes the function usable after an import and
     # guarantees the download uses the same ``url`` as the directory listing.
     global zipUrl
     zipUrl = url

     remote_archive = get_zip_file(url)
     extract_matching_files(remote_archive, regex_pattern, output_folder)


 def get_zip_file(url):
     """Open the directory of a remote zip without downloading member data.

     Only the end-of-central-directory (EOCD) record(s) and the central
     directory are fetched with ranged requests; those bytes are handed to
     ``zipfile.ZipFile``.  The in-memory file contains no member data, so the
     result is only useful for listing entries.

     The ZIP64 records are read when the remote file is larger than
     ``MAX_STANDARD_ZIP_SIZE``.  The EOCD record is expected to occupy the last
     ``EOCD_RECORD_SIZE`` bytes of the file, so archives with a trailing
     comment are rejected.

     Args:
         url: Location of the remote archive.

     Returns:
         A ``zipfile.ZipFile`` backed by the downloaded directory records.

     Raises:
         zipfile.BadZipFile: If the file is shorter than an EOCD record or an
             expected record signature is missing.
     """
     file_size = get_file_size(url)
     if file_size < EOCD_RECORD_SIZE:
         # A smaller file cannot hold an EOCD record and would produce a
         # negative range start below.
         raise zipfile.BadZipFile(
             f"remote file is only {file_size} bytes, too small for a zip archive"
         )

     eocd_record = fetch(url, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE)
     if eocd_record[:4] != b"PK\x05\x06":
         # Without this check the offsets parsed below would be garbage.
         raise zipfile.BadZipFile(
             "end-of-central-directory record not found at the end of the file "
             "(archives with a trailing comment are not supported)"
         )

     if file_size <= MAX_STANDARD_ZIP_SIZE:
         cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
         central_directory = fetch(url, cd_start, cd_size)
         return zipfile.ZipFile(io.BytesIO(central_directory + eocd_record))

     # ZIP64 layout at the end of the file:
     #   [zip64 EOCD record][zip64 EOCD locator][EOCD record]
     locator_start = file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE)
     zip64_eocd_record = fetch(
         url,
         locator_start - ZIP64_EOCD_RECORD_SIZE,
         ZIP64_EOCD_RECORD_SIZE,
     )
     if zip64_eocd_record[:4] != b"PK\x06\x06":
         raise zipfile.BadZipFile(
             "ZIP64 end-of-central-directory record not found at the expected offset"
         )
     zip64_eocd_locator = fetch(url, locator_start, ZIP64_EOCD_LOCATOR_SIZE)

     cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
     central_directory = fetch(url, cd_start, cd_size)
     return zipfile.ZipFile(
         io.BytesIO(central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record)
     )


 def get_file_size(url):
     """Return the size in bytes of the resource at ``url``.

     The size is taken from the ``Content-Length`` header of a HEAD response.

     Args:
         url: Location of the remote file.

     Returns:
         The content length as an ``int``.

     Raises:
         ValueError: If the final response status is not 200, or the
             ``Content-Length`` header is missing or not an integer.
     """
     # requests.head() does not follow redirects unless asked to, while the
     # GET requests used for the actual download do; follow them here as well
     # so that both see the same resource.
     response = requests.head(url, allow_redirects=True)
     if response.status_code != 200:
         raise ValueError("Error retrieving file size:", response.status_code)

     content_length = response.headers.get("Content-Length")
     if content_length is None:
         # int(None) would raise a confusing TypeError.
         raise ValueError("Error retrieving file size: no Content-Length header")
     return int(content_length)


 def fetch(url, start, length):
     """Return ``length`` bytes of the resource at ``url`` beginning at ``start``.

     Args:
         url: Location of the remote file.
         start: Zero-based offset of the first byte wanted.
         length: Number of bytes wanted; zero or less yields ``b""`` without
             contacting the server.

     Returns:
         The requested bytes.

     Raises:
         requests.HTTPError: If the server answers with a 4xx/5xx status.
     """
     if length <= 0:
         # "bytes=N-(N-1)" is not a valid range (e.g. an empty central
         # directory), so there is nothing to ask the server for.
         return b""

     end = start + length - 1
     headers = {"Range": f"bytes={start}-{end}"}
     response = requests.get(url, headers=headers)
     response.raise_for_status()

     if response.status_code == 206:
         return response.content
     # Any other success status means the server ignored the Range header and
     # sent the whole resource; cut the requested window out of it.
     return response.content[start:end + 1]


 def get_central_directory_metadata_from_eocd(eocd):
     """Read the central directory location from an EOCD record.

     Bytes 12-15 hold the central directory size and bytes 16-19 its start
     offset.  Both are decoded as unsigned little-endian 32-bit integers so
     that offsets of 2 GiB and above do not come out negative.

     Args:
         eocd: The end-of-central-directory record (at least 20 bytes).

     Returns:
         A ``(cd_start, cd_size)`` tuple.

     Raises:
         struct.error: If ``eocd`` is too short.
     """
     cd_size, cd_start = struct.unpack_from("<II", eocd, 12)
     return cd_start, cd_size


 def get_central_directory_metadata_from_eocd64(eocd64):
     """Read the central directory location from a ZIP64 EOCD record.

     Bytes 40-47 hold the central directory size and bytes 48-55 its start
     offset, both decoded as unsigned little-endian 64-bit integers.

     Args:
         eocd64: The ZIP64 end-of-central-directory record (at least 56 bytes).

     Returns:
         A ``(cd_start, cd_size)`` tuple.

     Raises:
         struct.error: If ``eocd64`` is too short.
     """
     cd_size, cd_start = struct.unpack_from("<QQ", eocd64, 40)
     return cd_start, cd_size


 def parse_little_endian_to_int(little_endian_bytes):
     """Decode a 4- or 8-byte little-endian unsigned integer.

     Four bytes are read as an unsigned 32-bit value; any other length is
     treated as an unsigned 64-bit value.  Unsigned formats are used because
     the callers decode sizes and offsets, which would otherwise turn negative
     once the top bit is set.

     Args:
         little_endian_bytes: The raw bytes to decode.

     Returns:
         The decoded non-negative ``int``.

     Raises:
         struct.error: If the input is neither 4 nor 8 bytes long.
     """
     format_character = "I" if len(little_endian_bytes) == 4 else "Q"
     return struct.unpack("<" + format_character, little_endian_bytes)[0]


 def extract_matching_files(zip_file, regex_pattern, output_folder, url=None):
     """Download the raw record of every member whose name matches the pattern.

     For each non-directory entry of ``zip_file`` whose name matches
     ``regex_pattern`` (case-insensitive search), the member's local file
     header, name, extra field and still-compressed data are fetched with a
     ranged request and written unchanged to
     ``<output_folder>/<member base name>.zip``.  No central directory is
     appended to the written file.  Members sharing a base name overwrite each
     other.  ``zip_file`` is closed before returning, also when an error is
     raised.

     Args:
         zip_file: ``zipfile.ZipFile`` built from directory records only, as
             returned by ``get_zip_file``.
         regex_pattern: Pattern passed to ``re.compile``.
         output_folder: Directory for the downloaded files; created on demand.
         url: Location of the remote archive.  When omitted, the module-level
             ``zipUrl`` bound by the command-line entry point is used.

     Raises:
         ValueError: If no URL is available, or the server does not answer the
             ranged download with HTTP 206.
         zipfile.BadZipFile: If no local file header is found at the computed
             offset.
         requests.HTTPError: If the download request fails with 4xx/5xx.
     """
     try:
         if url is None:
             # Older callers relied on the global bound by the __main__ block.
             url = globals().get("zipUrl")
             if url is None:
                 raise ValueError("no archive URL: pass url= or set zipUrl")

         entries = zip_file.filelist
         if not entries:  # empty archive: nothing to match or to index
             return

         # The ZipFile was built without the member data, so the header offsets
         # it reports are not absolute file positions.  As in the original
         # code, the archive's first local header is assumed to sit at byte 0
         # of the remote file; subtracting the smallest reported offset then
         # yields absolute positions.
         base_offset = min(entry.header_offset for entry in entries)
         pattern = re.compile(regex_pattern, re.IGNORECASE)

         for zi in entries:
             if zi.is_dir() or not pattern.search(zi.filename):
                 continue

             file_name = zi.filename.split("/")[-1]
             print(f"downloading {file_name}")
             output_path = os.path.join(output_folder, f"{file_name}.zip")
             parent = os.path.dirname(output_path)
             if parent:
                 os.makedirs(parent, exist_ok=True)

             record_offset = zi.header_offset - base_offset
             size = _local_record_size(url, record_offset, zi.compress_size)
             headers = {"Range": f"bytes={record_offset}-{record_offset + size - 1}"}

             with requests.get(url, stream=True, headers=headers) as response:
                 response.raise_for_status()
                 if response.status_code != 206:
                     # A 200 here would stream the entire archive to disk.
                     raise ValueError(
                         f"server ignored the Range request (HTTP {response.status_code})"
                     )
                 # Download with progress bar and speed indicator
                 with open(output_path, "wb") as f, tqdm(
                     total=size, unit="B", unit_scale=True, unit_divisor=1024, ncols=80
                 ) as progress_bar:
                     for data in response.iter_content(chunk_size=8192):
                         progress_bar.update(len(data))
                         f.write(data)

             print(f"Extracted: {output_path}")
     finally:
         zip_file.close()


 def _local_record_size(url, record_offset, compress_size):
     """Return the length of a member's local header, name, extra field and data."""
     # The fixed part of a local file header is 30 bytes; the name and extra
     # field lengths are stored at offsets 26 and 28.  The extra field of the
     # local header may differ from the central directory's, so it is read from
     # the remote file rather than derived from the directory entry.
     header = fetch(url, record_offset, 30)
     if len(header) != 30 or header[:4] != b"PK\x03\x04":
         raise zipfile.BadZipFile(
             f"no local file header at offset {record_offset} of the remote archive"
         )
     name_length, extra_length = struct.unpack_from("<HH", header, 26)
     return 30 + name_length + extra_length + compress_size
 if __name__ == "__main__":
     # Command-line entry point: exactly three positional arguments are required.
     if len(sys.argv[1:]) != 3:
         print("Usage: python script.py <zip_url> <regex_pattern> <output_folder>")
         sys.exit(1)

     # ``zipUrl`` has to stay a module-level name spelled exactly like this:
     # extract_matching_files() reads it as a global.
     zipUrl, regexPattern, outputFolder = sys.argv[1:]
     retrieve_zip_content(zipUrl, regexPattern, outputFolder)
	import requests
	import io
	import struct
	import zipfile
	import binascii
	import re
	import sys
	import os
	from tqdm import tqdm
	#copied from https://betterprogramming.pub/how-to-know-zip-content-without-downloading-it-87a5b30be20a
	EOCD_RECORD_SIZE = 22
	ZIP64_EOCD_RECORD_SIZE = 56
	ZIP64_EOCD_LOCATOR_SIZE = 20

	MAX_STANDARD_ZIP_SIZE = 4_294_967_295


	def retrieve_zip_content(url, regex_pattern, output_folder):
	    """Download the members of a remote zip whose names match a pattern.

	    Reads the archive's directory records from ``url`` and then downloads
	    each member whose name matches ``regex_pattern``.

	    Args:
	        url: Location of the remote zip archive.
	        regex_pattern: Regular expression searched case-insensitively in
	            each member name.
	        output_folder: Directory that receives the downloaded members.
	    """
	    # extract_matching_files() looks the download URL up in the module-level
	    # name ``zipUrl``, which is otherwise bound only when this file runs as a
	    # script.  Binding it here makes the function usable after an import and
	    # guarantees the download uses the same ``url`` as the directory listing.
	    global zipUrl
	    zipUrl = url

	    remote_archive = get_zip_file(url)
	    extract_matching_files(remote_archive, regex_pattern, output_folder)


	def get_zip_file(url):
	    """Open the directory of a remote zip without downloading member data.

	    Only the end-of-central-directory (EOCD) record(s) and the central
	    directory are fetched with ranged requests; those bytes are handed to
	    ``zipfile.ZipFile``.  The in-memory file contains no member data, so the
	    result is only useful for listing entries.

	    The ZIP64 records are read when the remote file is larger than
	    ``MAX_STANDARD_ZIP_SIZE``.  The EOCD record is expected to occupy the last
	    ``EOCD_RECORD_SIZE`` bytes of the file, so archives with a trailing
	    comment are rejected.

	    Args:
	        url: Location of the remote archive.

	    Returns:
	        A ``zipfile.ZipFile`` backed by the downloaded directory records.

	    Raises:
	        zipfile.BadZipFile: If the file is shorter than an EOCD record or an
	            expected record signature is missing.
	    """
	    file_size = get_file_size(url)
	    if file_size < EOCD_RECORD_SIZE:
	        # A smaller file cannot hold an EOCD record and would produce a
	        # negative range start below.
	        raise zipfile.BadZipFile(
	            f"remote file is only {file_size} bytes, too small for a zip archive"
	        )

	    eocd_record = fetch(url, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE)
	    if eocd_record[:4] != b"PK\x05\x06":
	        # Without this check the offsets parsed below would be garbage.
	        raise zipfile.BadZipFile(
	            "end-of-central-directory record not found at the end of the file "
	            "(archives with a trailing comment are not supported)"
	        )

	    if file_size <= MAX_STANDARD_ZIP_SIZE:
	        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
	        central_directory = fetch(url, cd_start, cd_size)
	        return zipfile.ZipFile(io.BytesIO(central_directory + eocd_record))

	    # ZIP64 layout at the end of the file:
	    #   [zip64 EOCD record][zip64 EOCD locator][EOCD record]
	    locator_start = file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE)
	    zip64_eocd_record = fetch(
	        url,
	        locator_start - ZIP64_EOCD_RECORD_SIZE,
	        ZIP64_EOCD_RECORD_SIZE,
	    )
	    if zip64_eocd_record[:4] != b"PK\x06\x06":
	        raise zipfile.BadZipFile(
	            "ZIP64 end-of-central-directory record not found at the expected offset"
	        )
	    zip64_eocd_locator = fetch(url, locator_start, ZIP64_EOCD_LOCATOR_SIZE)

	    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
	    central_directory = fetch(url, cd_start, cd_size)
	    return zipfile.ZipFile(
	        io.BytesIO(central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record)
	    )


	def get_file_size(url):
	    """Return the size in bytes of the resource at ``url``.

	    The size is taken from the ``Content-Length`` header of a HEAD response.

	    Args:
	        url: Location of the remote file.

	    Returns:
	        The content length as an ``int``.

	    Raises:
	        ValueError: If the final response status is not 200, or the
	            ``Content-Length`` header is missing or not an integer.
	    """
	    # requests.head() does not follow redirects unless asked to, while the
	    # GET requests used for the actual download do; follow them here as well
	    # so that both see the same resource.
	    response = requests.head(url, allow_redirects=True)
	    if response.status_code != 200:
	        raise ValueError("Error retrieving file size:", response.status_code)

	    content_length = response.headers.get("Content-Length")
	    if content_length is None:
	        # int(None) would raise a confusing TypeError.
	        raise ValueError("Error retrieving file size: no Content-Length header")
	    return int(content_length)


	def fetch(url, start, length):
	    """Return ``length`` bytes of the resource at ``url`` beginning at ``start``.

	    Args:
	        url: Location of the remote file.
	        start: Zero-based offset of the first byte wanted.
	        length: Number of bytes wanted; zero or less yields ``b""`` without
	            contacting the server.

	    Returns:
	        The requested bytes.

	    Raises:
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	    """
	    if length <= 0:
	        # "bytes=N-(N-1)" is not a valid range (e.g. an empty central
	        # directory), so there is nothing to ask the server for.
	        return b""

	    end = start + length - 1
	    headers = {"Range": f"bytes={start}-{end}"}
	    response = requests.get(url, headers=headers)
	    response.raise_for_status()

	    if response.status_code == 206:
	        return response.content
	    # Any other success status means the server ignored the Range header and
	    # sent the whole resource; cut the requested window out of it.
	    return response.content[start:end + 1]


	def get_central_directory_metadata_from_eocd(eocd):
	    """Read the central directory location from an EOCD record.

	    Bytes 12-15 hold the central directory size and bytes 16-19 its start
	    offset.  Both are decoded as unsigned little-endian 32-bit integers so
	    that offsets of 2 GiB and above do not come out negative.

	    Args:
	        eocd: The end-of-central-directory record (at least 20 bytes).

	    Returns:
	        A ``(cd_start, cd_size)`` tuple.

	    Raises:
	        struct.error: If ``eocd`` is too short.
	    """
	    cd_size, cd_start = struct.unpack_from("<II", eocd, 12)
	    return cd_start, cd_size


	def get_central_directory_metadata_from_eocd64(eocd64):
	    """Read the central directory location from a ZIP64 EOCD record.

	    Bytes 40-47 hold the central directory size and bytes 48-55 its start
	    offset, both decoded as unsigned little-endian 64-bit integers.

	    Args:
	        eocd64: The ZIP64 end-of-central-directory record (at least 56 bytes).

	    Returns:
	        A ``(cd_start, cd_size)`` tuple.

	    Raises:
	        struct.error: If ``eocd64`` is too short.
	    """
	    cd_size, cd_start = struct.unpack_from("<QQ", eocd64, 40)
	    return cd_start, cd_size


	def parse_little_endian_to_int(little_endian_bytes):
	    """Decode a 4- or 8-byte little-endian unsigned integer.

	    Four bytes are read as an unsigned 32-bit value; any other length is
	    treated as an unsigned 64-bit value.  Unsigned formats are used because
	    the callers decode sizes and offsets, which would otherwise turn negative
	    once the top bit is set.

	    Args:
	        little_endian_bytes: The raw bytes to decode.

	    Returns:
	        The decoded non-negative ``int``.

	    Raises:
	        struct.error: If the input is neither 4 nor 8 bytes long.
	    """
	    format_character = "I" if len(little_endian_bytes) == 4 else "Q"
	    return struct.unpack("<" + format_character, little_endian_bytes)[0]


	def extract_matching_files(zip_file, regex_pattern, output_folder, url=None):
	    """Download the raw record of every member whose name matches the pattern.

	    For each non-directory entry of ``zip_file`` whose name matches
	    ``regex_pattern`` (case-insensitive search), the member's local file
	    header, name, extra field and still-compressed data are fetched with a
	    ranged request and written unchanged to
	    ``<output_folder>/<member base name>.zip``.  No central directory is
	    appended to the written file.  Members sharing a base name overwrite each
	    other.  ``zip_file`` is closed before returning, also when an error is
	    raised.

	    Args:
	        zip_file: ``zipfile.ZipFile`` built from directory records only, as
	            returned by ``get_zip_file``.
	        regex_pattern: Pattern passed to ``re.compile``.
	        output_folder: Directory for the downloaded files; created on demand.
	        url: Location of the remote archive.  When omitted, the module-level
	            ``zipUrl`` bound by the command-line entry point is used.

	    Raises:
	        ValueError: If no URL is available, or the server does not answer the
	            ranged download with HTTP 206.
	        zipfile.BadZipFile: If no local file header is found at the computed
	            offset.
	        requests.HTTPError: If the download request fails with 4xx/5xx.
	    """
	    try:
	        if url is None:
	            # Older callers relied on the global bound by the __main__ block.
	            url = globals().get("zipUrl")
	            if url is None:
	                raise ValueError("no archive URL: pass url= or set zipUrl")

	        entries = zip_file.filelist
	        if not entries:  # empty archive: nothing to match or to index
	            return

	        # The ZipFile was built without the member data, so the header offsets
	        # it reports are not absolute file positions.  As in the original
	        # code, the archive's first local header is assumed to sit at byte 0
	        # of the remote file; subtracting the smallest reported offset then
	        # yields absolute positions.
	        base_offset = min(entry.header_offset for entry in entries)
	        pattern = re.compile(regex_pattern, re.IGNORECASE)

	        for zi in entries:
	            if zi.is_dir() or not pattern.search(zi.filename):
	                continue

	            file_name = zi.filename.split("/")[-1]
	            print(f"downloading {file_name}")
	            output_path = os.path.join(output_folder, f"{file_name}.zip")
	            parent = os.path.dirname(output_path)
	            if parent:
	                os.makedirs(parent, exist_ok=True)

	            record_offset = zi.header_offset - base_offset
	            size = _local_record_size(url, record_offset, zi.compress_size)
	            headers = {"Range": f"bytes={record_offset}-{record_offset + size - 1}"}

	            with requests.get(url, stream=True, headers=headers) as response:
	                response.raise_for_status()
	                if response.status_code != 206:
	                    # A 200 here would stream the entire archive to disk.
	                    raise ValueError(
	                        f"server ignored the Range request (HTTP {response.status_code})"
	                    )
	                # Download with progress bar and speed indicator
	                with open(output_path, "wb") as f, tqdm(
	                    total=size, unit="B", unit_scale=True, unit_divisor=1024, ncols=80
	                ) as progress_bar:
	                    for data in response.iter_content(chunk_size=8192):
	                        progress_bar.update(len(data))
	                        f.write(data)

	            print(f"Extracted: {output_path}")
	    finally:
	        zip_file.close()


	def _local_record_size(url, record_offset, compress_size):
	    """Return the length of a member's local header, name, extra field and data."""
	    # The fixed part of a local file header is 30 bytes; the name and extra
	    # field lengths are stored at offsets 26 and 28.  The extra field of the
	    # local header may differ from the central directory's, so it is read from
	    # the remote file rather than derived from the directory entry.
	    header = fetch(url, record_offset, 30)
	    if len(header) != 30 or header[:4] != b"PK\x03\x04":
	        raise zipfile.BadZipFile(
	            f"no local file header at offset {record_offset} of the remote archive"
	        )
	    name_length, extra_length = struct.unpack_from("<HH", header, 26)
	    return 30 + name_length + extra_length + compress_size
	if __name__ == "__main__":
	    # Command-line entry point: exactly three positional arguments are required.
	    if len(sys.argv[1:]) != 3:
	        print("Usage: python script.py <zip_url> <regex_pattern> <output_folder>")
	        sys.exit(1)

	    # ``zipUrl`` has to stay a module-level name spelled exactly like this:
	    # extract_matching_files() reads it as a global.
	    zipUrl, regexPattern, outputFolder = sys.argv[1:]
	    retrieve_zip_content(zipUrl, regexPattern, outputFolder)
No results found