Skip to content

Instantly share code, notes, and snippets.

@gentlecolts
Last active October 27, 2024 06:10
Show Gist options
  • Save gentlecolts/cb5f4476842a116b1ceb910a1efc921f to your computer and use it in GitHub Desktop.
Quixel Backup Validator
import os
import zipfile
from pathlib import Path
import re
from multiprocessing import Pool
# EDIT THESE PATHS
# root of quixel zips (scanned recursively for downloaded archives)
ROOT_DIR=Path("./Quixel Zips").resolve()
# dir where bad files will be moved to, can be left unchanged
# (a sibling of ROOT_DIR named "<root>_trash"; the original tree layout is preserved inside it)
TRASH_DIR=Path(str(ROOT_DIR)+"_trash")
# location of cache.txt
# (one Quixel asset id per line; rotated to a .bak copy and rewritten at the end of the run)
CACHEFILE=Path("cache.txt").resolve()
# validation file generated by this script
# (asset ids that passed validation; appended incrementally so re-runs can skip them)
VALIDATION_RECORD=Path("validated.txt").resolve()
###
def zip_is_valid(fname):
    """Return True if *fname* is a readable zip archive whose members pass CRC checks.

    Any exception while opening or reading the archive (missing file, not a
    zip, truncated download) is reported and treated as invalid.
    """
    try:
        # Context manager ensures the archive's file handle is closed even if
        # testzip() raises partway through (the original leaked the handle).
        with zipfile.ZipFile(fname) as test_zip:
            # testzip() returns the name of the first bad member, or None when
            # every member's header and CRC check out.
            if test_zip.testzip() is not None:
                print(f"'{fname}' is corrupt")
                return False
    except Exception as ex:
        print(f"Exception opening '{fname}':",ex)
        return False
    return True
def to_trash(fname):
    """Move *fname* into TRASH_DIR, preserving its path relative to ROOT_DIR."""
    source = Path(fname).resolve()
    # Mirror the file's position under ROOT_DIR inside the trash tree.
    target = TRASH_DIR / source.relative_to(ROOT_DIR)
    # Make sure the destination directory chain exists before moving.
    target.parent.mkdir(parents=True, exist_ok=True)
    source.rename(target)
# Load the asset ids currently listed in the Quixel download cache.
with open(CACHEFILE) as f:
    cache_ids = [line.strip() for line in f]

# Ids that passed validation on a previous run, so they can be skipped.
previously_validated_ids = []
if VALIDATION_RECORD.exists():
    with open(VALIDATION_RECORD) as f:
        previously_validated_ids = [line.strip() for line in f]

# Walk the whole tree, collecting every file and counting the zips.
files = []
zip_count = 0
for fpath in ROOT_DIR.glob("**/*"):
    if not fpath.is_file():
        continue
    fname = str(fpath)
    files.append(fname)
    if fname.endswith(".zip"):
        zip_count += 1
# Matches names like "pjwdq_4K_Albedo_ms.zip" -> id, resolution, map type.
# Fix: the dot before "zip" is now escaped; previously it matched any character.
id_regex=re.compile(r'(?P<id>[a-zA-Z0-9]+)_(?P<resolution>\dK)_(?P<type>\w+)_ms\.zip')

def check_file(fname):
    """Validate a single downloaded file.

    Returns a (asset_id, is_valid) tuple:
      * (None, False)     -- name unusable: not a zip (trashed) or no id match
      * (asset_id, False) -- zip is too small or corrupt (trashed)
      * (asset_id, True)  -- zip passed validation, or passed on a prior run

    Side effects: bad files are moved into TRASH_DIR via to_trash().
    (No ``global`` declarations needed: the module-level names are only read.)
    """
    if not fname.endswith(".zip"):
        print(f"'{fname}' is not a zip")
        to_trash(fname)
        return (None, False)
    match = id_regex.search(fname)
    if match is None:
        print(f"'{fname}' did not match regex")
        return (None, False)
    asset_id = match["id"]
    # Skip expensive checks for assets already validated on an earlier run.
    if asset_id in previously_validated_ids:
        return (asset_id, True)
    # Quixel archives are always multiple megabytes; anything under 4 MiB is
    # assumed to be a truncated/failed download.
    if Path(fname).stat().st_size < 4*1024*1024:
        to_trash(fname)
        return (asset_id, False)
    if not zip_is_valid(fname):
        to_trash(fname)
        return (asset_id, False)
    if asset_id not in cache_ids:
        print(f"'{fname}', id {asset_id}, not in cache")
    # Explicit success return: as transcribed, the function could fall off
    # the end (implicit None) for a valid zip whose id is in the cache,
    # which would crash the consumer's tuple unpack.
    return (asset_id, True)
# Trash non-zip files, files under 4 MiB, and zips that fail validation;
# record everything else so later runs can skip re-checking it.
# NOTE(review): on platforms using the "spawn" start method (Windows/macOS)
# a top-level Pool needs an `if __name__ == "__main__":` guard -- confirm
# this script is only run where fork is the default (Linux).
bad_ids = set()
valid_zip_ids = set()
total_skipped = 0
total_items = len(files)
# Print a progress line roughly every 5% of the file list.
report_step = total_items * 5.0 / 100.0
with Pool(8) as pool:
    for i, result in enumerate(pool.imap_unordered(check_file, files, chunksize=8)):
        (asset_id, is_valid) = result
        if asset_id is None:
            # Unusable name (not a zip / no id match); nothing to tally.
            continue
        elif is_valid:
            valid_zip_ids.add(asset_id)
            if asset_id not in previously_validated_ids:
                # Append immediately so progress survives a crash/interrupt.
                with open(VALIDATION_RECORD, 'a') as f:
                    f.write(asset_id + '\n')
            else:
                total_skipped += 1
        else:
            bad_ids.add(asset_id)
        done = i + 1
        # Report only when we cross a 5% boundary. (The original's trailing
        # "i = j" was a dead store -- enumerate reassigns i each iteration.)
        if int(done / report_step) > int(i / report_step):
            print(f"completed {done}/{total_items}")
# Final summary of the run.
print(len(cache_ids),"lines in cache")
print(zip_count,"zips found")
print(f"found {len(bad_ids)} bad asset ids")
print(f"found {len(valid_zip_ids)} good asset ids")
print(f"was able to skip {total_skipped} assets that had been previously validated")
# Counter used below to pick an unused .bak suffix for the cache backup.
append_digit=0
def bak_name(fpath, index=0):
    """Return *fpath* with a ".bak" suffix appended (".bakN" when index > 0)."""
    suffix = ".bak" if index <= 0 else ".bak" + str(index)
    return Path(f"{fpath}{suffix}")
# Rotate the old cache aside: find the first unused cache.txt.bak / .bakN name.
while bak_name(CACHEFILE, append_digit).exists():
    append_digit += 1
CACHEFILE.rename(bak_name(CACHEFILE, append_digit))
# Rewrite the cache, keeping only the ids whose zips validated this run.
with open(CACHEFILE, 'w') as f:
    f.writelines(item + '\n' for item in cache_ids if item in valid_zip_ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment