# See https://gist.github.com/maalrron/877b2edb23cc5d99d6a6b4c22f708e58 for more context.
# Run like: python dlMegascans.py && python validate_zips.py
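# Validates previously downloaded Megascans zips: non-zip files, zips under
# 4 MiB, and archives that fail zipfile's CRC test are moved to TRASH_DIR,
# and only ids whose zips validated are kept in cache.txt.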
import re
import zipfile
from multiprocessing import Pool
from pathlib import Path
# EDIT THESE PATHS
# root of the Quixel zips
ROOT_DIR = Path("./Quixel Zips").resolve()
# directory that bad files get moved to; can be left unchanged
TRASH_DIR = Path(str(ROOT_DIR) + "_trash")
# location of cache.txt
CACHEFILE = Path("cache.txt").resolve()
# validation record generated by this script
VALIDATION_RECORD = Path("validated.txt").resolve()
###
def zip_is_valid(fname):
    """Return True if fname is a readable zip archive with no corrupt members."""
    try:
        with zipfile.ZipFile(fname) as test_zip:
            # testzip() returns the name of the first bad member, or None if all CRCs check out
            if test_zip.testzip() is not None:
                print(f"'{fname}' is corrupt")
                return False
    except Exception as ex:
        print(f"Exception opening '{fname}':", ex)
        return False
    return True
def to_trash(fname):
    """Move a bad file into TRASH_DIR, preserving its path relative to ROOT_DIR."""
    orig = Path(fname).resolve()
    relative_path = orig.relative_to(ROOT_DIR)
    destination = TRASH_DIR / relative_path
    destination.parent.mkdir(parents=True, exist_ok=True)
    orig.rename(destination)
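# cache.txt is the download record kept by dlMegascans.py: one asset id per line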
cache_ids = []
with open(CACHEFILE) as f:
    for line in f:
        cache_ids.append(line.strip())
# keep the list for order (the cache is rewritten in order at the end) and
# a set for cheap membership tests in the workers
cache_id_set = set(cache_ids)
# ids already validated on a previous run; a set makes the per-file lookup cheap
previously_validated_ids = set()
if VALIDATION_RECORD.exists():
    with open(VALIDATION_RECORD) as f:
        for line in f:
            previously_validated_ids.add(line.strip())
files = []
zip_count = 0
for fpath in ROOT_DIR.glob("**/*"):
    if fpath.is_file():
        fname = str(fpath)
        files.append(fname)
        if fname.endswith(".zip"):
            zip_count += 1
id_regex = re.compile(r'(?P<id>[a-zA-Z0-9]+)_(?P<resolution>\dK)_(?P<type>\w+)_ms\.zip')
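# e.g. a (hypothetical) name like "ukyfbgxn_8K_Brick_ms.zip" yields id "ukyfbgxn"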
# trash non-zip files, files under 4 MiB, and zips that fail the validation test
def check_file(fname):
    """Check one file; returns (asset_id, is_valid). asset_id is None if no id could be parsed."""
    if not fname.endswith(".zip"):
        print(f"'{fname}' is not a zip")
        to_trash(fname)
        return (None, False)
    match = id_regex.search(fname)
    if match is None:
        print(f"'{fname}' did not match regex")
        return (None, False)
    asset_id = match["id"]
    # assets validated on a previous run can skip the expensive zip test
    if asset_id in previously_validated_ids:
        return (asset_id, True)
    # anything under 4 MiB is assumed to be a truncated download
    if Path(fname).stat().st_size < 4 * 1024 * 1024:
        to_trash(fname)
        return (asset_id, False)
    if not zip_is_valid(fname):
        to_trash(fname)
        return (asset_id, False)
    if asset_id not in cache_id_set:
        print(f"'{fname}', id {asset_id}, not in cache")
    return (asset_id, True)
bad_ids = set()
valid_zip_ids = set()
total_skipped = 0
total_items = len(files)
# report progress roughly every 5%; never let the step reach zero for an empty tree
report_step = max(total_items * 5.0 / 100.0, 1.0)
# the main guard keeps multiprocessing's 'spawn' start method (Windows/macOS)
# from re-running the pool when workers import this module
if __name__ == "__main__":
    with Pool(8) as pool:
        for i, result in enumerate(pool.imap_unordered(check_file, files, chunksize=8), start=1):
            (asset_id, is_valid) = result
            if asset_id is None:
                continue
            if is_valid:
                valid_zip_ids.add(asset_id)
                if asset_id not in previously_validated_ids:
                    # append immediately so an interrupted run still keeps its progress
                    with open(VALIDATION_RECORD, 'a') as f:
                        f.write(asset_id + '\n')
                else:
                    total_skipped += 1
            else:
                bad_ids.add(asset_id)
            if int(i / report_step) > int((i - 1) / report_step):
                print(f"completed {i}/{total_items}")
    print(len(cache_ids), "lines in cache")
    print(zip_count, "zips found")
    print(f"found {len(bad_ids)} bad asset ids")
    print(f"found {len(valid_zip_ids)} good asset ids")
    print(f"was able to skip {total_skipped} assets that had been previously validated")
    append_digit = 0

    def bak_name(fpath, index=0):
        """Return fpath with '.bak' appended, plus a numeric suffix when index > 0."""
        append = ".bak"
        if index > 0:
            append += str(index)
        return Path(str(fpath) + append)

    while bak_name(CACHEFILE, append_digit).exists():
        append_digit += 1
    CACHEFILE.rename(bak_name(CACHEFILE, append_digit))
    # rewrite the cache keeping only validated ids, preserving the original order
    with open(CACHEFILE, 'w') as f:
        for item in cache_ids:
            if item in valid_zip_ids:
                f.write(item + '\n')
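    # After this finishes, re-running dlMegascans.py should re-download every
    # asset whose id was dropped from the cache above.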