# See https://gist.github.com/maalrron/877b2edb23cc5d99d6a6b4c22f708e58 for more context.
# Run like: python dlMegascans.py && python validate_zips.py
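# Validates previously downloaded Megascans zips: non-zip files, zips under
# 4 MiB, and archives that fail zipfile's CRC test are moved to TRASH_DIR,
# and only ids whose zips validated are kept in cache.txt.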
import re
import zipfile
from multiprocessing import Pool
from pathlib import Path
# EDIT THESE PATHS
# root of the Quixel zips
ROOT_DIR = Path("./Quixel Zips").resolve()
# directory that bad files get moved to; can be left unchanged
TRASH_DIR = Path(str(ROOT_DIR) + "_trash")
# location of cache.txt
CACHEFILE = Path("cache.txt").resolve()
# validation record generated by this script
VALIDATION_RECORD = Path("validated.txt").resolve()
###
def zip_is_valid(fname):
    """Return True if fname is a readable zip archive with no corrupt members."""
    try:
        with zipfile.ZipFile(fname) as test_zip:
            # testzip() returns the name of the first bad member, or None if all CRCs check out
            if test_zip.testzip() is not None:
                print(f"'{fname}' is corrupt")
                return False
    except Exception as ex:
        print(f"Exception opening '{fname}':", ex)
        return False
    return True
def to_trash(fname):
    """Move a bad file into TRASH_DIR, preserving its path relative to ROOT_DIR."""
    orig = Path(fname).resolve()
    relative_path = orig.relative_to(ROOT_DIR)
    destination = TRASH_DIR / relative_path
    destination.parent.mkdir(parents=True, exist_ok=True)
    orig.rename(destination)
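# cache.txt is the download record kept by dlMegascans.py: one asset id per line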
cache_ids = []
with open(CACHEFILE) as f:
    for line in f:
        cache_ids.append(line.strip())
# keep the list for order (the cache is rewritten in order at the end) and
# a set for cheap membership tests in the workers
cache_id_set = set(cache_ids)
# ids already validated on a previous run; a set makes the per-file lookup cheap
previously_validated_ids = set()
if VALIDATION_RECORD.exists():
    with open(VALIDATION_RECORD) as f:
        for line in f:
            previously_validated_ids.add(line.strip())
files = []
zip_count = 0
for fpath in ROOT_DIR.glob("**/*"):
    if fpath.is_file():
        fname = str(fpath)
        files.append(fname)
        if fname.endswith(".zip"):
            zip_count += 1
id_regex = re.compile(r'(?P<id>[a-zA-Z0-9]+)_(?P<resolution>\dK)_(?P<type>\w+)_ms\.zip')
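# e.g. a (hypothetical) name like "ukyfbgxn_8K_Brick_ms.zip" yields id "ukyfbgxn"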
# trash non-zip files, files under 4 MiB, and zips that fail the validation test
def check_file(fname):
    """Check one file; returns (asset_id, is_valid). asset_id is None if no id could be parsed."""
    if not fname.endswith(".zip"):
        print(f"'{fname}' is not a zip")
        to_trash(fname)
        return (None, False)
    match = id_regex.search(fname)
    if match is None:
        print(f"'{fname}' did not match regex")
        return (None, False)
    asset_id = match["id"]
    # assets validated on a previous run can skip the expensive zip test
    if asset_id in previously_validated_ids:
        return (asset_id, True)
    # anything under 4 MiB is assumed to be a truncated download
    if Path(fname).stat().st_size < 4 * 1024 * 1024:
        to_trash(fname)
        return (asset_id, False)
    if not zip_is_valid(fname):
        to_trash(fname)
        return (asset_id, False)
    if asset_id not in cache_id_set:
        print(f"'{fname}', id {asset_id}, not in cache")
    return (asset_id, True)
bad_ids = set()
valid_zip_ids = set()
total_skipped = 0
total_items = len(files)
# report progress roughly every 5%; never let the step reach zero for an empty tree
report_step = max(total_items * 5.0 / 100.0, 1.0)
# the main guard keeps multiprocessing's 'spawn' start method (Windows/macOS)
# from re-running the pool when workers import this module
if __name__ == "__main__":
    with Pool(8) as pool:
        for i, result in enumerate(pool.imap_unordered(check_file, files, chunksize=8), start=1):
            (asset_id, is_valid) = result
            if asset_id is None:
                continue
            if is_valid:
                valid_zip_ids.add(asset_id)
                if asset_id not in previously_validated_ids:
                    # append immediately so an interrupted run still keeps its progress
                    with open(VALIDATION_RECORD, 'a') as f:
                        f.write(asset_id + '\n')
                else:
                    total_skipped += 1
            else:
                bad_ids.add(asset_id)
            if int(i / report_step) > int((i - 1) / report_step):
                print(f"completed {i}/{total_items}")
    print(len(cache_ids), "lines in cache")
    print(zip_count, "zips found")
    print(f"found {len(bad_ids)} bad asset ids")
    print(f"found {len(valid_zip_ids)} good asset ids")
    print(f"was able to skip {total_skipped} assets that had been previously validated")
    append_digit = 0

    def bak_name(fpath, index=0):
        """Return fpath with '.bak' appended, plus a numeric suffix when index > 0."""
        append = ".bak"
        if index > 0:
            append += str(index)
        return Path(str(fpath) + append)

    while bak_name(CACHEFILE, append_digit).exists():
        append_digit += 1
    CACHEFILE.rename(bak_name(CACHEFILE, append_digit))
    # rewrite the cache keeping only validated ids, preserving the original order
    with open(CACHEFILE, 'w') as f:
        for item in cache_ids:
            if item in valid_zip_ids:
                f.write(item + '\n')
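    # After this finishes, re-running dlMegascans.py should re-download every
    # asset whose id was dropped from the cache above.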