Skip to content

Instantly share code, notes, and snippets.

@CoffeeVampir3
Created February 14, 2024 16:04
Show Gist options
  • Save CoffeeVampir3/1c5b183a9fb8653b5b5ccf3ab8e32f35 to your computer and use it in GitHub Desktop.
Save CoffeeVampir3/1c5b183a9fb8653b5b5ccf3ab8e32f35 to your computer and use it in GitHub Desktop.
Anti jpeg data collator
import json
def _iter_scored_entries(jsonl_file_path):
    """Yield each JSON object in the file that carries a non-null 'score'."""
    with open(jsonl_file_path, 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline) are not JSON; skip them.
            if not line.strip():
                continue
            json_object = json.loads(line)
            # A missing/null score cannot be compared against numbers.
            if json_object.get('score') is None:
                continue
            yield json_object


def filter_images_by_score_interval(jsonl_file_path, target_interval, output_file_path):
    """Write the lowest-scoring entries of a JSONL file to a new JSONL file.

    The file is read twice so that it never has to be held in memory as a
    whole: the first pass finds the minimum ``score``, the second pass keeps
    every entry whose score lies in the half-open interval
    ``[min_score, min_score + target_interval)``.

    Blank lines and entries without a ``score`` value are ignored. If no
    entry has a score, nothing is written and the function returns silently.

    Args:
        jsonl_file_path: Path of the input file, one JSON object per line.
        target_interval: Width of the score window above the minimum score.
        output_file_path: Path of the JSONL file to (over)write.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # First pass: find the minimum score in the file.
    min_score = min(
        (entry['score'] for entry in _iter_scored_entries(jsonl_file_path)),
        default=None,
    )

    # If no minimum score was found, do nothing.
    if min_score is None:
        return

    # Second pass: collect entries inside the target interval, in file order.
    upper_bound = min_score + target_interval
    matching_entries = [
        entry
        for entry in _iter_scored_entries(jsonl_file_path)
        if min_score <= entry['score'] < upper_bound
    ]

    # Write matching entries to a new JSONL file, one object per line.
    with open(output_file_path, 'w') as outfile:
        for entry in matching_entries:
            json.dump(entry, outfile)
            outfile.write('\n')
    print(f"Matching entries have been written to {output_file_path}")
if __name__ == "__main__":
    # Script entry point: extract the entries whose score sits within a
    # narrow window above the dataset's minimum score.
    target_interval = 0.02  # Width of the score window above the minimum
    output_file_path = 'garbage.jsonl'  # Destination JSONL file
    jsonl_file_path = '/home/blackroot/Desktop/anti-jpgger/dataset.jsonl'  # Replace with your JSONL file path
    filter_images_by_score_interval(
        jsonl_file_path=jsonl_file_path,
        target_interval=target_interval,
        output_file_path=output_file_path,
    )
import os
import sys
from PIL import Image
def convert_png_to_jpeg(input_folder, output_dir="outputs", qualities=(40, 30, 20, 10, 1)):
    """Re-encode every PNG in a folder as a series of low-quality JPEGs.

    For each ``*.png`` file (extension matched case-insensitively) directly
    inside ``input_folder``, one JPEG per entry of ``qualities`` is written to
    ``output_dir`` as ``<stem>_quality<q>.jpg``.

    Args:
        input_folder: Folder to scan for PNG files (not recursive).
        output_dir: Folder that receives the JPEGs; created if missing.
        qualities: JPEG quality settings to save each image with.

    Returns:
        None. If ``input_folder`` is not a directory, a message is printed
        and nothing else happens.
    """
    # Check that the input folder exists before creating any output.
    if not os.path.isdir(input_folder):
        print(f"The folder {input_folder} does not exist.")
        return

    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    for file in os.listdir(input_folder):
        # Accept ".PNG" / ".Png" as well as ".png".
        if not file.lower().endswith(".png"):
            continue

        file_path = os.path.join(input_folder, file)
        stem = os.path.splitext(file)[0]

        with Image.open(file_path) as img:
            # JPEG has no alpha channel or palette, so flatten those modes
            # (including grayscale+alpha "LA") to RGB before saving.
            if img.mode in ("RGBA", "LA", "P", "PA"):
                img = img.convert("RGB")

            # Save the image once per requested quality setting.
            for quality in qualities:
                output_filename = f"{stem}_quality{quality}.jpg"
                output_path = os.path.join(output_dir, output_filename)
                img.save(output_path, "JPEG", quality=quality)
                print(f"Saved {output_path} with quality {quality}")
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the folder of PNGs.
    if len(sys.argv) == 2:
        input_folder = sys.argv[1]
        convert_png_to_jpeg(input_folder)
    else:
        print("Usage: python script.py <input_folder_path>")
import json
import os
import requests
def download_images(jsonl_file_path, output_folder, timeout=30):
    """Download the image behind every entry of a JSONL file.

    Each non-blank line must be a JSON object with ``url`` and ``score``
    keys. The response body is saved to ``<output_folder>/<score>.png``.
    Because the filename is derived from the score alone, entries that share
    a score overwrite one another.

    Processing is best-effort: any failure on a line (bad JSON, missing key,
    network error, write error) is printed and the next line is processed.

    Args:
        jsonl_file_path: Path of the input file, one JSON object per line.
        output_folder: Folder that receives the images; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            single stalled server cannot hang the whole run.

    Returns:
        None.
    """
    # Create the output folder if it doesn't exist.
    os.makedirs(output_folder, exist_ok=True)

    with open(jsonl_file_path, 'r') as file:
        for line in file:
            # Blank lines carry no entry; skip them without reporting an error.
            if not line.strip():
                continue
            try:
                json_object = json.loads(line)
                url = json_object['url']
                score = json_object['score']

                # The filename is the score itself.
                filename = f"{score}.png"
                output_path = os.path.join(output_folder, filename)

                # Download and save the image.
                response = requests.get(url, timeout=timeout)
                if response.status_code == 200:
                    with open(output_path, 'wb') as img_file:
                        img_file.write(response.content)
                    print(f"Downloaded and saved image with score {score} to {output_path}")
                else:
                    print(f"Failed to download image with score {score} from {url}")
            except Exception as e:
                # Deliberately broad: one bad entry must not stop the batch.
                print(f"An error occurred: {e}")
if __name__ == "__main__":
    # Script entry point: fetch every image listed in the filtered JSONL file.
    output_folder = 'images'  # The folder where images will be saved
    jsonl_file_path = '/home/blackroot/Desktop/anti-jpgger/garbage.jsonl'  # Replace this with your JSONL file path
    download_images(
        jsonl_file_path=jsonl_file_path,
        output_folder=output_folder,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment