Skip to content

Instantly share code, notes, and snippets.

@arnesacnussem
Created July 4, 2023 15:02
Show Gist options
  • Save arnesacnussem/0cdf79f16c4f3f08bbfcce00731521bd to your computer and use it in GitHub Desktop.
Save arnesacnussem/0cdf79f16c4f3f08bbfcce00731521bd to your computer and use it in GitHub Desktop.
A simple Python script that uses cv2 to detect and crop ads in CBZ files
#!python 3.10
import argparse
from scipy.spatial import distance_matrix
from matplotlib import pyplot as plt
import cv2
import numpy as np
import zipfile
import os
from tqdm import tqdm
from PIL import Image
from pathlib import Path
import shutil
def crop_blank_space(img, threshold=10):
    """Crop *img* to the bounding box of its non-white content.

    The image is converted with ``cv2.COLOR_BGR2GRAY``, so it is expected
    to be a BGR colour array. A pixel counts as content when its grey value
    is below ``255 - threshold``.

    Args:
        img: Image array to crop.
        threshold: How far below pure white (255) a pixel must be to count
            as content.

    Returns:
        The slice of *img* covering all detected content, or *img* itself
        when no content is detected (completely blank image).
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Binary mask: 255 where the pixel is darker than the white cutoff.
    mask = 255*(gray < 255 - threshold).astype(np.uint8)
    # Morphological opening with a 2x2 kernel removes isolated noise pixels.
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN,
                            np.ones((2, 2), dtype=np.uint8))
    coords = cv2.findNonZero(mask)  # Coordinates of all content pixels
    if coords is None:
        # Nothing but blank space: there is no bounding box to crop to.
        return img
    x, y, w, h = cv2.boundingRect(coords)  # Minimum spanning bounding box
    return img[y:y+h, x:x+w]
def merge_points(points, threshold):
    """Thin out *points* so that clusters of nearby points collapse to one.

    Points are visited in order. Each point that has not been absorbed yet
    is kept, and every later point closer to it than *threshold*
    (Euclidean distance) is absorbed and dropped. Absorbed points never
    absorb others.

    Args:
        points: Sequence of equal-length coordinate tuples.
        threshold: Distance below which a later point is merged into an
            earlier kept point.

    Returns:
        List of the kept points (the original objects, in input order).
        An empty input yields an empty list.
    """
    points = list(points)
    if not points:
        # distance_matrix cannot handle an empty point set.
        return []

    coords = np.asarray(points)
    absorbed = np.zeros(len(points), dtype=bool)
    kept = []
    for i, point in enumerate(points):
        if absorbed[i]:
            continue
        if i + 1 < len(points):
            # Only distances from a kept point to the points after it are
            # ever needed, so compute that single row instead of a full
            # N x N matrix.
            row = distance_matrix(coords[i:i+1], coords[i+1:])[0]
            absorbed[i+1:] |= row < threshold
        kept.append(point)
    return kept
def find_template(template, target, threshold=0.8, draw=False):
    """Locate occurrences of *template* inside *target*.

    Uses ``cv2.matchTemplate`` with ``cv2.TM_CCOEFF_NORMED`` and keeps
    every location whose score is at least *threshold*. Hits closer than
    10 pixels to an earlier hit are merged via ``merge_points``.

    Args:
        template: Image array to search for.
        target: Image array to search in. ``None`` (e.g. a file that could
            not be decoded) is treated as "no match".
        threshold: Minimum normalised correlation score for a hit.
        draw: When true, show the matches with matplotlib. The rectangles
            are drawn on a copy, so *target* is left untouched.

    Returns:
        List of ``((x1, y1), (x2, y2))`` tuples of plain ints giving the
        top-left and bottom-right corner of each match. Empty when nothing
        matches, when *target* is ``None``, or when the template does not
        fit inside the target.

    Raises:
        ValueError: If *template* is ``None``.
    """
    if template is None:
        raise ValueError("template image is None")
    if target is None:
        return []

    template_height, template_width = np.shape(template)[:2]
    target_height, target_width = np.shape(target)[:2]
    # A template at least as tall as the target is treated as "no match";
    # one wider than the target cannot be matched by matchTemplate at all.
    if template_height >= target_height or template_width > target_width:
        return []

    # Perform template matching
    result = cv2.matchTemplate(target, template, cv2.TM_CCOEFF_NORMED)
    # np.where yields (rows, cols); reverse to get (x, y) pairs
    locations = np.where(result >= threshold)
    if np.shape(locations)[1] == 0:
        return []
    merged_points = merge_points(list(zip(*locations[::-1])), threshold=10)

    matches = []
    for pt in merged_points:
        # Plain ints keep the corners usable as cv2 drawing coordinates.
        x, y = int(pt[0]), int(pt[1])
        matches.append(((x, y), (x + template_width, y + template_height)))

    if draw:
        preview = target.copy()
        for (top_left, bottom_right) in matches:
            cv2.rectangle(preview, top_left, bottom_right, (0, 0, 255), 2)
        plt.imshow(preview)
        plt.show()
    return matches
def find_first_non_white_row(image, y, direction=1, threshold=25, borderSkip=5):
    """Scan away from row *y* for the nearest row that is not all white.

    A row counts as white when every value in it is at least
    ``255 - threshold``. Scanning starts ``borderSkip`` rows away from *y*.

    Args:
        image: Array indexed as ``image[row, :]``.
        y: Row to start from.
        direction: ``1`` scans upward (towards row 0, which itself is never
            inspected); any other value scans downward to the last row.
        threshold: Tolerance below 255 that is still treated as white.
        borderSkip: Number of rows next to *y* that are skipped.

    Returns:
        Upward: the index one above the first non-white row, or 0 if every
        scanned row is white. Downward: the index one below the first
        non-white row, or the image height if every scanned row is white.
    """
    white_floor = 255 - threshold
    total_rows = image.shape[0]

    if direction == 1:
        candidates = range(y - borderSkip, 0, -1)
        step, fallback = -1, 0
    else:
        candidates = range(y + borderSkip, total_rows)
        step, fallback = 1, total_rows

    for r in candidates:
        if not (image[r, :] >= white_floor).all():
            # Step one more row in the scanning direction past the hit.
            return r + step
    return fallback
def get_crop_top_bottoms(image, point_pairs):
    """Turn match rectangles into (top, bottom) row pairs.

    For each ``(top_left, bottom_right)`` pair, the top row comes from an
    upward ``find_first_non_white_row`` scan starting at the rectangle's
    top edge, and the bottom row from a downward scan (``direction=0``)
    starting at its bottom edge.

    Args:
        image: Image array passed through to ``find_first_non_white_row``.
        point_pairs: Iterable of ``((x1, y1), (x2, y2))`` rectangles.

    Returns:
        List of ``(top, bottom)`` tuples, one per rectangle, in input order.
    """
    return [
        (
            find_first_non_white_row(image, upper_left[1]),
            find_first_non_white_row(image, lower_right[1], direction=0),
        )
        for upper_left, lower_right in point_pairs
    ]
def remove_image_part(img, y1, y2):
    """Return a new image with the horizontal band of rows [y1, y2) cut out.

    The rows above ``y1`` stay in place and the rows from ``y2`` downward
    are shifted up to close the gap. The input image is not modified, and
    no fill colour is needed, so the result keeps the input's mode.

    Args:
        img: Image object providing ``size``, ``copy``, ``crop`` and
            ``paste`` (a PIL image in this script).
        y1: First row to remove; clamped to ``[0, height]``.
        y2: Row after the last one to remove; clamped to ``[0, height]``.

    Returns:
        A new image whose height is reduced by ``y2 - y1``. When the
        clamped band is empty (``y2 <= y1``) an unchanged copy is returned.
    """
    width, height = img.size
    # Make sure y1 and y2 are within the image height boundaries
    y1 = max(0, min(y1, height))
    y2 = max(0, min(y2, height))
    if y2 <= y1:
        # Nothing to cut out.
        return img.copy()

    new_height = height - (y2 - y1)
    # Cropping to the new height already contains the part above y1 ...
    result = img.crop((0, 0, width, new_height))
    if y2 < height:
        # ... so only the part below the band has to be moved up.
        result.paste(img.crop((0, y2, width, height)), (0, y1))
    return result
def _merge_row_spans(spans):
    """Sort (top, bottom) row spans and fuse those that overlap or touch."""
    merged = []
    for top, bottom in sorted(spans):
        if merged and top <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], bottom))
        else:
            merged.append((top, bottom))
    return merged


def process_image(input_file, output_file, sample):
    """Strip watermark bands from one extracted file.

    The file is searched for *sample* with ``find_template``. Every match
    is widened to a row span with ``get_crop_top_bottoms`` and cut out with
    ``remove_image_part``. The result is saved to *output_file* and
    *input_file* is deleted. Files with no match, and files OpenCV cannot
    decode as an image, are moved to *output_file* unchanged.

    Args:
        input_file: Path of the extracted file.
        output_file: Path the processed (or untouched) file is written to.
        sample: Watermark template image passed to ``find_template``.
    """
    img = cv2.imread(input_file)
    if img is None:
        # Not a decodable image (archives may carry metadata files):
        # pass it through untouched instead of crashing.
        os.rename(input_file, output_file)
        return

    finds = find_template(sample, img, draw=False)
    if not finds:
        os.rename(input_file, output_file)
        return
    if len(finds) > 1:
        tqdm.write(f"Found {len(finds)} watermark in {input_file}")

    spans = _merge_row_spans(get_crop_top_bottoms(img, finds))
    with Image.open(input_file) as source:
        page = source
        # All spans are in original-image coordinates. Cutting from the
        # bottom up keeps the rows of the spans still to be cut unshifted.
        for (top, bottom) in reversed(spans):
            page = remove_image_part(page, top, bottom)
        # save image
        if page.height == 0:
            tqdm.write(
                f"Image {input_file} will be empty, creating 1px height blank image.")
            page = Image.new("RGB", (page.width, 1), (255, 255, 255))
        page.save(output_file, quality=90, resample=0)
    os.remove(input_file)
def main():
    """Strip watermark bands from every ``.cbz`` archive in a directory.

    Command-line arguments (both optional, positional):
        watermark_sample_path: image of the watermark to search for.
        directory_path: directory that is scanned (non-recursively) for
            ``.cbz`` files.

    Each archive is unpacked into ``temp_input``, every entry is run through
    ``process_image`` into ``temp_output``, and the result is zipped next to
    the original archive as ``<name>.cbz_new``.
    """
    # Create the argument parser
    parser = argparse.ArgumentParser(
        description='Remove watermark from images in cbz files in a directory')
    # Add the positional arguments
    parser.add_argument('watermark_sample_path', help='path to the watermark sample image',
                        default="watermark_sample.bmp", nargs='?')
    parser.add_argument(
        'directory_path', help='path to the directory containing the cbz', default="./", nargs='?')
    # Parse the command-line arguments
    args = parser.parse_args()

    # find all cbz files (not directories) in dir
    cbz = [
        file_name for file_name in os.listdir(args.directory_path)
        if os.path.isfile(os.path.join(args.directory_path, file_name))
        and file_name.endswith('.cbz')
    ]
    tqdm.write(f"Found following file to process: \n{cbz}")

    tqdm.write("Reading watermark sample...")
    sample = cv2.imread(args.watermark_sample_path)
    if sample is None:
        # cv2.imread signals failure by returning None instead of raising.
        parser.error(
            f"cannot read watermark sample image: {args.watermark_sample_path}")
    sample = crop_blank_space(sample)
    # Show the cropped sample so the user can verify it before processing.
    plt.imshow(sample)
    plt.show()

    input_temp_dir = "temp_input"
    output_temp_dir = "temp_output"
    # Remove leftovers from a previous run; each directory independently.
    for stale_dir in (input_temp_dir, output_temp_dir):
        shutil.rmtree(stale_dir, ignore_errors=True)

    for c in tqdm(cbz, desc="cbz"):
        # The names in cbz are relative to directory_path, not to the cwd.
        archive_path = os.path.join(args.directory_path, c)
        Path(output_temp_dir).mkdir(exist_ok=True)
        tqdm.write(f"Unpacking {c}...")
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            # Extract all the contents of the zip file
            zip_ref.extractall(input_temp_dir)
        for im in tqdm(os.listdir(input_temp_dir), desc="img"):
            im_path = os.path.join(input_temp_dir, im)
            ou_path = os.path.join(output_temp_dir, im)
            process_image(im_path, ou_path, sample)
        with zipfile.ZipFile(archive_path+"_new", 'w') as zip_file:
            # Iterate over all the files in the directory
            for root, _, files in os.walk(output_temp_dir):
                for file in files:
                    # Get the full path of the file
                    file_path = os.path.join(root, file)
                    # Add the file to the ZIP archive
                    zip_file.write(file_path, file)
        shutil.rmtree(output_temp_dir)
        # Start the next archive from an empty extraction directory.
        shutil.rmtree(input_temp_dir, ignore_errors=True)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment