Created
July 11, 2023 07:02
-
-
Save tamnguyenvan/75f2b538910770292acd479b506907c3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import shutil | |
| import os | |
| from pathlib import Path | |
| import cv2 | |
def load_files(dir: str, ext: str) -> list:
    """Recursively collect all files under *dir* with extension *ext*.

    Args:
        dir: Root directory to walk (str or Path).
        ext: File extension to match, including the leading dot
            (e.g. '.json').

    Returns:
        List of matching file paths as strings.
    """
    paths = []
    for root, _dirs, files in os.walk(dir):
        # os.walk never yields '.' or '..' entries, so the original
        # `if file not in ('.', '..')` guard was dead code and is dropped.
        for file in files:
            path = os.path.join(root, file)
            if os.path.splitext(path)[1] == ext:
                paths.append(path)
    return paths
# Dataset layout: ./Annotations/*.json with matching pictures in ./Images/.
data_dir = Path('.')
ann_dir, image_dir = data_dir / 'Annotations', data_dir / 'Images'
ann_files = load_files(ann_dir, '.json')
| import tqdm | |
| import random | |
| import json | |
| from typing import List, Dict | |
def merge_ann_files(ann_files: List[str]):
    """Merge several COCO-style annotation files into one dataset.

    Image and annotation ids are renumbered globally starting at 1.
    An annotation is kept only if it has a purely-numeric 'Number'
    attribute and its bbox is at least 20 px in both width and height;
    kept annotations get `text` set to that number and are mapped to
    the single category 'text' (id 1). Images that end up with no kept
    annotation are dropped.

    Args:
        ann_files: Paths of COCO JSON annotation files to merge.

    Returns:
        Tuple (images, image2anns, categories) where `image2anns`
        maps each new image id to its list of kept annotation dicts.
    """
    images = []
    categories = [{"supercategory": "none", "name": 'text', "id": 1}]
    image_id = 1
    ann_id = 1
    image2anns = dict()
    for ann_file in ann_files:
        # Use a context manager so the file handle is closed promptly
        # (the original `json.load(open(...))` leaked it).
        with open(ann_file) as f:
            ann_data = json.load(f)
        imageid_map = dict()  # old per-file image id -> new global id
        tmp_images = []
        for image in ann_data['images']:
            imageid_map[image['id']] = image_id
            image['id'] = image_id
            image_id += 1
            tmp_images.append(image)
        for ann in ann_data['annotations']:
            if 'attributes' in ann and 'Number' in ann['attributes']:
                text = ann['attributes']['Number']
                if all(ch in '0123456789' for ch in text):
                    _, _, w, h = ann['bbox']
                    # Skip boxes too small to hold a readable number.
                    if w < 20 or h < 20:
                        continue
                    img_id = imageid_map[ann['image_id']]
                    ann['id'] = ann_id
                    ann['image_id'] = img_id
                    ann_id += 1
                    ann['text'] = text
                    ann['category_id'] = 1
                    # (the original also built a flat `annotations` list,
                    # but it was never returned — removed as unused)
                    image2anns.setdefault(img_id, []).append(ann)
        # Keep only images that received at least one annotation.
        for image in tmp_images:
            if image['id'] in image2anns:
                images.append(image)
        print(len(images), len(image2anns))
    return images, image2anns, categories
def save_ann_file(images: List, image2anns: Dict, image_dir: str, outdir: str, split: str):
    """Crop every annotated box and write a text-recognition label file.

    Each annotation's bbox is cropped from its source image and saved
    as '<image_id>_box_<k>.jpg' under outdir/split; one line
    '<crop_filename>\\t<text>' per crop is written to outdir/<split>.txt.

    Args:
        images: COCO image dicts (renumbered 'id', 'file_name').
        image2anns: Mapping of image id -> list of annotation dicts
            (every id in *images* is assumed to be a key).
        image_dir: Directory holding the source images.
        outdir: Output root directory; created if missing.
        split: Split name, e.g. 'train' or 'test'.
    """
    image_outdir = os.path.join(outdir, split)
    os.makedirs(image_outdir, exist_ok=True)
    print(f'Creating annotation file: {outdir}')
    label_path = os.path.join(outdir, f'{split}.txt')
    with open(label_path, 'wt') as f:
        for image in tqdm.tqdm(images):
            image_id = image['id']
            image_src_path = os.path.join(image_dir, image['file_name'])
            img = cv2.imread(image_src_path)
            if img is None:
                # cv2.imread returns None for missing/corrupt files;
                # skip instead of crashing on the slice below.
                print(f'Warning: could not read {image_src_path}, skipping')
                continue
            for i, ann in enumerate(image2anns[image_id]):
                x, y, w, h = map(int, ann['bbox'])
                crop = img[y:y+h, x:x+w, :]
                crop_filename = f'{image_id:08d}_box_{i+1}.jpg'
                crop_dst_path = os.path.join(image_outdir, crop_filename)
                cv2.imwrite(crop_dst_path, crop)
                text = ann['text']
                f.write(f'{crop_filename}\t{text}\n')
# ---- Build the train/test split and export recognition crops ----
images, image2anns, categories = merge_ann_files(ann_files)

test_size = 0.2
random.seed(12)  # fixed seed so the shuffle (and split) is reproducible
num_train = int(len(images) * (1 - test_size))
random.shuffle(images)
train_images, test_images = images[:num_train], images[num_train:]
print(f'Train/Test split: {len(train_images)}/{len(test_images)}')

ann_outdir = 'jersey-text-recognition/'
# Was the IPython shell magic `!rm -rf {ann_outdir}` — replaced with
# shutil (already imported) so the script runs as plain Python too.
shutil.rmtree(ann_outdir, ignore_errors=True)
save_ann_file(train_images, image2anns, image_dir, ann_outdir, 'train')
save_ann_file(test_images, image2anns, image_dir, ann_outdir, 'test')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment