Last active
May 12, 2022 20:58
-
-
Save Magnus167/30de7aa86f503ac5228ae8ed15e83525 to your computer and use it in GitHub Desktop.
Python script to reduce horizontal white space in images from PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import PIL.Image as Image | |
| import pdf2image | |
| import sys, os, glob | |
| from tqdm import tqdm | |
def get_files(path, ext='pdf'):
    """Return the bare names of the files in *path* with the given extension.

    Args:
        path: Directory to search (not recursive).
        ext: File extension without the leading dot. It is lower-cased
            before being placed in the glob pattern.

    Returns:
        A list of file names without any directory component, in the order
        ``glob.glob`` yields them.
    """
    pattern = path + '/*.' + ext.lower()
    # os.path.basename understands the running platform's separators. The
    # earlier ``sys.platform == 'posix'`` test never matched (sys.platform is
    # 'linux', 'darwin', 'win32', ...), so names were always split on a
    # backslash and kept their directory prefix on POSIX systems.
    return [os.path.basename(match) for match in glob.glob(pattern)]
def pdf_to_images(pdf_path):
    """Render the PDF at *pdf_path* and return its pages as a new list.

    The pages are whatever ``pdf2image.convert_from_path`` produces when it
    is called with only the path argument.
    """
    rendered = pdf2image.convert_from_path(pdf_path)
    return list(rendered)
def save_np_arr_as_img(np_arr, img_path):
    """Write *np_arr* to *img_path* as an image and return ``True``.

    The array is converted with ``np.uint8`` before it is handed to
    ``Image.fromarray``; the image is then saved via its ``save`` method.
    """
    pixels = np.uint8(np_arr)
    picture = Image.fromarray(pixels)
    picture.save(img_path)
    return True
def iterate_over_rows(img_np_arr):
    """Collapse runs of identical adjacent rows and columns.

    A row is removed when it equals the row directly after it, and a column
    is removed when it equals the column directly after it, so only the last
    line of each run survives. Both comparisons are made on the array as it
    was passed in; rows are deleted first, then columns.

    Returns a new array; the input array is not modified.
    """
    n_dims = img_np_arr.ndim

    # Row i is a duplicate when every element matches row i + 1.
    trailing_axes = tuple(range(1, n_dims))
    same_as_next_row = np.all(
        img_np_arr[:-1] == img_np_arr[1:], axis=trailing_axes
    )

    # Column j is a duplicate when every element matches column j + 1.
    non_column_axes = (0,) + tuple(range(2, n_dims))
    same_as_next_col = np.all(
        img_np_arr[:, :-1] == img_np_arr[:, 1:], axis=non_column_axes
    )

    without_rows = np.delete(
        img_np_arr, np.flatnonzero(same_as_next_row), axis=0
    )
    return np.delete(without_rows, np.flatnonzero(same_as_next_col), axis=1)
def convert(pdfs_path='./pdf_files', out_path='./out_imgs'):
    """Convert every PDF in *pdfs_path* into trimmed PNG page images.

    Each page is rendered with ``pdf_to_images``, passed through
    ``iterate_over_rows`` and saved into *out_path* as
    ``<pdf name without extension>-<page index>.png`` (index starts at 0).

    Args:
        pdfs_path: Directory that holds the ``.pdf`` files.
        out_path: Directory for the output images; created when missing.

    Returns:
        ``True`` once every listed file has been processed.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_path, exist_ok=True)

    for pdf_file in tqdm(get_files(pdfs_path, 'pdf')):
        # Work from the bare file name so a listing entry that carries a
        # directory prefix cannot be joined onto pdfs_path a second time.
        pdf_name = os.path.basename(pdf_file)
        # splitext removes the real extension whatever its length; slicing
        # four characters and appending '.png' before the page suffix used to
        # produce doubled names such as "doc.png-0.png".
        stem = os.path.splitext(pdf_name)[0]

        pages = pdf_to_images(os.path.join(pdfs_path, pdf_name))
        for page_no, page in enumerate(tqdm(pages)):
            trimmed = iterate_over_rows(np.array(page))
            target = os.path.join(out_path, stem + '-' + str(page_no) + '.png')
            save_np_arr_as_img(trimmed, target)
    return True
# Script entry: convert the PDFs in ./pdf_files into images under ./out_imgs.
convert(pdfs_path='./pdf_files', out_path='./out_imgs')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment