Created
October 15, 2016 08:24
-
-
Save ImanMousavi/6b8b770dbc0fa0ae25b1be22ec32f519 to your computer and use it in GitHub Desktop.
Removes the annoying watermarks of it-ebooks.info's downloaded eBooks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import re | |
import shutil | |
import argparse | |
import binascii | |
#
# WTF: This is a quick tool I've hacked together to easily remove the meta
#      information as well as the annoying link on each page of eBooks
#      downloaded from it-ebooks.info. The modified file keeps the
#      original file name, and the original file is renamed by appending
#      '.old' (e.g. 'book.pdf' becomes 'book.pdf.old'). 'pattern' is the
#      regex pattern which is used to remove the annotation elements; the
#      rough structure of such an element looks like this:
#
#      obj
#      <<
#          /Type /Annot
#          /Subtype /Link
#          /Rect [ 264 91 348 79 ]  # The digits on this line will differ
#          /Border [ 0 0 0 ]        # The same goes for the digits on this line
#          /A <<
#              /Type /Action
#              /S /URI
#              /URI (http://www.it-ebooks.info/)
#          >>
#      >>
#      endobj
#
# Regex, meant to be applied to *hexlified* PDF bytes, that matches one
# it-ebooks.info link annotation. Every literal below is the lowercase hex
# encoding of the text shown in its trailing comment. The two ``.*?``
# separators stand for the /Rect coordinates and the /Border value, which
# differ from annotation to annotation.
pattern = b'.*?'.join((
    b'0a2f54797065202f416e6e6f74'        # "\n/Type /Annot"
    b'0a2f53756274797065202f4c696e6b'    # "\n/Subtype /Link"
    b'0a2f52656374205b20',               # "\n/Rect [ "
    b'205d'                              # " ]"
    b'0a2f426f7264657220',               # "\n/Border "
    b'0a2f41203c3c'                      # "\n/A <<"
    b'0a2f54797065202f416374696f6e'      # "\n/Type /Action"
    b'0a2f53202f555249'                  # "\n/S /URI"
    b'0a2f5552492028'                    # "\n/URI ("
    b'687474703a2f2f'                    # "http://"
    b'7777772e69742d65626f6f6b732e696e666f'  # "www.it-ebooks.info"
    b'2f29'                              # "/)"
    b'0a3e3e',                           # "\n>>"
))
# Matches one it-ebooks.info link annotation directly in the raw PDF bytes.
# The variable parts (/Rect coordinates, /Border value) are limited to a
# single line so a match can never run from an unrelated annotation into a
# later it-ebooks one.
_LINK_ANNOTATION_RE = re.compile(
    rb'\n/Type /Annot'
    rb'\n/Subtype /Link'
    rb'\n/Rect \[ [^\n]*? \]'
    rb'\n/Border [^\n]*?'
    rb'\n/A <<'
    rb'\n/Type /Action'
    rb'\n/S /URI'
    rb'\n/URI \(http://www\.it-ebooks\.info/\)'
    rb'\n>>'
)


def remove_evil_links(pdf_data):
    """Remove all it-ebooks links and metadata from the passed PDF data.

    Args:
        pdf_data: The complete content of a PDF file as ``bytes``.

    Returns:
        A new ``bytes`` object in which every it-ebooks.info link
        annotation body and every remaining occurrence of the text
        ``www.it-ebooks.info`` has been deleted.

    The work is done on the raw bytes rather than on a hex dump of them.
    Matching on hex text can hit at an odd nibble offset, i.e. in the
    middle of a byte, which silently corrupts unrelated data. It also
    doubles memory use. The module-level hex ``pattern`` is therefore not
    used here.

    NOTE(review): bytes are deleted without rebuilding anything else in
    the file, so object offsets after a removed span shift. This is the
    same as the tool has always behaved.
    """
    # Remove each annotation element inside the PDF file
    # (this removes the "clickable" it-ebooks.info links).
    new_data = _LINK_ANNOTATION_RE.sub(b'', pdf_data)
    # Remove the actual links
    # (link text which is assigned to the annotations / metadata).
    return new_data.replace(b'www.it-ebooks.info', b'')
def main(args):
    """Strip the it-ebooks watermarks from every file in ``args.files``.

    Args:
        args: Parsed command-line namespace providing ``files`` (list of
            paths), ``no_backup`` (bool) and ``verbose`` (bool).

    For each path the file is read and cleaned with
    ``remove_evil_links``. Unless ``args.no_backup`` is set, the original
    is then moved to ``<path>.old``, and the cleaned data is written back
    to ``<path>``. Files that cannot be read are reported on stderr and
    skipped. Ctrl-C stops processing quietly.
    """
    try:
        # Drop duplicate paths but keep the command-line order; a plain
        # ``set`` would process the files in arbitrary order.
        args.files = list(dict.fromkeys(args.files))
        for file_path in args.files:
            if not file_path:
                continue
            if args.verbose:
                print('Processing: {0}'.format(file_path))
            try:
                with open(file_path, 'rb') as input_file:
                    pdf_data = input_file.read()
            except OSError as e:
                # ``strerror`` can be None; fall back to the exception text.
                sys.stderr.write(
                    '{0}: {1}\n'.format(file_path, e.strerror or e))
                sys.stderr.flush()
                continue
            # Build the modified data *before* touching anything on disk,
            # so a failure here leaves the original file where it was.
            new_pdf_data = remove_evil_links(pdf_data)
            # Backup the file with a different name
            if not args.no_backup:
                backup_path = '{0}.old'.format(file_path)
                if args.verbose:
                    print('Creating backup: {0}'.format(backup_path))
                shutil.move(file_path, backup_path)
            # Save the new file
            with open(file_path, 'wb') as out_file:
                out_file.write(new_pdf_data)
            if args.verbose:
                print('Saving modified file: {0}'.format(file_path))
    except KeyboardInterrupt:
        # Deliberate: let the user abort a batch without a traceback.
        pass
def build_arg_parser():
    """Create the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` whose parsed namespace provides
        ``files`` (non-empty list of paths), ``no_backup`` (bool) and
        ``verbose`` (bool).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--files',
        help='One or more PDF files to remove it-ebook watermarks.',
        # '+' instead of '*': a bare "-f" with no paths is rejected with a
        # usage error instead of silently doing nothing.
        nargs='+', required=True
    )
    parser.add_argument(
        '-n', '--no-backup',
        help='Disables the creating of backups for the files ' +
             'which are being processed.',
        action='store_true'
    )
    parser.add_argument(
        '-v', '--verbose',
        help='Print a progress message for each processing step.',
        action='store_true'
    )
    return parser


if __name__ == '__main__':
    parser = build_arg_parser()
    args = parser.parse_args()
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment