yig · October 25, 2024 03:53
diff --git a/sig2text.py b/sig2text.py
 '''
 # Author: Yotam Gingold <[email protected]>
 # License: CC0
 # URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>

 ## About

 Converts a PDF file assumed to be a two-column ACM article (or a CGF article, with `-metrics CGF`) to text. Ignores reviewer red numbering.

 ## Install

    pip install pypdf==4.0.2

 To remove hyphenation:

    pip install spacy==3.7.4
    python -m spacy download en_core_web_sm

 ## Usage

 By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.

     python3 sig2text.py [-metrics {ACM,CGF}] file.pdf [out.txt]
 '''

 import argparse

 ## Command-line interface: a required input PDF, an optional output path,
 ## and a switch selecting which journal's column bounding boxes to use.
 parser = argparse.ArgumentParser(
    description = 'Convert two-column ACM articles to text.',
    )
 parser.add_argument(
    'inpath',
    type = str,
    help = 'Path to input PDF file.',
    )
 parser.add_argument(
    'outpath',
    type = str,
    nargs = '?',
    default = None,
    help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.',
    )
 parser.add_argument(
    '-metrics',
    type = str,
    choices = ['ACM', 'CGF'],
    default = 'ACM',
    help = 'Which bounding boxes to use. Choices: ACM, CGF. Default is ACM.',
    )
 args = parser.parse_args()

 ## Open the PDF named on the command line and report the active layout.
 from pypdf import PdfReader
 print( "Loading:", args.inpath )
 reader = PdfReader( args.inpath )

 print( f"Using {args.metrics} metrics." )

 ## Crop boxes for the two text columns, keyed by journal layout.
 ## Entry 0 is column 1 and entry 1 is column 2.
 ## Every box is [ left, right, top, bottom ].
 metrics = [
    { 'ACM': [ 43, 313, 75, 705 ], 'CGF': [ 47, 292, 78, 726 ] },
    { 'ACM': [ 313, 575, 75, 705 ], 'CGF': [ 311, 557, 78, 726 ] },
    ]

 ## Collect the text of every page, column 1 first and then column 2.
 parts = []
 for page_index, page in enumerate( reader.pages ):

    def visit_crop( text, user_matrix, tm_matrix, font_dict, font_size ):
        '''
        Text visitor passed to `page.extract_text()`.

        Appends `text` to `parts` when its position lies inside the current
        `crop` box ([ left, right, top, bottom ]). The position is read from
        `tm_matrix[4:6]`. `user_matrix` and `font_dict` are unused.
        '''
        x, y = tm_matrix[4:6]
        # y is from the bottom, so flip it
        y = page.mediabox[3] - y

        ## Keep only what's inside the crop box
        if x < crop[0] or x > crop[1] or y < crop[2] or y > crop[3]: return

        ## Check for the noise on the first page.
        ## Pulling the bottom of the box up to this chunk rejects everything
        ## visited afterwards that sits below it.
        if page_index == 0 and text.startswith( "Permission to make digital or hard copies" ):
            crop[3] = y
            return

        ## Skip small solo numbers in CGF
        if args.metrics in ('CGF',) and text.strip().isdigit() and font_size == 5.3798:
            return

        parts.append(text)

    ## Column 1, then column 2.
    for column_metrics in metrics:
        ## Work on a copy of the box: visit_crop() shrinks `crop[3]` on the first
        ## page, and that must not leak into the shared `metrics` table, or the
        ## bottom of the column would stay cropped on every later page.
        crop = list( column_metrics[args.metrics] )
        page.extract_text( visitor_text = visit_crop )
        ## End the column with a newline so columns do not run together.
        if parts: parts[-1] += '\n'

 text_body = "".join( parts )

 ## Remove hyphenation.
 ## This step is optional: it only runs when spaCy and its English model load.
 REMOVE_HYPHENATION = False
 try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load("en_core_web_sm")
 except Exception as error:
    ## Best effort: keep the hyphenated text when spaCy or its model cannot be loaded.
    ## `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
    ## SystemExit propagate; the reason is reported on stderr so that stdout output
    ## stays clean.
    import sys
    print( "Hyphenation not removed (spaCy unavailable):", error, file = sys.stderr )
 else:
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True

 if REMOVE_HYPHENATION:
    def is_english(word):
        '''Return True when spaCy reports the language of `word` as "en".'''
        doc = nlp(word)
        # Check if the language of the word is English
        # NOTE(review): `doc.lang_` appears to report the loaded pipeline's language,
        # not a per-word detection, so this is probably always True -- confirm.
        return doc.lang_ == "en"

    import re
    ## letters, a hyphen, one or more newlines, letters
    pattern = re.compile(r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)')

    def replace_if_english( match ):
        '''
        Substitution callback for `pattern`.

        Returns a newline followed by the two word halves joined without the
        hyphen when `is_english()` accepts that string; otherwise returns the
        matched text unchanged.
        '''
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english( dehyphen ) else match.group(0)

    original_text_body = text_body
    text_body = pattern.sub( replace_if_english, original_text_body )

 ## Default output path: the input path with its suffix replaced by `.txt`.
 if args.outpath is None:
    from pathlib import Path
    args.outpath = Path(args.inpath).with_suffix( '.txt' )

 ## `-` prints the text to stdout; anything else is treated as a file path.
 if args.outpath == '-':
    print( text_body )
 else:
    ## Write UTF-8 explicitly: the locale's default encoding may be unable to
    ## represent characters extracted from the PDF and would raise UnicodeEncodeError.
    with open( args.outpath, 'w', encoding = 'utf-8' ) as f: f.write( text_body )
    print( "Saved:", args.outpath )
	'''
	# Author: Yotam Gingold <[email protected]>
	# License: CC0
	# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>

	## About

	Converts a PDF file assumed to be a two-column ACM article to text. Ignores reviewer red numbering.

	## Install

	    pip install pypdf==4.0.2

	To remove hyphenation:

	    pip install spacy==3.7.4
	    python -m spacy download en_core_web_sm

	## Usage

	By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.

	    python3 sig2text.py [-metrics {ACM,CGF}] file.pdf [out.txt]
	'''

	import argparse

	## Command-line interface: a required input PDF, an optional output path,
	## and a switch selecting which journal's column bounding boxes to use.
	parser = argparse.ArgumentParser(
		description = 'Convert two-column ACM articles to text.',
		)
	parser.add_argument(
		'inpath',
		type = str,
		help = 'Path to input PDF file.',
		)
	parser.add_argument(
		'outpath',
		type = str,
		nargs = '?',
		default = None,
		help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.',
		)
	parser.add_argument(
		'-metrics',
		type = str,
		choices = ['ACM', 'CGF'],
		default = 'ACM',
		help = 'Which bounding boxes to use. Choices: ACM, CGF. Default is ACM.',
		)
	args = parser.parse_args()

	## Open the PDF named on the command line and report the active layout.
	from pypdf import PdfReader
	print( "Loading:", args.inpath )
	reader = PdfReader( args.inpath )

	print( f"Using {args.metrics} metrics." )

	## Crop boxes for the two text columns, keyed by journal layout.
	## Entry 0 is column 1 and entry 1 is column 2.
	## Every box is [ left, right, top, bottom ].
	metrics = [
		{ 'ACM': [ 43, 313, 75, 705 ], 'CGF': [ 47, 292, 78, 726 ] },
		{ 'ACM': [ 313, 575, 75, 705 ], 'CGF': [ 311, 557, 78, 726 ] },
		]

	## Collect the text of every page, column 1 first and then column 2.
	parts = []
	for page_index, page in enumerate( reader.pages ):

		def visit_crop( text, user_matrix, tm_matrix, font_dict, font_size ):
			'''
			Text visitor passed to `page.extract_text()`.

			Appends `text` to `parts` when its position lies inside the current
			`crop` box ([ left, right, top, bottom ]). The position is read from
			`tm_matrix[4:6]`. `user_matrix` and `font_dict` are unused.
			'''
			x, y = tm_matrix[4:6]
			# y is from the bottom, so flip it
			y = page.mediabox[3] - y

			## Keep only what's inside the crop box
			if x < crop[0] or x > crop[1] or y < crop[2] or y > crop[3]: return

			## Check for the noise on the first page.
			## Pulling the bottom of the box up to this chunk rejects everything
			## visited afterwards that sits below it.
			if page_index == 0 and text.startswith( "Permission to make digital or hard copies" ):
				crop[3] = y
				return

			## Skip small solo numbers in CGF
			if args.metrics in ('CGF',) and text.strip().isdigit() and font_size == 5.3798:
				return

			parts.append(text)

		## Column 1, then column 2.
		for column_metrics in metrics:
			## Work on a copy of the box: visit_crop() shrinks `crop[3]` on the first
			## page, and that must not leak into the shared `metrics` table, or the
			## bottom of the column would stay cropped on every later page.
			crop = list( column_metrics[args.metrics] )
			page.extract_text( visitor_text = visit_crop )
			## End the column with a newline so columns do not run together.
			if parts: parts[-1] += '\n'

	text_body = "".join( parts )

	## Remove hyphenation.
	## This step is optional: it only runs when spaCy and its English model load.
	REMOVE_HYPHENATION = False
	try:
		import spacy
		# Load the English NLP model
		nlp = spacy.load("en_core_web_sm")
	except Exception as error:
		## Best effort: keep the hyphenated text when spaCy or its model cannot be loaded.
		## `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
		## SystemExit propagate; the reason is reported on stderr so that stdout output
		## stays clean.
		import sys
		print( "Hyphenation not removed (spaCy unavailable):", error, file = sys.stderr )
	else:
		# Let's remove hyphenation!
		REMOVE_HYPHENATION = True

	if REMOVE_HYPHENATION:
		def is_english(word):
			'''Return True when spaCy reports the language of `word` as "en".'''
			doc = nlp(word)
			# Check if the language of the word is English
			# NOTE(review): `doc.lang_` appears to report the loaded pipeline's language,
			# not a per-word detection, so this is probably always True -- confirm.
			return doc.lang_ == "en"

		import re
		## letters, a hyphen, one or more newlines, letters
		pattern = re.compile(r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)')

		def replace_if_english( match ):
			'''
			Substitution callback for `pattern`.

			Returns a newline followed by the two word halves joined without the
			hyphen when `is_english()` accepts that string; otherwise returns the
			matched text unchanged.
			'''
			dehyphen = '\n' + match.group(1) + match.group(2)
			return dehyphen if is_english( dehyphen ) else match.group(0)

		original_text_body = text_body
		text_body = pattern.sub( replace_if_english, original_text_body )

	## Default output path: the input path with its suffix replaced by `.txt`.
	if args.outpath is None:
		from pathlib import Path
		args.outpath = Path(args.inpath).with_suffix( '.txt' )

	## `-` prints the text to stdout; anything else is treated as a file path.
	if args.outpath == '-':
		print( text_body )
	else:
		## Write UTF-8 explicitly: the locale's default encoding may be unable to
		## represent characters extracted from the PDF and would raise UnicodeEncodeError.
		with open( args.outpath, 'w', encoding = 'utf-8' ) as f: f.write( text_body )
		print( "Saved:", args.outpath )