Last active
October 25, 2024 03:53
-
-
Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
# Author: Yotam Gingold <[email protected]>
# License: CC0
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>

## About

Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.

## Install

    pip install pypdf==4.0.2

To remove hyphenation:

    pip install spacy==3.7.4
    python -m spacy download en_core_web_sm

## Usage

By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.
Pass `-metrics CGF` to use the CGF bounding boxes instead of the default ACM ones.

    python3 sig2text.py [-metrics {ACM,CGF}] file.pdf [out.txt]
'''
import argparse | |
## Command-line interface
# A required input PDF, an optional output path, and a switch that selects
# which layout's crop boxes are used.
parser = argparse.ArgumentParser(
    description='Convert two-column ACM articles to text.',
)
parser.add_argument(
    'inpath',
    type=str,
    help='Path to input PDF file.',
)
parser.add_argument(
    'outpath',
    type=str,
    nargs='?',
    default=None,
    help='Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.',
)
parser.add_argument(
    '-metrics',
    type=str,
    choices=['ACM', 'CGF'],
    default='ACM',
    help='Which bounding boxes to use. Choices: ACM, CGF. Default is ACM.',
)
# Parsed once at start-up; the rest of the script reads `args` directly.
args = parser.parse_args()
from pypdf import PdfReader | |
## Open the input PDF
import sys

# Progress messages go to stderr so they never end up mixed into the
# extracted text when the output path is `-` (stdout).
print("Loading:", args.inpath, file=sys.stderr)
reader = PdfReader(args.inpath)
print(f"Using {args.metrics} metrics.", file=sys.stderr)
## Crop boxes, one entry per column, keyed by layout name.
# Each box is [ left, right, top, bottom ]. Top and bottom are compared
# against a y coordinate that has been flipped to count from the top of
# the page. Units are presumably PDF points -- TODO confirm.
# Boxes are plain lists of four numbers.
metrics = [
    ## Column 1
    dict(ACM=[43, 313, 75, 705], CGF=[47, 292, 78, 726]),
    ## Column 2
    dict(ACM=[313, 575, 75, 705], CGF=[311, 557, 78, 726]),
]
## Extract the text of every page, column by column.
# `parts` collects the kept text fragments in visiting order: for each page,
# everything from column 1 followed by everything from column 2.
parts = []
for page_index, page in enumerate(reader.pages):
    def visit_crop(text, user_matrix, tm_matrix, font_dict, font_size):
        '''
        Visitor for `page.extract_text()`: keep `text` if it lies inside `crop`.

        Reads the fragment's position from entries 4 and 5 of `tm_matrix`,
        flips y so that it counts from the top of the page, and appends
        `text` to `parts` unless the fragment is outside the current `crop`
        box or is recognized as noise. Returns nothing.
        '''
        x, y = tm_matrix[4:6]
        # y is from the bottom, so flip it
        y = page.mediabox[3] - y

        ## Keep only what's inside the crop box
        if x < crop[0] or x > crop[1] or y < crop[2] or y > crop[3]:
            return

        ## Check for the noise on the first page
        # Once the permission notice is seen, pull the bottom of the box up
        # to it so that it and any later fragment below it are dropped.
        if page_index == 0 and text.startswith("Permission to make digital or hard copies"):
            crop[3] = y
            return

        ## Skip small solo numbers in CGF
        # The exact font size presumably identifies the reviewer line
        # numbers -- TODO confirm it is stable across documents.
        if args.metrics in ('CGF',) and text.strip().isdigit() and font_size == 5.3798:
            return

        parts.append(text)

    ## Column 1, then column 2
    for column_metrics in metrics:
        # left, right, top, bottom
        # Work on a copy: the visitor may shrink the box on the first page,
        # and that change must not leak into the shared `metrics` table,
        # otherwise the same column would stay truncated on every later page.
        crop = list(column_metrics[args.metrics])
        page.extract_text(visitor_text=visit_crop)
        # End the column with a line break so columns do not run together.
        if parts:
            parts[-1] += '\n'

text_body = "".join(parts)
## Remove hyphenation
# Optional step: it only runs when spaCy and its small English model load.
import sys

REMOVE_HYPHENATION = False
try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load("en_core_web_sm")
except Exception as err:
    # Best effort: a missing or broken spaCy install must not stop the
    # conversion. Unlike a bare `except`, this lets KeyboardInterrupt and
    # SystemExit through, and it says why the step was skipped.
    print("Hyphenation removal disabled:", err, file=sys.stderr)
else:
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True

if REMOVE_HYPHENATION:
    import re

    def is_english(word):
        '''
        Return True if the spaCy document built from `word` reports "en".

        NOTE(review): `doc.lang_` appears to report the language of the
        loaded pipeline rather than detect the language of `word`, so with
        an English model this probably accepts every candidate -- confirm
        against the spaCy documentation before relying on it as a filter.
        '''
        doc = nlp(word)
        # Check if the language of the word is English
        return doc.lang_ == "en"

    # Letters, a hyphen, one or more newlines, then more letters: a word
    # that was split across a line break.
    pattern = re.compile(r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)')

    def replace_if_english(match):
        '''
        Substitution callback for `pattern`.

        Joins the two halves of the split word and moves the line break in
        front of the joined word; if `is_english` rejects the result, the
        original matched text is returned unchanged.
        '''
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english(dehyphen) else match.group(0)

    original_text_body = text_body
    text_body = pattern.sub(replace_if_english, original_text_body)
## Write the result
# With no output path given, save next to the input with a `.txt` suffix.
if args.outpath is None:
    from pathlib import Path
    args.outpath = Path(args.inpath).with_suffix('.txt')

if args.outpath == '-':
    # `-` means: print the text to stdout instead of saving a file.
    print(text_body)
else:
    # Write UTF-8 explicitly: the extracted text may contain characters
    # that the platform's default encoding cannot represent.
    with open(args.outpath, 'w', encoding='utf-8') as f:
        f.write(text_body)
    print("Saved:", args.outpath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment