Created
September 19, 2020 19:41
-
-
Save Pop101/53797c0438f372e59bdc1885306f3fff to your computer and use it in GitHub Desktop.
A command-line interface for extracting a pdf's usable text into json files, separated by chapter.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from utils import * | |
from pdfminer.high_level import extract_pages | |
from pdfminer.layout import LTTextContainer, LTChar, LTPage, LTTextLine | |
def avg_char_height(container:LTTextContainer):
    """Estimate the mean character height (font size) inside a text container.

    Samples up to 4 lines x 20 chars rather than walking every glyph.
    Returns 0 when no characters could be sampled.
    """
    sampled = n_sample(container, 2, required_types=[LTTextLine, LTChar], max_samples=[4, 20])
    sizes = [ch.size for ch in sampled]
    return sum(sizes) / len(sizes) if sizes else 0
def most_common_font(page:LTPage):
    """Return the most frequent font name among a sample of chars on *page*.

    Returns '' when the page yields no sampled characters.
    """
    sampled = n_sample(page, 3, required_types=[LTTextContainer, LTTextLine, LTChar], max_samples=[50, 3, 20])
    if not sampled:
        return ''
    names = [ch.fontname for ch in sampled]
    return Freq_list(names)[-1]
def get_font(container:LTTextContainer):
    """Return the dominant font name of *container* ('' if it has no chars)."""
    sampled = n_sample(container, 2, required_types=[LTTextLine, LTChar], max_samples=[4, 20])
    names = [ch.fontname for ch in sampled]
    return Freq_list(names)[-1] if names else ''
def _flush_chapter(cptr_list, cptr_txt, verbose=False):
    """Append the accumulated chapter (title, page, text) to cptr_list.

    A chapter is only recorded when it has a page number and its text
    contains at least one space (i.e. more than one word).
    """
    if len(cptr_txt[1]) > 0 and cptr_txt[2].count(' ') > 0:
        cptr_list.append({'chapter': cptr_txt[0],'page':cptr_txt[1],'contents':cptr_txt[2]})
        if verbose: print('Finished chapter '+str(cptr_txt[0:2]))
        if verbose: print('Words in chapter '+str(cptr_txt[2].count(' ')))

def parse_pdf(path:str, chapter_threshold:float=4.5, verbose=False):
    """Extract chapters from the pdf at *path*.

    A text element whose average character height exceeds
    chapter_threshold * (the document's most common font size) starts a new
    chapter; body text is collected only when it matches the document's
    dominant font and is within 2pt of the dominant size.

    Returns a list of {'chapter': title, 'page': page id, 'contents': text}
    dicts (empty list if the pdf contains no sampled text).
    """
    # First pass (sampled, first 120 pages): find the dominant font and size.
    if verbose: print('Getting most common font... ')
    font, font_size = Freq_list(), Freq_list()
    for page in sample_list(extract_pages(path, maxpages=120), max_samples=120, fast=True):
        font.add(most_common_font(page))
        font_size.add_all([str(int(round(avg_char_height(c)))) for c in sample_list(page, fast=True, required_type=LTTextContainer)])
    if font_size[-1] is None:
        # No text at all was sampled; the original crashed on int(None) here.
        return []
    if verbose: print('Most common font: '+str(font[-1])+' with size '+str(font_size[-1]))
    if verbose and len(font) > 3: print('The three most common fonts are '+str(font[-3:]))
    # Second pass: walk every page, splitting the text into chapters.
    cptr_list = list()
    cptr_txt = ["", "", ""]  # [chapter title, page id, accumulated body text]
    common_size = int(font_size[-1])
    for page_layout in extract_pages(path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                # If this element is a title (chapter_threshold times as big as normal text)
                if avg_char_height(element) > chapter_threshold*common_size:
                    _flush_chapter(cptr_list, cptr_txt, verbose=verbose)
                    cptr_txt[0] = element.get_text().encode('ascii', errors='ignore').strip().decode(errors='ignore')
                    cptr_txt[1] = str(page_layout.pageid)
                    cptr_txt[2] = ""
                # If this element has the correct font and similar size, add it to the chapter
                if len(font) > 0 and font[-1] in get_font(element) and abs(common_size - avg_char_height(element)) < 2:
                    # BUGFIX: the original encoded as utf-7 then decoded as
                    # utf-8, mangling any non-ascii body text; use ascii like
                    # the title handling above.
                    cptr_txt[2] += element.get_text().encode('ascii', errors='ignore').strip().decode(errors='ignore')
    # Flush the final chapter, which has no following title to trigger it.
    _flush_chapter(cptr_list, cptr_txt, verbose=verbose)
    return cptr_list
import sys, getopt, os, json | |
from pathlib import Path | |
def mutate_file_extension(audio_path:Path, ext:str='.txt', dir:str='./pdfs'):
    """Build the output path for *audio_path* with extension *ext*.

    When *dir* is empty (or a single character, e.g. '.'), the new path
    stays next to the original file; otherwise *dir* is created if needed
    and the file is placed inside it.
    """
    renamed = Path(audio_path.stem + ext)
    if len(dir) <= 1:
        # No usable dir given: keep the output beside the source file.
        return audio_path.parent / renamed
    os.makedirs(dir, exist_ok=True)
    return Path(dir) / renamed
# Usage/help text printed for -h, on bad options, or when no pdfs are given.
# BUGFIX: the original described -i as taking "subreddits" (copy-paste from
# another script) and misspelled "concatenated".
HELP_STR = """Usage:
python3 pdfextract.py <pdf 1> <pdf 2> ....
Note that every hundred pages takes about 1 minute to parse!
\nCommand Line Options:
-h --help: Prints this. Ignores all other options.
-c --clean: Deletes the default or the set directory. Ignores all other options.
-i --inputfile: Defines a list of pdf paths (1 per line) to go through (instead of args).
-o --outputdir: The output directory. The same dir as the given pdfs if not supplied.
-t --threshold: How many times bigger text needs to be from surrounding text to be a chapter. Defaults to 4.5
-s --single: All outputs will be concatenated into a single file (pdfs.txt)
-b --bypass: Skips the pdf if the target file already exists.
-v --verbose: Enables more detailed printing.
"""
def main(argv):
    """Command-line entry point: parse options, then extract each pdf to json.

    See HELP_STR for the supported options. Exits 0 on success/help,
    2 on bad options or invalid input file.
    """
    threshold = 4.5
    outputdir = ''
    single = False
    bypass = False
    verbose = False
    # Treat every non-dash argument as a pdf path and keep only dash options
    # for getopt. BUGFIX: the original removed items from argv while
    # iterating it, which silently skips every other positional argument.
    pdfpaths = [arg for arg in argv if not str(arg).startswith('-')]
    argv = [arg for arg in argv if str(arg).startswith('-')]
    try:
        # BUGFIX: 't' takes a value so it needs a ':'; the long option was
        # declared as 'outputfile=' but the handler checks '--outputdir'.
        opts, args = getopt.getopt(argv,"hi:o:sbvct:",["help","inputfile=","outputdir=","single","bypass","verbose","clean","threshold="])
    except getopt.GetoptError:
        print(HELP_STR)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h','--help'):
            print(HELP_STR)
            sys.exit(0)
        elif opt in ('-c', '--clean'):
            # NOTE(review): only removes the chosen dir if -o precedes -c on
            # the command line; otherwise outputdir is still '' here.
            if os.path.exists(outputdir):
                import shutil
                shutil.rmtree(outputdir)
            sys.exit(0)
        elif opt in ('-s', '--single'):
            single = not single
        elif opt in ('-b', '--bypass'):
            bypass = not bypass
        elif opt in ('-v', '--verbose'):
            verbose = not verbose
        elif opt in ('-o','--outputdir'):
            outputdir = arg
        elif opt in ('-t', '--threshold'):
            try:
                # BUGFIX: was float(opt) (i.e. float('-t')), which always
                # raised and rejected every threshold.
                threshold = float(arg)
            except ValueError:
                print('Invalid threshold! Must be a valid number')
                sys.exit(2)
        elif opt in ('-i', '--inputfile'):
            if not os.path.exists(arg):
                print('Input file invalid!')
                sys.exit(2)
            with open(arg, 'r') as file:
                # Strip trailing newlines/blank lines so the paths are usable.
                pdfpaths.extend(line.strip() for line in file if line.strip())
    # Catch no pdfs given
    if len(pdfpaths) <= 0:
        print(HELP_STR)
        sys.exit(0)
    # Make working directory
    if len(outputdir) > 1 and not os.path.exists(outputdir):
        os.makedirs(outputdir)
    # Loop through all pdfs
    pdf_dict = dict()
    for pdf in pdfpaths:
        if verbose: print('Beginning pdf "'+str(pdf)+'"')
        pdf_path = Path(pdf)
        if not pdf_path.exists(): print('Error: pdf "'+pdf+'" does not exist')
        if bypass and mutate_file_extension(pdf_path, dir=outputdir).exists():
            if verbose: print('Skipping pdf!')
            continue
        # BUGFIX: the parsed threshold was never forwarded to parse_pdf.
        extracted_pdf = parse_pdf(pdf, chapter_threshold=threshold, verbose=verbose)
        if single:
            pdf_dict[pdf] = extracted_pdf
        else:
            extracted_pdf_dict = {'name':pdf_path.stem,'content': extracted_pdf}
            with open(os.fspath(mutate_file_extension(pdf_path, dir=outputdir)),'w') as file:
                json.dump(extracted_pdf_dict, file)
    if single:
        singleFile = os.path.join(outputdir,'pdfs.txt')
        with open(singleFile, 'w') as file:
            json.dump(pdf_dict, file)

if __name__ == "__main__":
    main(sys.argv[1:])
# To load: | |
# import ast | |
# ast.literal_eval(str) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import collections.abc | |
class Freq_list:
    """A frequency-counting collection.

    Counts how often each element is added. Iteration yields the *unique*
    elements ordered from least to most frequent (ties broken by the
    elements themselves), so ``fl[-1]`` is the most common element.
    Note the asymmetry: ``len()`` returns the *total* number of additions,
    not the number of unique elements.
    """

    def __init__(self, base=None):
        """Create an empty counter, optionally seeded from iterable *base*."""
        # element -> occurrence count
        self.tracker = dict()
        if isinstance(base, collections.abc.Iterable): self.add_all(base)

    def add_all(self, iterable):
        """Add every element of *iterable* (count 1 each)."""
        assert isinstance(iterable, collections.abc.Iterable)
        for item in iterable:
            self.add(item)

    def add(self, element, value=1):
        """Increase *element*'s count by *value* (default 1)."""
        # dict.get collapses the original if/else insert-or-increment.
        self.tracker[element] = self.tracker.get(element, 0) + value

    def __iter__(self):
        """Iterate unique elements, least frequent first (ties by element)."""
        if not self.tracker: return iter([])
        ordered = sorted(self.tracker.items(), key=lambda kv: (kv[1], kv[0]))
        return iter([elem for elem, _count in ordered])

    def __add__(self, other):
        """Return a new Freq_list merging *other* (iterable or Freq_list)."""
        assert isinstance(other, collections.abc.Iterable)
        combined = self.copy()
        # Hoisted out of the loop: the original re-checked isinstance(other,
        # Freq_list) on every iteration.
        if isinstance(other, Freq_list):
            for element in other:
                combined.add(element, value=other.get_freq(element))
        else:
            for element in other:
                combined.add(element)
        return combined

    def copy(self):
        """Return an independent copy (counts copied, elements shared)."""
        cp = Freq_list()
        for element in self:
            cp.add(element, value=self.get_freq(element))
        return cp

    def __dict__(self):
        # NOTE(review): defining __dict__ as a method shadows the normal
        # instance-__dict__ descriptor (vars(obj) returns this bound method
        # instead of the attribute dict). Kept only for backward
        # compatibility with callers using fl.__dict__(); prefer a plainly
        # named accessor in new code.
        return self.tracker.copy()

    def __len__(self):
        # Total number of additions (sum of counts), NOT unique elements.
        return sum(self.tracker.values())

    def get_freq(self, element):
        """Return how many times *element* has been added (0 if never)."""
        return self.tracker.get(element, 0)

    def __getitem__(self, index):
        # Index into the frequency-sorted unique elements; returns None for
        # out-of-range integer indices instead of raising. Slices work too
        # (and never raise IndexError).
        try:
            return list(self)[index]
        except IndexError:
            return None
def n_sample(collection, samples:int, max_samples=5, fast=True, required_types=None):
    """Sample *samples* levels deep into a nested collection.

    Level i keeps up to max_samples[i] elements of type required_types[i]
    (None = no filter); the results of each level are flattened before
    descending. max_samples may be a single int applied to every level.

    BUGFIX: the original used a mutable default (required_types=[]) and
    mutated it with .extend, so filters leaked between calls; it also
    extended a caller-supplied max_samples list in place. Both arguments
    are now copied before padding.
    """
    type_filters = list(required_types) if required_types is not None else []
    type_filters.extend([None] * samples)  # pad so every level has a filter slot
    if isinstance(max_samples, list):
        caps = list(max_samples)
    else:
        caps = [max_samples] * samples
    caps.extend([caps[-1]] * samples)  # pad by repeating the last cap
    sample = sample_list(collection, max_samples=caps[0], required_type=type_filters[0], fast=fast)
    for i in range(1, samples):
        sample = [sample_list(obj, max_samples=caps[i], required_type=type_filters[i], fast=fast) for obj in sample]
        sample = list(itertools.chain.from_iterable(sample))
    return sample
def sample_list(collection, max_samples=20, fast=False, required_type=None):
    """Collect up to *max_samples* elements from *collection*.

    required_type: only keep isinstance matches (an instance may be passed
    instead of a class; its type is used).
    fast=True: stream the first matches in order (works on generators).
    fast=False: materialize the collection; with no type filter, sample
    evenly across its whole length, otherwise scan it for matches.

    BUGFIX: the original fast=False loop never advanced the index after
    appending an element, so it hung forever as soon as anything matched
    (and its even-stride branch was unreachable). Rewritten to the evident
    intent, with the max_samples cap applied consistently.
    """
    if required_type is not None and not isinstance(required_type, type):
        # Allow passing an example instance instead of a class.
        required_type = type(required_type)
    samples = []
    if fast:
        # Streaming path: take matches in order until the cap is hit.
        for element in collection:
            if required_type is None or isinstance(element, required_type):
                samples.append(element)
                if len(samples) >= max_samples:
                    break
    else:
        collection = list(collection)
        if required_type is None:
            # Evenly spaced sampling across the whole list.
            step = max(1, len(collection) // max_samples)
            samples = collection[::step][:max_samples]
        else:
            for element in collection:
                if isinstance(element, required_type):
                    samples.append(element)
                    if len(samples) >= max_samples:
                        break
    return samples
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment