Last active
January 21, 2020 00:31
-
-
Save romaresccoa/218c452d8ca6198d9fc17319a381a63a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfFileReader | |
class Extractor:
    """Extract the text of user-selected outline sections from a PDF.

    On construction the PDF outline is walked and the user is asked on the
    console, entry by entry, which sections to keep.  The text of every page
    from the first kept section to the end of the document is then read,
    stripped of newlines and lower-cased, so that ``extract_texts`` can split
    it on the (lower-cased) section titles.
    """

    def __init__(self, file_name: str) -> None:
        """Open *file_name*, select sections interactively and cache the text.

        Args:
            file_name: Path of the PDF file passed to ``PdfFileReader``.
        """
        self.pdf_reader = PdfFileReader(file_name)
        self.sections = self._extract_sections(self.pdf_reader.outlines)
        self.pages_range = self._find_pages_range()
        # Kept on the instance: useful for creating / testing new functionality.
        self.raw_text = self._extract_raw_text()

    def _has_subsections(self, section):
        """Return True when *section* is a nested list of outline entries."""
        return isinstance(section, list)

    def _propose_section(self, section):
        """Ask the user whether *section* should be kept.

        Prompts repeatedly until the answer is exactly ``'Y'`` or ``'n'``.

        Returns:
            True if the user answered ``'Y'``, False if ``'n'``.
        """
        print('Extracted section title: {}.'.format(section.title))
        option = ''
        while option not in ('Y', 'n'):
            option = input('Do You want to include it in the summary? [Y - yes/n - no]:\n')
        return option == 'Y'

    def _get_sections(self, section, sections):
        """Append every accepted entry found under *section* to *sections*.

        Nested lists are descended depth-first, so accepted entries keep the
        order in which they appear in the outline.  *sections* is modified in
        place; nothing is returned.
        """
        for subsection in section:
            if self._has_subsections(subsection):
                self._get_sections(subsection, sections)
            elif self._propose_section(subsection):
                sections.append(subsection)

    def _extract_sections(self, outlines):
        """Return the flat list of outline entries the user chose to keep."""
        sections = []
        # The top level is walked exactly like any nested level.
        self._get_sections(outlines, sections)
        return sections

    def _find_pages_range(self):
        """Return the page indices from the first kept section to the last page.

        Returns an empty list when no section was selected.
        """
        # TODO: It would be great if someone could add code to find where
        #       the references start, so they won't be included in the text
        #       of the last section.
        if not self.sections:
            return []
        start = self.pdf_reader.getDestinationPageNumber(self.sections[0])
        # PyPDF2 documents -1 as "page not found"; fall back to the first page
        # rather than asking for page -1.
        start = max(start, 0)
        # ``numPages`` is a property, not a method; ``getNumPages()`` is the
        # callable form.  ``range`` excludes its end, so the last index
        # produced is ``getNumPages() - 1``.
        end = self.pdf_reader.getNumPages()
        return list(range(start, end))

    def _extract_raw_text(self):
        """Return the text of all pages in ``pages_range`` as one string.

        Newlines are removed and the result is lower-cased.
        """
        # TODO: It would be great if someone could do more preprocessing,
        #       such as deleting tables etc.
        page_texts = (
            self.pdf_reader.getPage(page).extractText()
            for page in self.pages_range
        )
        return ''.join(page_texts).replace('\n', '').lower()

    @staticmethod
    def _skip_heading(text, title):
        """Return *text* from just after the first occurrence of *title*.

        The title is lower-cased before searching.  If it does not occur,
        *text* is returned unchanged.
        """
        heading = title.lower()
        position = text.find(heading)
        if position == -1:
            return text
        return text[position + len(heading):]

    def extract_texts(self):
        """
        Use this method to obtain dictionary with
        section names as keys and extracted texts
        as values.

        Each section's text starts right after its own title and stops right
        before the next selected section's title.  If a title cannot be found
        in the text, nothing is skipped for it; if the next title cannot be
        found, the current section runs to the end of the text.  Returns an
        empty dictionary when no section was selected.
        """
        result_dict = {}
        if not self.sections:
            return result_dict

        titles = [section.title for section in self.sections]
        remaining = self.raw_text
        # Get the texts section by section, consuming ``remaining`` as we go.
        for title, next_title in zip(titles, titles[1:]):
            remaining = self._skip_heading(remaining, title)
            end_idx = remaining.find(next_title.lower())
            if end_idx == -1:
                end_idx = len(remaining)
            result_dict[title] = remaining[:end_idx]
            remaining = remaining[end_idx:]

        # The loop does not cover the last section; strip its heading the
        # same way as for every other section.
        result_dict[titles[-1]] = self._skip_heading(remaining, titles[-1])
        return result_dict

    def get_section_names(self):
        """Return the titles of the selected sections, in outline order."""
        return [section.title for section in self.sections]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment