romaresccoa · January 21, 2020 00:31
diff --git a/extractor.py b/extractor.py
 from PyPDF2 import PdfFileReader


 class Extractor:
     """Interactively select outline sections of a PDF and extract their text.

     On construction the PDF outline is walked, the user is asked on stdin
     which entries to keep, and the text of every page from the first kept
     entry to the end of the document is read into ``raw_text``.
     """

     def __init__(self, file_name: str) -> None:
         """Open ``file_name`` and run the interactive section selection."""
         self.pdf_reader = PdfFileReader(file_name)
         self.sections = self._extract_sections(self.pdf_reader.outlines)
         self.pages_range = self._find_pages_range()
         self.raw_text = self._extract_raw_text()  # useful for creating / testing new functionality

     def _has_subsections(self, section):
         """Return True when an outline entry is a nested list of entries."""
         return isinstance(section, list)

     def _propose_section(self, section):
         """Ask the user on stdin whether ``section`` should be kept.

         The prompt is repeated until the answer is exactly ``'Y'`` or ``'n'``.
         """
         print('Extracted section title: {}.'.format(section.title))
         option = ''
         while option not in ('Y', 'n'):
             option = input('Do You want to include it in the summary? [Y - yes/n - no]:\n')
         return option == 'Y'

     def _get_sections(self, section, sections):
         """Append every accepted leaf entry of ``section`` to ``sections``.

         Nested lists are walked depth-first, so the order of ``sections``
         follows the order of the outline.
         """
         for subsection in section:
             if self._has_subsections(subsection):
                 self._get_sections(subsection, sections)
             elif self._propose_section(subsection):
                 sections.append(subsection)

     def _extract_sections(self, outlines):
         """Return the flat list of outline entries the user accepted."""
         sections = []
         # The top level is walked exactly like any nested level.
         self._get_sections(outlines, sections)
         return sections

     def _find_pages_range(self):
         """Return page indices from the first kept section to the last page.

         Returns an empty list when no section was selected.
         """
         # TODO: It would be great if someone could
         #       add code to find where references
         #       start so they won't be included
         #       in text of last section.

         if not self.sections:
             return []
         start = self.pdf_reader.getDestinationPageNumber(self.sections[0])
         # PyPDF2 documents -1 for a destination whose page cannot be
         # resolved; start at the first page rather than index from the end.
         start = max(start, 0)
         # ``numPages`` is a property holding the page count, not a method.
         # ``range`` excludes its upper bound, so the final element is the
         # zero-based index of the last page (numPages - 1).
         end = self.pdf_reader.numPages
         return list(range(start, end))

     def _extract_raw_text(self):
         """Return the text of all pages in ``pages_range`` as one string.

         Newline characters are removed and the result is lower-cased.
         """
         # TODO: It would be great if someone could
         #       do more preprocessing such as...
         #       deleting tables etc..

         page_texts = (
             self.pdf_reader.getPage(page).extractText()
             for page in self.pages_range
         )
         return ''.join(page_texts).replace('\n', '').lower()

     def extract_texts(self):
         """
         Use this method to obtain dictionary with
         section names as keys and extracted texts
         as values.

         Each value is the part of ``raw_text`` between the end of the
         section's heading and the start of the next heading that could be
         located (or the end of the text). A section whose heading is not
         found in ``raw_text`` maps to an empty string. Returns an empty
         dictionary when no section was selected.
         """
         text = self.raw_text

         # Locate every heading in document order. Each search starts after
         # the previous hit so an earlier occurrence is never matched twice.
         spans = []
         cursor = 0
         for section in self.sections:
             heading = section.title.lower()
             found_at = text.find(heading, cursor)
             if found_at == -1:
                 spans.append(None)
                 continue
             cursor = found_at + len(heading)
             spans.append((found_at, cursor))

         # Walk backwards so every body ends where the next located heading
         # begins; the last located section runs to the end of the text.
         bodies = []
         body_end = len(text)
         for span in reversed(spans):
             if span is None:
                 bodies.append('')
                 continue
             heading_start, heading_end = span
             bodies.append(text[heading_end:body_end])
             body_end = heading_start
         bodies.reverse()

         return {
             section.title: body
             for section, body in zip(self.sections, bodies)
         }

     def get_section_names(self):
         """Return the titles of the selected sections in outline order."""
         return [section.title for section in self.sections]
	from PyPDF2 import PdfFileReader


	class Extractor:
	    """Interactively select outline sections of a PDF and extract their text.

	    On construction the PDF outline is walked, the user is asked on stdin
	    which entries to keep, and the text of every page from the first kept
	    entry to the end of the document is read into ``raw_text``.
	    """

	    def __init__(self, file_name: str) -> None:
	        """Open ``file_name`` and run the interactive section selection."""
	        self.pdf_reader = PdfFileReader(file_name)
	        self.sections = self._extract_sections(self.pdf_reader.outlines)
	        self.pages_range = self._find_pages_range()
	        self.raw_text = self._extract_raw_text()  # useful for creating / testing new functionality

	    def _has_subsections(self, section):
	        """Return True when an outline entry is a nested list of entries."""
	        return isinstance(section, list)

	    def _propose_section(self, section):
	        """Ask the user on stdin whether ``section`` should be kept.

	        The prompt is repeated until the answer is exactly ``'Y'`` or ``'n'``.
	        """
	        print('Extracted section title: {}.'.format(section.title))
	        option = ''
	        while option not in ('Y', 'n'):
	            option = input('Do You want to include it in the summary? [Y - yes/n - no]:\n')
	        return option == 'Y'

	    def _get_sections(self, section, sections):
	        """Append every accepted leaf entry of ``section`` to ``sections``.

	        Nested lists are walked depth-first, so the order of ``sections``
	        follows the order of the outline.
	        """
	        for subsection in section:
	            if self._has_subsections(subsection):
	                self._get_sections(subsection, sections)
	            elif self._propose_section(subsection):
	                sections.append(subsection)

	    def _extract_sections(self, outlines):
	        """Return the flat list of outline entries the user accepted."""
	        sections = []
	        # The top level is walked exactly like any nested level.
	        self._get_sections(outlines, sections)
	        return sections

	    def _find_pages_range(self):
	        """Return page indices from the first kept section to the last page.

	        Returns an empty list when no section was selected.
	        """
	        # TODO: It would be great if someone could
	        #       add code to find where references
	        #       start so they won't be included
	        #       in text of last section.

	        if not self.sections:
	            return []
	        start = self.pdf_reader.getDestinationPageNumber(self.sections[0])
	        # PyPDF2 documents -1 for a destination whose page cannot be
	        # resolved; start at the first page rather than index from the end.
	        start = max(start, 0)
	        # ``numPages`` is a property holding the page count, not a method.
	        # ``range`` excludes its upper bound, so the final element is the
	        # zero-based index of the last page (numPages - 1).
	        end = self.pdf_reader.numPages
	        return list(range(start, end))

	    def _extract_raw_text(self):
	        """Return the text of all pages in ``pages_range`` as one string.

	        Newline characters are removed and the result is lower-cased.
	        """
	        # TODO: It would be great if someone could
	        #       do more preprocessing such as...
	        #       deleting tables etc..

	        page_texts = (
	            self.pdf_reader.getPage(page).extractText()
	            for page in self.pages_range
	        )
	        return ''.join(page_texts).replace('\n', '').lower()

	    def extract_texts(self):
	        """
	        Use this method to obtain dictionary with
	        section names as keys and extracted texts
	        as values.

	        Each value is the part of ``raw_text`` between the end of the
	        section's heading and the start of the next heading that could be
	        located (or the end of the text). A section whose heading is not
	        found in ``raw_text`` maps to an empty string. Returns an empty
	        dictionary when no section was selected.
	        """
	        text = self.raw_text

	        # Locate every heading in document order. Each search starts after
	        # the previous hit so an earlier occurrence is never matched twice.
	        spans = []
	        cursor = 0
	        for section in self.sections:
	            heading = section.title.lower()
	            found_at = text.find(heading, cursor)
	            if found_at == -1:
	                spans.append(None)
	                continue
	            cursor = found_at + len(heading)
	            spans.append((found_at, cursor))

	        # Walk backwards so every body ends where the next located heading
	        # begins; the last located section runs to the end of the text.
	        bodies = []
	        body_end = len(text)
	        for span in reversed(spans):
	            if span is None:
	                bodies.append('')
	                continue
	            heading_start, heading_end = span
	            bodies.append(text[heading_end:body_end])
	            body_end = heading_start
	        bodies.reverse()

	        return {
	            section.title: body
	            for section, body in zip(self.sections, bodies)
	        }

	    def get_section_names(self):
	        """Return the titles of the selected sections in outline order."""
	        return [section.title for section in self.sections]