oliver-batey · January 21, 2021 15:58
diff --git a/common_interface.py b/common_interface.py
 import os 
 import io
 from docx import Document

 from pdfminer3.layout import LAParams, LTTextBox
 from pdfminer3.pdfpage import PDFPage
 from pdfminer3.pdfinterp import PDFResourceManager
 from pdfminer3.pdfinterp import PDFPageInterpreter
 from pdfminer3.converter import PDFPageAggregator
 from pdfminer3.converter import TextConverter

 from bs4 import BeautifulSoup

 from pptx import Presentation


 class DocParser:
    """Single entry point for turning a document path into its text."""

    def parse(self, document):
        """Parse ``document`` with the function selected for its extension.

        The parsing function is obtained from ``get_format(document)`` and
        is then called with the same ``document`` argument; its return
        value is returned unchanged.
        """
        return get_format(document)(document)

 def get_format(document):
    """Return the parsing function matching ``document``'s file extension.

    Args:
        document: Path of the file to be parsed.

    Returns:
        The function returned by ``get_parser`` for the file's extension.

    Raises:
        ValueError: Propagated from ``get_parser`` when the extension is
            not one it recognises.
    """
    # Lower-case the extension so that e.g. ``report.PDF`` is handled the
    # same way as ``report.pdf`` instead of being rejected.
    extension = os.path.splitext(document)[-1].lower()
    return get_parser(extension)

 def get_parser(format):
    """Map a file extension to the function that parses that file type.

    Args:
        format: File extension including the leading dot, e.g. ``'.pdf'``.

    Returns:
        One of ``parse_txt``, ``parse_docx``, ``parse_pdf``, ``parse_html``
        or ``parse_pptx``.

    Raises:
        ValueError: If ``format`` is not a supported extension; the
            offending value is the exception argument.
    """
    if format == '.txt':
        return parse_txt
    if format == '.docx':
        return parse_docx
    if format == '.pdf':
        return parse_pdf
    if format == '.html':
        return parse_html
    if format == '.pptx':
        return parse_pptx
    # Nothing matched: report the unsupported extension to the caller.
    raise ValueError(format)
        
 def parse_txt(document):
    """Read a text file and return its contents on a single line.

    Args:
        document: Path of the text file to read.

    Returns:
        The file contents with every newline replaced by a space.
    """
    with open(document, 'r') as file:
        # Return the value directly; it was previously bound to a
        # misspelled name, so the return statement raised NameError.
        return file.read().replace('\n', ' ')
 
 def parse_docx(document):
    """Return the paragraph text of a .docx file as one string.

    The file is opened with ``Document``; the text of each paragraph is
    followed by a single space, including the last one.
    """
    paragraphs = Document(document).paragraphs
    return ''.join(f'{paragraph.text} ' for paragraph in paragraphs)

 def parse_pdf(document):
    """Extract the text of a PDF file using pdfminer3.

    Args:
        document: Path of the PDF file.

    Returns:
        Everything ``TextConverter`` wrote to the in-memory buffer while
        the pages yielded by ``PDFPage.get_pages`` were processed.
    """
    resource_manager = PDFResourceManager()
    # The buffer and the converter are now released even when a page fails
    # to process; previously an exception skipped both close() calls.
    with io.StringIO() as buffer:
        converter = TextConverter(resource_manager, buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(document, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Read the text before the converter and buffer are closed.
            return buffer.getvalue()
        finally:
            converter.close()

 def parse_html(document):
    """Return the text of an HTML file without script or style content.

    Args:
        document: Path of the HTML file.

    Returns:
        The result of ``get_text()`` on the parsed tree after every
        ``<script>`` and ``<style>`` element has been removed.
    """
    # The tree is built while the handle is open; the handle is then closed
    # by the ``with`` block instead of being left open as before.
    with open(document) as html:
        soup = BeautifulSoup(html, features="html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    return soup.get_text()

 def parse_pptx(document):
    """Return the text of the shapes in a .pptx file as one string.

    Every slide's shapes are visited; shapes without a ``text`` attribute
    are skipped, and each collected text is followed by a single space.
    """
    slides = Presentation(document).slides
    return ''.join(
        f'{shape.text} '
        for slide in slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
	import os
	import io
	from docx import Document

	from pdfminer3.layout import LAParams, LTTextBox
	from pdfminer3.pdfpage import PDFPage
	from pdfminer3.pdfinterp import PDFResourceManager
	from pdfminer3.pdfinterp import PDFPageInterpreter
	from pdfminer3.converter import PDFPageAggregator
	from pdfminer3.converter import TextConverter

	from bs4 import BeautifulSoup

	from pptx import Presentation


	class DocParser:
	    """Single entry point for turning a document path into its text."""

	    def parse(self, document):
	        """Parse ``document`` with the function selected for its extension.

	        The parsing function is obtained from ``get_format(document)`` and
	        is then called with the same ``document`` argument; its return
	        value is returned unchanged.
	        """
	        parser = get_format(document)
	        return parser(document)

	def get_format(document):
	    """Return the parsing function matching ``document``'s file extension.

	    Args:
	        document: Path of the file to be parsed.

	    Returns:
	        The function returned by ``get_parser`` for the file's extension.

	    Raises:
	        ValueError: Propagated from ``get_parser`` when the extension is
	            not one it recognises.
	    """
	    # Lower-case the extension so that e.g. ``report.PDF`` is handled the
	    # same way as ``report.pdf`` instead of being rejected.
	    extension = os.path.splitext(document)[-1].lower()
	    return get_parser(extension)

	def get_parser(format):
	    """Map a file extension to the function that parses that file type.

	    Args:
	        format: File extension including the leading dot, e.g. ``'.pdf'``.

	    Returns:
	        One of ``parse_txt``, ``parse_docx``, ``parse_pdf``, ``parse_html``
	        or ``parse_pptx``.

	    Raises:
	        ValueError: If ``format`` is not a supported extension; the
	            offending value is the exception argument.
	    """
	    if format == '.txt':
	        return parse_txt
	    elif format == '.docx':
	        return parse_docx
	    elif format == '.pdf':
	        return parse_pdf
	    elif format == '.html':
	        return parse_html
	    elif format == '.pptx':
	        return parse_pptx
	    else:
	        raise ValueError(format)

	def parse_txt(document):
	    """Read a text file and return its contents on a single line.

	    Args:
	        document: Path of the text file to read.

	    Returns:
	        The file contents with every newline replaced by a space.
	    """
	    with open(document, 'r') as file:
	        # Bind the result to the same name that is returned; it was
	        # previously misspelled, so the return raised NameError.
	        string = file.read().replace('\n', ' ')
	    return string

	def parse_docx(document):
	    """Return the paragraph text of a .docx file as one string.

	    The file is opened with ``Document``; the text of each paragraph is
	    followed by a single space, including the last one.
	    """
	    doc = Document(document)
	    # A single join avoids repeated string concatenation in a loop.
	    return ''.join(f'{para.text} ' for para in doc.paragraphs)

	def parse_pdf(document):
	    """Extract the text of a PDF file using pdfminer3.

	    Args:
	        document: Path of the PDF file.

	    Returns:
	        Everything ``TextConverter`` wrote to the in-memory buffer while
	        the pages yielded by ``PDFPage.get_pages`` were processed.
	    """
	    resource_manager = PDFResourceManager()
	    file_handle = io.StringIO()
	    converter = TextConverter(resource_manager, file_handle, laparams=LAParams())
	    page_interpreter = PDFPageInterpreter(resource_manager, converter)
	    try:
	        with open(document, 'rb') as fh:
	            for page in PDFPage.get_pages(fh,
	                                          caching=True,
	                                          check_extractable=True):
	                page_interpreter.process_page(page)
	        # Read the text before the converter and buffer are closed.
	        string = file_handle.getvalue()
	    finally:
	        # Release both objects even if a page failed to process.
	        converter.close()
	        file_handle.close()
	    return string

	def parse_html(document):
	    """Return the text of an HTML file without script or style content.

	    Args:
	        document: Path of the HTML file.

	    Returns:
	        The result of ``get_text()`` on the parsed tree after every
	        ``<script>`` and ``<style>`` element has been removed.
	    """
	    # Build the tree while the handle is open, then let ``with`` close it
	    # so the file is not left open.
	    with open(document) as html:
	        soup = BeautifulSoup(html, features="html.parser")
	    for script in soup(["script", "style"]):
	        script.extract()
	    return soup.get_text()

	def parse_pptx(document):
	    """Return the text of the shapes in a .pptx file as one string.

	    Every slide's shapes are visited; shapes without a ``text`` attribute
	    are skipped, and each collected text is followed by a single space.
	    """
	    pres = Presentation(document)
	    pieces = []
	    for slide in pres.slides:
	        for shape in slide.shapes:
	            if hasattr(shape, "text"):
	                pieces.append(f'{shape.text} ')
	    return ''.join(pieces)