Last active
January 21, 2021 15:58
-
-
Save oliver-batey/8e125cd8d4c3f7ca2df8fde002fa0a30 to your computer and use it in GitHub Desktop.
Common interface for parsing txt, docx, pdf, html and pptx
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import io | |
from docx import Document | |
from pdfminer3.layout import LAParams, LTTextBox | |
from pdfminer3.pdfpage import PDFPage | |
from pdfminer3.pdfinterp import PDFResourceManager | |
from pdfminer3.pdfinterp import PDFPageInterpreter | |
from pdfminer3.converter import PDFPageAggregator | |
from pdfminer3.converter import TextConverter | |
from bs4 import BeautifulSoup | |
from pptx import Presentation | |
class DocParser:
    """Single entry point for extracting text from a document file."""

    def parse(self, document):
        """Parse *document* with the parser selected for its file extension.

        Args:
            document: Path of the file to parse.

        Returns:
            Whatever the selected parser function returns for *document*.

        Raises:
            ValueError: Propagated from the parser lookup when the file
                extension is not supported.
        """
        # get_format() hands back the parser callable; invoke it directly.
        return get_format(document)(document)
def get_format(document):
    """Return the parser function matching the extension of *document*.

    Args:
        document: Path of the file whose extension selects the parser.

    Returns:
        The callable returned by ``get_parser`` for the file's extension.
    """
    # splitext() yields (root, extension); only the extension is needed.
    _, extension = os.path.splitext(document)
    return get_parser(extension)
def get_parser(format):
    """Return the parser function registered for a file extension.

    The lookup is case-insensitive, so ``'.PDF'`` and ``'.pdf'`` select the
    same parser.

    Args:
        format: File extension including the leading dot, e.g. ``'.docx'``.

    Returns:
        One of ``parse_txt``, ``parse_docx``, ``parse_pdf``, ``parse_html``
        or ``parse_pptx``.

    Raises:
        ValueError: If the extension is not supported. The exception
            carries the extension exactly as it was passed in.
    """
    parsers = {
        '.txt': parse_txt,
        '.docx': parse_docx,
        '.pdf': parse_pdf,
        '.html': parse_html,
        '.pptx': parse_pptx,
    }
    try:
        # Normalise case so files such as "REPORT.PDF" are not rejected.
        return parsers[format.lower()]
    except KeyError:
        raise ValueError(format) from None
def parse_txt(document):
    """Read a plain-text file and return its content on a single line.

    Args:
        document: Path of the text file to read.

    Returns:
        The file content with every newline replaced by a space.
    """
    with open(document, 'r') as file:
        # The original bound the result to a misspelled name ("sting") and
        # then returned the undefined "string", raising NameError.
        string = file.read().replace('\n', ' ')
    return string
def parse_docx(document):
    """Return the paragraph text of a .docx file as one string.

    Args:
        document: Path of the .docx file, passed to ``Document``.

    Returns:
        The text of each entry in ``doc.paragraphs``, each followed by a
        single space, concatenated in order.
    """
    paragraphs = Document(document).paragraphs
    # Each paragraph contributes "<text> ", including a trailing space.
    return ''.join(f'{paragraph.text} ' for paragraph in paragraphs)
def parse_pdf(document):
    """Extract the text of a PDF file.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``TextConverter`` that writes into an in-memory buffer; the buffer
    content is returned.

    Args:
        document: Path of the PDF file to read.

    Returns:
        The text written by the converter for all processed pages.
    """
    resource_manager = PDFResourceManager()
    text_buffer = io.StringIO()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(document, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        # Read the buffer before the finally clause closes it.
        return text_buffer.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to parse.
        converter.close()
        text_buffer.close()
def parse_html(document):
    """Return the text of an HTML file with scripts and styles removed.

    Args:
        document: Path of the HTML file to read.

    Returns:
        The result of ``soup.get_text()`` after every ``<script>`` and
        ``<style>`` element has been extracted from the tree.
    """
    # Use a context manager so the file handle is closed; the original
    # opened the file and never closed it.
    with open(document) as html:
        soup = BeautifulSoup(html, features="html.parser")
    # Drop elements whose content is code or styling rather than text.
    for element in soup(["script", "style"]):
        element.extract()
    return soup.get_text()
def parse_pptx(document):
    """Return the text of all shapes in a .pptx file as one string.

    Args:
        document: Path of the .pptx file, passed to ``Presentation``.

    Returns:
        The ``text`` of every shape that has such an attribute, in slide
        and shape order, each followed by a single space.
    """
    presentation = Presentation(document)
    pieces = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Skip shapes that do not expose a ``text`` attribute.
            if not hasattr(shape, "text"):
                continue
            pieces.append(f'{shape.text} ')
    return ''.join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment