Last active
June 2, 2019 15:20
-
-
Save krectra/21a6c7c39238b8b6732d to your computer and use it in GitHub Desktop.
Document Parser API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf8 | |
import os | |
import pyPdf | |
import re | |
from werkzeug import secure_filename | |
# Upload directory fragment; the handlers below append it to os.getcwd()
UPLOAD_FOLDER = app['UPLOAD_FOLDER']
# The only file extensions accepted for upload
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
# Message returned when an uploaded file has a disallowed extension
extension_error_message = 'Extension is not allowed.'
def upload():
    '''
    Route that processes a file upload.

    The caller must supply a known ``apikey`` and a ``token`` that matches
    a stored session (both read from the query string). When both checks
    pass and the sanitized file name has an allowed extension, the file is
    written to the upload folder and the client is redirected to the
    ``files`` route for that name.

    @param None
    @return the extension error message when the extension is rejected,
            otherwise a JSON string ``{"filename": ..., "success": false}``
            when validation fails or no file was sent
    '''
    # No import of json is visible at the top of this file, so import it
    # here to guarantee the name is bound.
    import json

    uploaded = request.vars['file']
    # Tolerate a request without a file part instead of failing on
    # attribute access.
    original_name = getattr(uploaded, 'filename', None)
    apikey = request.get_vars.get('apikey', '')
    user_token = request.get_vars.get('token', '')

    # Validate apiKey and userToken
    mngo = Intmongo('volo_new').mongo
    has_session = mngo.sessions.find({'token': user_token}).count() > 0

    if has_session and apikey in volo_api_keys and original_name:
        # Make the filename safe first, then validate the name that will
        # actually be stored, so the extension check cannot be bypassed
        # (or broken) by characters that sanitizing strips.
        filename = secure_filename(original_name)
        if not _allowed_file(filename):
            return extension_error_message

        # Store the upload; the context manager closes (and flushes) the
        # file before the redirect hands it to the files route.
        destination = os.getcwd() + app['UPLOAD_FOLDER'] + filename
        with open(destination, 'wb') as out_file:
            out_file.write(uploaded.value)
        return redirect('/volo_api/document/files/' + filename)

    return json.dumps({'filename': original_name, 'success': False})
def files():
    '''
    Returns the extracted text of a previously uploaded file.

    The file name is taken from the first URL argument and dispatched on
    its extension to the matching ``_get_*_content`` helper. Each helper
    deletes the file after reading it.

    @param filename - name of the uploaded file (first request argument)
    @return the extracted text (ASCII-encoded for pdf), or the extension
            error message when the extension is not pdf, txt or docx
    '''
    # The name comes from the request and is used to build a path that is
    # later deleted, so drop any directory components.
    # NOTE(review): unlike upload(), this route does no apikey/token check.
    filename = os.path.basename(request.args[0])

    # Dispatch on the real extension; a substring test would send e.g.
    # "mypdfnotes.txt" to the pdf handler.
    extension = filename.rsplit('.', 1)[1] if '.' in filename else ''
    upload_path = os.getcwd() + app['UPLOAD_FOLDER'] + filename

    if extension == 'pdf':
        # The pdf helper builds the full path itself from the bare name.
        return _get_pdf_content(filename).encode("ascii", "ignore")
    if extension == 'txt':
        return _get_text_content(upload_path)
    if extension == 'docx':
        return _get_docx_content(upload_path)

    # Unsupported type: answer explicitly instead of returning None.
    return extension_error_message
def _get_pdf_content(filename):
    '''
    Converts an uploaded pdf to text, then deletes the pdf.

    The text of all pages is concatenated, "<br />" is inserted after
    every ". " that follows a character other than a digit or an
    uppercase letter, and runs of whitespace (including non-breaking
    spaces) are collapsed to single spaces.

    @param filename - name of the uploaded file inside the upload folder
    @return the extracted text
    '''
    path = os.getcwd() + app['UPLOAD_FOLDER'] + filename

    # Load PDF into pyPDF. Pages are extracted while the file is still
    # open; the handle is closed before the file is removed.
    with open(path, "rb") as pdf_file:
        pdf = pyPdf.PdfFileReader(pdf_file)
        page_texts = [pdf.getPage(index).extractText()
                      for index in range(pdf.getNumPages())]

    # Join each page exactly once. Appending a running total per page
    # would repeat earlier pages and stack up duplicate "<br />" tags.
    content = "".join(page_texts)
    # Newline implementation
    content = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", content)
    # Removes whitespace
    content = " ".join(content.replace(u"\xa0", " ").strip().split())

    # Delete the pdf file in files/ folder
    os.remove(path)
    return content
def _get_text_content(input_file): | |
''' | |
Processes the txt content | |
@param filename - name of the uploaded file | |
''' | |
new_text_file = "" | |
# Opens the text file | |
text_file = open(input_file,'r') | |
# Reads the text file | |
read_text_file = text_file.read() | |
# Newline implementation | |
read_text_file = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", read_text_file) | |
new_text_file = read_text_file | |
# Delete the text file in files/ folder | |
os.remove(input_file) | |
return new_text_file | |
def _get_docx_content(input_file): | |
''' | |
Processes the txt content | |
@param filename - name of the uploaded file | |
''' | |
try: | |
from xml.etree.cElementTree import XML | |
except ImportError: | |
from xml.etree.ElementTree import XML | |
import zipfile | |
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}' | |
PARA = WORD_NAMESPACE + 'p' | |
TEXT = WORD_NAMESPACE + 't' | |
document = zipfile.ZipFile(input_file) | |
xml_content = document.read('word/document.xml') | |
document.close() | |
tree = XML(xml_content) | |
paragraphs = [] | |
for paragraph in tree.getiterator(PARA): | |
texts = [node.text | |
for node in paragraph.getiterator(TEXT) | |
if node.text] | |
if texts: | |
paragraphs.append(''.join(texts)) | |
new_docx_file = '\n\n'.join(paragraphs) | |
new_docx_file = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", new_docx_file) | |
# Delete the text file in files/ folder | |
os.remove(input_file) | |
return new_docx_file | |
def _allowed_file(filename):
    '''
    Tells whether the file name carries an accepted extension.

    The extension is the text after the last dot and is compared
    case-sensitively against ALLOWED_EXTENSIONS.

    @param filename - name of the uploaded file
    @return True when the extension is allowed, False otherwise
    '''
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension in ALLOWED_EXTENSIONS
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment