Skip to content

Instantly share code, notes, and snippets.

@krectra
Last active June 2, 2019 15:20
Show Gist options
  • Save krectra/21a6c7c39238b8b6732d to your computer and use it in GitHub Desktop.
Save krectra/21a6c7c39238b8b6732d to your computer and use it in GitHub Desktop.
Document Parser API
# coding: utf8
import os
import pyPdf
import re
from werkzeug import secure_filename
# Upload directory, as configured under app['UPLOAD_FOLDER']
UPLOAD_FOLDER = app['UPLOAD_FOLDER']
# The only file extensions accepted for upload (compared case-sensitively)
ALLOWED_EXTENSIONS = set(('txt', 'pdf', 'docx'))
# Message returned when a file's extension is not in ALLOWED_EXTENSIONS
extension_error_message = 'Extension is not allowed.'
def upload():
    '''
    Route that will process the file upload
    Reads the uploaded file from request.vars['file'] and the 'apikey' and
    'token' values from request.get_vars.
    @param None
    @return the redirect to /volo_api/document/files/<filename> once the file
            has been saved, extension_error_message when the extension is not
            allowed, or a JSON string with 'success' False when the token or
            api key is not accepted
    '''
    # Local import: json is used below but never imported at the top of this file
    import json
    uploaded = request.vars['file']
    apikey = request.get_vars.get('apikey', '')
    user_token = request.get_vars.get('token', '')
    # Validate apiKey and userToken
    mngo = Intmongo('volo_new').mongo
    token_known = mngo.sessions.find({'token': user_token}).count() > 0
    if token_known and apikey in volo_api_keys:
        # Check if the file is one of the allowed types/extensions
        if not _allowed_file(uploaded.filename):
            return extension_error_message
        # Make the filename safe, remove unsupported chars
        filename = secure_filename(uploaded.filename)
        # Save the upload into the upload folder; the with-block guarantees
        # the handle is flushed and closed before the redirect reads the file
        target_path = os.getcwd() + app['UPLOAD_FOLDER'] + filename
        with open(target_path, 'wb') as target:
            target.write(uploaded.value)
        # Redirect the user to the files route, which parses the saved file
        return redirect('/volo_api/document/files/' + filename)
    # Only reached when the token or api key was rejected
    return json.dumps({'filename': uploaded.filename, 'success': False})
def files():
    '''
    Returns the text content of an uploaded file to the browser
    The file name is taken from request.args[0] and the parser is chosen
    from the file's extension.
    @param None
    @return the extracted text, or extension_error_message when the name is
            empty after sanitising or its type is not pdf, txt or docx
    '''
    # The name comes straight from the URL and ends up in paths that are
    # opened and then deleted, so strip directory parts and unsafe chars
    filename = secure_filename(str(request.args[0]))
    if not filename:
        return extension_error_message
    # Decide on the real extension instead of a substring test, so that e.g.
    # "pdfnotes.txt" is read as text. A name without a dot is treated as its
    # own extension, which keeps bare names like "pdf" working.
    extension = filename.rsplit('.', 1)[-1]
    # Checking if it is a pdf file
    if extension == 'pdf':
        # _get_pdf_content builds the full path itself
        return _get_pdf_content(filename).encode("ascii", "ignore")
    input_file = os.getcwd() + app['UPLOAD_FOLDER'] + filename
    # Checking if it is a text file
    if extension == 'txt':
        return _get_text_content(input_file)
    # Checking if it is a docx file
    if extension == 'docx':
        return _get_docx_content(input_file)
    return extension_error_message
def _get_pdf_content(filename):
    '''
    Processes the pdf content to text and deletes the pdf afterwards
    @param filename - name of the uploaded file inside the upload folder
    @return the text of all pages, with "<br />" inserted after every
            period-plus-space that does not follow a digit or an uppercase
            letter, and every run of whitespace collapsed to a single space
    '''
    path = os.getcwd() + app['UPLOAD_FOLDER'] + filename
    # Read every page while the file is open; the with-block closes the
    # handle so the file can be deleted afterwards
    with open(path, "rb") as pdf_file:
        pdf = pyPdf.PdfFileReader(pdf_file)
        page_texts = [pdf.getPage(i).extractText()
                      for i in range(pdf.getNumPages())]
    # Build the text once and mark sentence breaks once: re-running the
    # substitution on already-marked text would add a second "<br />"
    content = "".join(page_texts)
    # Newline implementation
    content = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", content)
    # Removes whitespace (non-breaking spaces are treated as plain spaces)
    content = " ".join(content.replace(u"\xa0", " ").strip().split())
    # Delete the pdf file in files/ folder
    os.remove(path)
    return content
def _get_text_content(input_file):
    '''
    Processes the txt content and deletes the text file afterwards
    @param input_file - full path of the uploaded text file
    @return the text of the file, with "<br />" inserted after every
            period-plus-space that does not follow a digit or an uppercase
            letter
    '''
    # The with-block closes the handle before the file is deleted below
    with open(input_file, 'r') as text_file:
        raw_text = text_file.read()
    # Newline implementation
    marked_text = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", raw_text)
    # Delete the text file in files/ folder
    os.remove(input_file)
    return marked_text
def _get_docx_content(input_file):
    '''
    Processes the docx content to text and deletes the docx afterwards
    @param input_file - full path of the uploaded docx file
    @return the non-empty paragraphs joined by a blank line, with "<br />"
            inserted after every period-plus-space that does not follow a
            digit or an uppercase letter
    '''
    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import XML
    import zipfile
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    # A docx is a zip archive; the with-block closes it even if the
    # document part is missing and read() raises
    with zipfile.ZipFile(input_file) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # iter() replaces getiterator(), which no longer exists on Python 3.9+
    for paragraph in tree.iter(PARA):
        # A paragraph's text is spread over several text nodes
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    new_docx_file = '\n\n'.join(paragraphs)
    # Newline implementation
    new_docx_file = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", new_docx_file)
    # Delete the docx file in files/ folder
    os.remove(input_file)
    return new_docx_file
def _allowed_file(filename):
    '''
    Tells whether a file name carries one of the accepted extensions
    @param filename - name of the uploaded file
    @return True when the text after the last dot is in ALLOWED_EXTENSIONS,
            False otherwise (including names without any dot)
    '''
    # A name without a dot has no extension to check
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension in ALLOWED_EXTENSIONS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment