krectra · June 2, 2019 15:20
diff --git a/document b/document
 # coding: utf8

 import os
 import pyPdf
 import re

 from werkzeug import secure_filename


 # Upload directory, read from the `app` mapping (defined outside the
 # code visible in this file)
 UPLOAD_FOLDER = app['UPLOAD_FOLDER']
 # The only extensions that are accepted for upload
 ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
 # Message returned when a file with any other extension is submitted
 extension_error_message = 'Extension is not allowed.'


 def upload():
    '''
    Route that processes a file upload.

    Reads the uploaded file from request.vars['file'] and the 'apikey' and
    'token' values from request.get_vars. The upload is accepted when a
    `sessions` record matching the token is found through
    Intmongo('volo_new') and the api key is listed in volo_api_keys.

    @param None
    @return a redirect to /volo_api/document/files/<filename> once the file
            has been stored; the extension error message when the extension
            is not allowed; otherwise a JSON string holding the submitted
            filename and 'success': False
    '''
    # Local import: json is not among the imports visible at the top of
    # this file
    import json

    uploaded = request.vars['file']
    # A missing or non-file form field has no .filename attribute; treat it
    # as a failed upload instead of raising AttributeError
    original_name = getattr(uploaded, 'filename', None)
    apikey = request.get_vars.get('apikey', '')
    user_token = request.get_vars.get('token', '')

    # Validate apiKey and userToken
    mngo = Intmongo('volo_new').mongo
    has_session = mngo.sessions.find({'token': user_token}).count() > 0
    if original_name and has_session and apikey in volo_api_keys:
        # Check if the file is one of the allowed types/extensions
        if not _allowed_file(original_name):
            return extension_error_message

        # Make the filename safe, remove unsupported chars
        filename = secure_filename(original_name)
        # Store the upload in the files/ folder; the with-block closes the
        # handle, which was previously left open
        target = os.getcwd() + app['UPLOAD_FOLDER'] + filename
        with open(target, 'wb') as stored:
            stored.write(uploaded.value)
        # Redirect the user to the files route, which returns the content
        return redirect('/volo_api/document/files/' + filename)

    return json.dumps({'filename': original_name, 'success': False})



 def files():
    '''
    Returns the text content of an uploaded file to the browser.

    The file name is the first URL argument (request.args[0]); the matching
    extractor is chosen from the file extension. Each extractor removes the
    file from the upload folder after reading it.

    @param filename - name of the uploaded file (first URL argument)
    @return the extracted text, or the extension error message when the
            name is missing or its extension is not supported
    '''
    if not request.args:
        return extension_error_message

    # The name comes straight from the URL and is used to build a path that
    # is read and then deleted, so sanitise it the same way upload() does
    filename = secure_filename(request.args[0])

    # Dispatch on the real extension. A substring test such as
    # `'pdf' in filename` sends e.g. "pdf_notes.txt" to the pdf parser.
    extension = filename.rsplit('.', 1)[1] if '.' in filename else ''

    # Checking if it is a pdf file
    if extension == 'pdf':
        # _get_pdf_content builds the full path itself
        return _get_pdf_content(filename).encode("ascii", "ignore")

    input_file = os.getcwd() + app['UPLOAD_FOLDER'] + filename

    # Checking if it is a text file
    if extension == 'txt':
        return _get_text_content(input_file)

    # Checking if it is a docx file
    if extension == 'docx':
        return _get_docx_content(input_file)

    # Anything else used to fall through and return None
    return extension_error_message



 def _get_pdf_content(filename):
    '''
    Extracts the text of an uploaded pdf file and deletes the file.

    @param filename - name of the uploaded file inside the upload folder
    @return text of all pages, with '<br />' inserted after every '. ' that
            follows a character other than a digit or an uppercase ASCII
            letter, and with whitespace runs collapsed to single spaces
    '''
    path = os.getcwd() + app['UPLOAD_FOLDER'] + filename

    try:
        # The with-block closes the handle before the file is removed
        with open(path, "rb") as pdf_file:
            # Load PDF into pyPDF
            pdf = pyPdf.PdfFileReader(pdf_file)
            # Take every page exactly once. The previous loop appended the
            # running total on each iteration, so earlier pages were
            # repeated and their '<br />' markers inserted again.
            raw_text = "".join(pdf.getPage(number).extractText()
                               for number in range(pdf.getNumPages()))
    finally:
        # Delete the pdf file in files/ folder, also when parsing failed
        if os.path.exists(path):
            os.remove(path)

    # Newline implementation, applied once to the whole document
    marked = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", raw_text)

    # Removes whitespace
    return " ".join(marked.replace(u"\xa0", " ").strip().split())


 def _get_text_content(input_file):
    '''
    Reads an uploaded txt file, marks sentence ends and deletes the file.

    @param input_file - full path of the uploaded text file
    @return the file content with '<br />' inserted after every '. ' that
            follows a character other than a digit or an uppercase ASCII
            letter
    '''
    try:
        # Read inside a with-block so the handle is closed before the file
        # is removed (it was previously never closed)
        with open(input_file, 'r') as text_file:
            raw_text = text_file.read()
    finally:
        # Delete the text file in files/ folder, also when reading failed
        if os.path.exists(input_file):
            os.remove(input_file)

    # Newline implementation
    return re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", raw_text)


 def _get_docx_content(input_file):
    '''
    Extracts the text of an uploaded docx file and deletes the file.

    @param input_file - full path of the uploaded docx file
    @return the non-empty paragraphs joined by blank lines, with '<br />'
            inserted after every '. ' that follows a character other than a
            digit or an uppercase ASCII letter
    '''
    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import XML
    import zipfile

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    try:
        # The with-block closes the archive even when the member is missing
        with zipfile.ZipFile(input_file) as document:
            xml_content = document.read('word/document.xml')
    finally:
        # Delete the docx file in files/ folder, also when reading failed
        if os.path.exists(input_file):
            os.remove(input_file)

    # NOTE(review): this parses XML taken from an uploaded file with the
    # standard parser; consider a hardened parser for untrusted input.
    tree = XML(xml_content)

    # Element.iter() replaces getiterator(), which is gone in Python 3.9+
    paragraphs = []
    for paragraph in tree.iter(PARA):
        text = ''.join(node.text for node in paragraph.iter(TEXT) if node.text)
        if text:
            paragraphs.append(text)

    new_docx_file = '\n\n'.join(paragraphs)
    # Newline implementation
    return re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", new_docx_file)


 def _allowed_file(filename):
    '''
    Tells whether the file name carries one of the accepted extensions
    @param filename - name of the uploaded file
    @return True when the text after the last '.' is in ALLOWED_EXTENSIONS,
            False otherwise (including names without any '.')
    '''
    # A name without a dot has no extension at all
    if '.' not in filename:
        return False
    # The extension is whatever follows the last dot
    extension = filename.split('.')[-1]
    return extension in ALLOWED_EXTENSIONS
	# coding: utf8

	import os
	import pyPdf
	import re

	from werkzeug import secure_filename


	# Upload directory, read from the `app` mapping (defined outside the
	# code visible in this file)
	UPLOAD_FOLDER = app['UPLOAD_FOLDER']
	# The only extensions that are accepted for upload
	ALLOWED_EXTENSIONS = {'txt', 'pdf', 'docx'}
	# Message returned when a file with any other extension is submitted
	extension_error_message = 'Extension is not allowed.'


	def upload():
	    '''
	    Route that processes a file upload.

	    Reads the uploaded file from request.vars['file'] and the 'apikey' and
	    'token' values from request.get_vars. The upload is accepted when a
	    `sessions` record matching the token is found through
	    Intmongo('volo_new') and the api key is listed in volo_api_keys.

	    @param None
	    @return a redirect to /volo_api/document/files/<filename> once the file
	            has been stored; the extension error message when the extension
	            is not allowed; otherwise a JSON string holding the submitted
	            filename and 'success': False
	    '''
	    # Local import: json is not among the imports visible at the top of
	    # this file
	    import json

	    uploaded = request.vars['file']
	    # A missing or non-file form field has no .filename attribute; treat it
	    # as a failed upload instead of raising AttributeError
	    original_name = getattr(uploaded, 'filename', None)
	    apikey = request.get_vars.get('apikey', '')
	    user_token = request.get_vars.get('token', '')

	    # Validate apiKey and userToken
	    mngo = Intmongo('volo_new').mongo
	    has_session = mngo.sessions.find({'token': user_token}).count() > 0
	    if original_name and has_session and apikey in volo_api_keys:
	        # Check if the file is one of the allowed types/extensions
	        if not _allowed_file(original_name):
	            return extension_error_message

	        # Make the filename safe, remove unsupported chars
	        filename = secure_filename(original_name)
	        # Store the upload in the files/ folder; the with-block closes the
	        # handle, which was previously left open
	        target = os.getcwd() + app['UPLOAD_FOLDER'] + filename
	        with open(target, 'wb') as stored:
	            stored.write(uploaded.value)
	        # Redirect the user to the files route, which returns the content
	        return redirect('/volo_api/document/files/' + filename)

	    return json.dumps({'filename': original_name, 'success': False})



	def files():
	    '''
	    Returns the text content of an uploaded file to the browser.

	    The file name is the first URL argument (request.args[0]); the matching
	    extractor is chosen from the file extension. Each extractor removes the
	    file from the upload folder after reading it.

	    @param filename - name of the uploaded file (first URL argument)
	    @return the extracted text, or the extension error message when the
	            name is missing or its extension is not supported
	    '''
	    if not request.args:
	        return extension_error_message

	    # The name comes straight from the URL and is used to build a path that
	    # is read and then deleted, so sanitise it the same way upload() does
	    filename = secure_filename(request.args[0])

	    # Dispatch on the real extension. A substring test such as
	    # `'pdf' in filename` sends e.g. "pdf_notes.txt" to the pdf parser.
	    extension = filename.rsplit('.', 1)[1] if '.' in filename else ''

	    # Checking if it is a pdf file
	    if extension == 'pdf':
	        # _get_pdf_content builds the full path itself
	        return _get_pdf_content(filename).encode("ascii", "ignore")

	    input_file = os.getcwd() + app['UPLOAD_FOLDER'] + filename

	    # Checking if it is a text file
	    if extension == 'txt':
	        return _get_text_content(input_file)

	    # Checking if it is a docx file
	    if extension == 'docx':
	        return _get_docx_content(input_file)

	    # Anything else used to fall through and return None
	    return extension_error_message



	def _get_pdf_content(filename):
	    '''
	    Extracts the text of an uploaded pdf file and deletes the file.

	    @param filename - name of the uploaded file inside the upload folder
	    @return text of all pages, with '<br />' inserted after every '. ' that
	            follows a character other than a digit or an uppercase ASCII
	            letter, and with whitespace runs collapsed to single spaces
	    '''
	    path = os.getcwd() + app['UPLOAD_FOLDER'] + filename

	    try:
	        # The with-block closes the handle before the file is removed
	        with open(path, "rb") as pdf_file:
	            # Load PDF into pyPDF
	            pdf = pyPdf.PdfFileReader(pdf_file)
	            # Take every page exactly once. The previous loop appended the
	            # running total on each iteration, so earlier pages were
	            # repeated and their '<br />' markers inserted again.
	            raw_text = "".join(pdf.getPage(number).extractText()
	                               for number in range(pdf.getNumPages()))
	    finally:
	        # Delete the pdf file in files/ folder, also when parsing failed
	        if os.path.exists(path):
	            os.remove(path)

	    # Newline implementation, applied once to the whole document
	    marked = re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", raw_text)

	    # Removes whitespace
	    return " ".join(marked.replace(u"\xa0", " ").strip().split())


	def _get_text_content(input_file):
	    '''
	    Reads an uploaded txt file, marks sentence ends and deletes the file.

	    @param input_file - full path of the uploaded text file
	    @return the file content with '<br />' inserted after every '. ' that
	            follows a character other than a digit or an uppercase ASCII
	            letter
	    '''
	    try:
	        # Read inside a with-block so the handle is closed before the file
	        # is removed (it was previously never closed)
	        with open(input_file, 'r') as text_file:
	            raw_text = text_file.read()
	    finally:
	        # Delete the text file in files/ folder, also when reading failed
	        if os.path.exists(input_file):
	            os.remove(input_file)

	    # Newline implementation
	    return re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", raw_text)


	def _get_docx_content(input_file):
	    '''
	    Extracts the text of an uploaded docx file and deletes the file.

	    @param input_file - full path of the uploaded docx file
	    @return the non-empty paragraphs joined by blank lines, with '<br />'
	            inserted after every '. ' that follows a character other than a
	            digit or an uppercase ASCII letter
	    '''
	    try:
	        from xml.etree.cElementTree import XML
	    except ImportError:
	        from xml.etree.ElementTree import XML
	    import zipfile

	    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
	    PARA = WORD_NAMESPACE + 'p'
	    TEXT = WORD_NAMESPACE + 't'

	    try:
	        # The with-block closes the archive even when the member is missing
	        with zipfile.ZipFile(input_file) as document:
	            xml_content = document.read('word/document.xml')
	    finally:
	        # Delete the docx file in files/ folder, also when reading failed
	        if os.path.exists(input_file):
	            os.remove(input_file)

	    # NOTE(review): this parses XML taken from an uploaded file with the
	    # standard parser; consider a hardened parser for untrusted input.
	    tree = XML(xml_content)

	    # Element.iter() replaces getiterator(), which is gone in Python 3.9+
	    paragraphs = []
	    for paragraph in tree.iter(PARA):
	        text = ''.join(node.text for node in paragraph.iter(TEXT) if node.text)
	        if text:
	            paragraphs.append(text)

	    new_docx_file = '\n\n'.join(paragraphs)
	    # Newline implementation
	    return re.sub(r'([^0-9A-Z])\. ', r"\1. <br />", new_docx_file)


	def _allowed_file(filename):
	    '''
	    Tells whether the file name carries one of the accepted extensions
	    @param filename - name of the uploaded file
	    @return True when the text after the last '.' is in ALLOWED_EXTENSIONS,
	            False otherwise (including names without any '.')
	    '''
	    # A name without a dot has no extension at all
	    if '.' not in filename:
	        return False
	    # The extension is whatever follows the last dot
	    extension = filename.split('.')[-1]
	    return extension in ALLOWED_EXTENSIONS