Last active
April 23, 2023 19:15
-
-
Save justengel/dff226eb565ee19a9552a8f2e76e5612 to your computer and use it in GitHub Desktop.
Convert Microsoft Office documents to html or pdf files that can be viewed in a web browser.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Convert office document to file formats that may be visible in a web browser. This file uses microsoft office to | |
convert the files, so Windows OS is assumed and required! | |
Requirements: | |
* pywin32>=228 # Not available for Python3.8 at this time | |
Server Requirements: | |
* uvicorn>=0.11.5 | |
* fastapi>=0.58.0 | |
* python-multipart>=0.0.5 | |
* aiofiles>=0.5.0 | |
Client Requirements: | |
* requests>=2.24.0 | |
""" | |
import os | |
import asyncio | |
from typing import Callable | |
import pythoncom | |
from win32com.client.gencache import EnsureDispatch # pip install pywin32 Not available for python3.8 at this time | |
from win32com.client import constants | |
from urllib.request import pathname2url, url2pathname | |
import shutil | |
# Names exported by ``from <module> import *``.
__all__ = [
    'register_converter',
    'get_converter',
    'file_to_html',
    'file_to_html_async',
    'word_to_html',
    'excel_to_html',
    'powerpoint_to_pdf',
    'copy_pdf',
    'get_app',
    'convert_client',
]
# Registry of converter callables, keyed by the lower-cased extension string they were registered under.
CONVERTERS = {}


def register_converter(ext: str, func: Callable[[str, str], str] = None):
    """Register ``func`` as the converter for the file extension ``ext``.

    May be called directly (``register_converter('.doc', func)``) or used as a
    decorator (``@register_converter('.doc')``).

    :param ext: File extension used as the registry key; it is stringified and lower-cased.
    :param func: Callable taking ``(filename, save_filename)`` and returning the saved filename.
    :return: ``func`` when it is given, otherwise a decorator that registers the function it receives.
    """
    if func is not None:
        key = str(ext).lower()
        CONVERTERS[key] = func
        return func

    def decorator(func: Callable[[str, str], str] = None):
        # Re-enter with the decorated function so the registration logic lives in one place.
        return register_converter(ext, func)

    return decorator
def get_converter(ext: str) -> Callable[[str, str], str]:
    """Look up the converter registered for the file extension ``ext``.

    :param ext: File extension; it is stringified and lower-cased before the lookup.
    :return: The registered converter callable, or ``None`` when nothing is registered for ``ext``.
    """
    key = str(ext).lower()
    try:
        return CONVERTERS[key]
    except KeyError:
        return None
def file_to_html(filename: str, save_filename: str) -> str:
    """Convert ``filename`` with the converter registered for its extension.

    COM is initialised for the calling thread before the converter runs and is
    always uninitialised afterwards, even when the converter raises.

    :param filename: Path of the file to convert; its extension selects the converter.
    :param save_filename: Path the converted file should be written to.
    :return: The filename returned by the converter, or ``''`` when no converter is registered.
    """
    ext = os.path.splitext(filename)[-1]
    func = get_converter(ext)
    if not callable(func):
        return ''

    pythoncom.CoInitialize()
    try:
        return func(filename, save_filename)
    finally:
        # Balance CoInitialize even on failure so the thread is not left with COM initialised.
        pythoncom.CoUninitialize()
async def file_to_html_async(filename: str, save_filename: str, loop: asyncio.AbstractEventLoop = None) -> str:
    """Run :func:`file_to_html` in the event loop's default executor and await its result.

    :param filename: Path of the file to convert.
    :param save_filename: Path the converted file should be written to.
    :param loop: Event loop whose executor is used; the running loop is used when omitted.
    :return: Whatever ``file_to_html`` returns for the given arguments.
    """
    event_loop = loop if loop is not None else asyncio.get_running_loop()
    pending = event_loop.run_in_executor(None, file_to_html, filename, save_filename)
    return await pending
@register_converter('.docx')
@register_converter('.doc')
def word_to_html(filename: str, save_filename: str) -> str:
    """Convert a Word document to HTML by automating Microsoft Word.

    :param filename: Path of the ``.doc``/``.docx`` file to open.
    :param save_filename: Path the HTML output is saved to.
    :return: ``save_filename`` as it was passed in.
    """
    word = EnsureDispatch('Word.Application')
    try:
        word.Visible = False
        word.DisplayAlerts = False
        # Word resolves relative paths against its own working directory, not Python's,
        # so hand it absolute paths.
        doc = word.Documents.Open(os.path.abspath(filename))
        # Alternative format constant: wdFormatFilteredHTML
        doc.SaveAs(os.path.abspath(save_filename), constants.wdFormatHTML)
    finally:
        # Always shut Word down, otherwise a failed conversion leaves a hidden process behind.
        word.Quit()
    return save_filename
@register_converter('.xlsx')
@register_converter('.xls')
def excel_to_html(filename: str, save_filename: str) -> str:
    """Convert an Excel workbook to HTML by automating Microsoft Excel.

    :param filename: Path of the ``.xls``/``.xlsx`` file to open.
    :param save_filename: Path the HTML output is saved to.
    :return: ``save_filename`` as it was passed in.
    """
    excel = EnsureDispatch('Excel.Application')
    try:
        excel.Visible = False
        excel.DisplayAlerts = False
        # Excel resolves relative paths against its own working directory, not Python's,
        # so hand it absolute paths.
        wkbk = excel.Workbooks.Open(os.path.abspath(filename))
        wkbk.SaveAs(os.path.abspath(save_filename), constants.xlHtml)
    finally:
        # Always shut Excel down, otherwise a failed conversion leaves a hidden process behind.
        excel.Quit()
    return save_filename
@register_converter('.pptx')
def powerpoint_to_pdf(filename: str, save_filename: str) -> str:
    """Convert a PowerPoint presentation to PDF by automating Microsoft PowerPoint.

    The output is always a PDF: the original author's Office version did not
    support saving presentations as HTML.

    :param filename: Path of the ``.pptx`` file to open.
    :param save_filename: Requested output path; its extension is replaced with ``.pdf`` if it differs.
    :return: The output path with the ``.pdf`` extension applied.
    """
    # Force the save_filename to have a pdf extension.
    root, ext = os.path.splitext(save_filename)
    if ext.lower() != '.pdf':
        save_filename = root + '.pdf'

    powerpoint = EnsureDispatch('Powerpoint.Application')
    try:
        try:
            powerpoint.Visible = False
        except Exception:
            # Best effort: hiding the window is allowed to fail (presumably some PowerPoint
            # versions refuse it); the presentation is opened with WithWindow=False anyway.
            pass
        powerpoint.DisplayAlerts = False
        # PowerPoint resolves relative paths against its own working directory, not Python's,
        # so hand it absolute paths.
        pres = powerpoint.Presentations.Open(os.path.abspath(filename), WithWindow=False)
        pres.SaveAs(os.path.abspath(save_filename), constants.ppSaveAsPDF)
    finally:
        # Always shut PowerPoint down, otherwise a failed conversion leaves a process behind.
        powerpoint.Quit()
    return save_filename
@register_converter('.pdf')
def copy_pdf(filename: str, save_filename: str) -> str:
    """"Convert" a PDF by copying it to the save location.

    :param filename: Path of the source PDF.
    :param save_filename: Requested output path; its extension is replaced with ``.pdf`` if it differs.
    :return: The path the PDF was copied to.
    """
    root, ext = os.path.splitext(save_filename)
    # The output must carry a pdf extension regardless of what the caller asked for.
    target = save_filename if ext.lower() == '.pdf' else root + '.pdf'
    shutil.copyfile(filename, target)
    return target
def delete_file(filename: str):
    """Remove ``filename`` from disk, ignoring any failure to do so.

    This is best-effort cleanup: a missing, locked or otherwise undeletable
    file, or a value that is not a usable path (e.g. ``None``), is silently
    skipped. Interpreter-level signals such as ``KeyboardInterrupt`` are not
    swallowed.

    :param filename: Path of the file to delete.
    """
    try:
        os.remove(filename)
    except (OSError, TypeError, ValueError):
        # OSError: missing/in use/permission; TypeError/ValueError: not a valid path value.
        pass
def get_app(save_url: str = '/converted/', save_path: str = './converted', adjust_path: Callable[[str], str] = None):
    """Create a fastapi app.

    Routes:
        * ``GET /`` - index page linking to the converters and listing the files found in ``save_path``.
        * ``GET /convert/`` - convert a file addressed by a path on the server.
        * ``GET``/``POST /file_convert/`` - upload form; a POST converts the uploaded file and returns it.

    NOTE(review): ``/convert/`` reads and writes whatever paths the caller supplies. That is the
    endpoint's purpose, but it means the app should only be exposed to trusted clients.

    :param save_url: Static file url to access the converted files
    :param save_path: Path to save the converted files. The directory is created if it does not exist.
    :param adjust_path: Function that takes in a path and returns a new path that may be modified.
    :return: FastAPI application.
    """
    from html import escape
    from urllib.parse import quote

    from fastapi import FastAPI, Request, BackgroundTasks
    from fastapi.staticfiles import StaticFiles
    from fastapi.responses import HTMLResponse, FileResponse

    # StaticFiles refuses to mount a directory that does not exist yet.
    os.makedirs(save_path, exist_ok=True)

    app = FastAPI()
    app.mount(save_url, StaticFiles(directory=save_path))

    @app.get('/')
    async def list_convert(request: Request):
        """Render the index page with links to the converters and the converted files."""
        base_url = escape(str(request.base_url))
        page = ('<h1><a href="{base_url}">Office to HTML</a></h1>'
                '<p><b>File Converter:</b> <a href="{base_url}file_convert/">{base_url}file_convert/</a></p>'
                '<p><b>API Converter:</b> '
                '{base_url}convert/?filename=server_file_path&save_filename=save_file_path<br>'
                'This uses a servers filepath (samba) to convert files.'
                '</p>').format(base_url=base_url)

        # Build URLs with '/' explicitly; os.path.join would use the OS path separator.
        static_root = save_url.rstrip('/') + '/'
        items = '\n'.join(
            '<li><a href="{}">{}</a></li>'.format(escape(static_root + quote(name)), escape(name))
            for name in os.listdir(save_path) if '.' in name)
        listing = '<p><b>Converted Files:</b></p><ul>{}</ul>'.format(items)
        return HTMLResponse(page + listing)

    @app.get('/convert/')
    async def convert(filename: str = None, save_filename: str = None):
        """Convert a file that lives on the server and return the url of the converted file."""
        if not filename:
            # Nothing to convert; report it the same way as an unsupported file.
            return {'results': ''}

        filename = url2pathname(filename)
        if save_filename is None:
            save_filename = os.path.join(save_path, os.path.splitext(os.path.basename(filename))[0] + '.html')
        save_filename = url2pathname(save_filename)

        if adjust_path:
            filename = adjust_path(filename)
            save_filename = adjust_path(save_filename)

        saved = await file_to_html_async(filename, save_filename)
        if saved:
            saved = pathname2url(saved.replace(save_path, save_url))
        return {'results': saved}  # Return the staticfile url

    @app.route('/file_convert/', methods=['GET', 'POST'])
    async def file_convert(request: Request):
        """Show the upload form; on POST convert the uploaded file and send the result back."""
        tasks = BackgroundTasks()
        msg: str = ''

        form = await request.form()
        file = form.get('file', None)
        keep_converted = form.get('keep_converted', False)

        # The client controls the upload's filename: keep only its final component so it cannot
        # point outside the directory it is written to. An empty name means no file was sent.
        upload_name = os.path.basename(getattr(file, 'filename', None) or '')
        if upload_name in ('.', '..'):
            upload_name = ''

        if upload_name:
            # Get the save filename
            save_filename = os.path.join(save_path, os.path.splitext(upload_name)[0] + '.html')

            try:
                # (Windows Issue) Cannot read from temp location? Move file AppData/local/Temp -> AppData
                parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(file.file.name)))
            except Exception:
                # Data is in memory. Temporary file not created.
                parent_dir = save_path
            fname = os.path.join(parent_dir, upload_name)

            # Possibly adjust the path
            if adjust_path:
                fname = adjust_path(fname)
                save_filename = adjust_path(save_filename)

            # Save in memory file to disk
            # Also (Windows Issue) Cannot read from temp location? Move file AppData/local/Temp -> AppData
            with open(fname, 'wb') as f:
                shutil.copyfileobj(file.file, f)
            temp_name = getattr(file.file, 'name', None)
            if isinstance(temp_name, str):
                delete_file(temp_name)

            # Save the file
            try:
                saved = await file_to_html_async(fname, save_filename)
            except Exception:
                saved = ''
            finally:
                # The copied upload is only needed for the conversion itself.
                delete_file(fname)

            if saved:
                if not keep_converted:
                    tasks.add_task(delete_file, saved)  # Delete after FileResponse
                return FileResponse(saved, background=tasks)

            # Failed to convert!
            msg = '<p>Failed to convert the given file!</p>\n'

        # GET to convert file
        form_page = ('<h1><a href="{base_url}">Office to HTML</a></h1>'
                     '{msg}'
                     '<form method="POST" enctype="multipart/form-data">'
                     '  <label for="file">Select a file (.xls to html is not supported on all browsers):</label><br>'
                     '  <input type="file" id="file" name="file"><br><br>'
                     '  <input type="checkbox" id="keep_converted" name="keep_converted">'
                     '  <label for="keep_converted">Keep Converted</label><br><br>'
                     '  <input type="submit">'
                     '</form>').format(msg=msg, base_url=escape(str(request.base_url)))
        return HTMLResponse(form_page)

    return app
def convert_client(filename: str, save_filename: str = None, url='http://127.0.0.1:9001/convert/',
                   timeout: float = None) -> dict:
    """Ask a running conversion server to convert a file and return its JSON reply.

    The status code and the decoded reply are printed before returning.

    :param filename: Path of the file to convert, as seen by the server.
    :param save_filename: Optional path the server should save the converted file to.
    :param url: URL of the server's ``/convert/`` endpoint.
    :param timeout: Optional timeout in seconds passed to ``requests.get``; ``None`` waits indefinitely.
    :return: The decoded JSON reply, or ``{'results': ''}`` when the reply is not valid JSON.
    """
    import requests  # Should be at top of file, but this makes requests optional if you are not running the client

    params = {'filename': pathname2url(filename)}
    if save_filename:
        params['save_filename'] = pathname2url(save_filename)

    r = requests.get(url, params=params, timeout=timeout)
    try:
        d = r.json()
    except ValueError:
        # The body was not JSON (e.g. an error page); report it as a failed conversion.
        d = {'results': ''}
    print(r.status_code, d)
    return d
if __name__ == '__main__':
    # Running this file directly serves the conversion app on every interface, port 9001.
    import uvicorn

    server_app = get_app()
    uvicorn.run(server_app, host='0.0.0.0', port=9001)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment