DESCRIPTION = \
'''
#--------------------------------------------------------------------------------------------------
Convert Microsoft Word and Excel files from one file format to another.
#--------------------------------------------------------------------------------------------------
'''

EPILOG = \
'''
#--------------------------------------------------------------------------------------------------
This script automates the Microsoft Word and Excel applications to open a Word or Excel file
in one format, and save it in another.

A common use case might be to convert Word documents in the older 97-2003 binary '.doc' format
to the newer Open XML '.docx' file format.

The script requires Python >= 3.6

The script recursively scans from a base directory and writes out each file in the new format,
to the same directory as the original source file.
e.g.
    C:\\foo\\a\\word_document_1.doc      <-- Original
    C:\\foo\\a\\word_document_1.docx     <-- New
    C:\\foo\\a\\b\\word_document_2.doc    <-- Original
    C:\\foo\\a\\b\\word_document_2.docx   <-- New

## Study the formats in the links below to understand which output formats can be selected.
https://docs.microsoft.com/en-us/office/vba/api/word.wdsaveformat
https://docs.microsoft.com/en-us/office/vba/api/excel.xlfileformat

## Example script use

# Convert from Word '.doc' to '.docx' format, scan starting from the user HOME directory.
# Converting from '.doc' to '.docx', starting from the HOME directory is the script default setting.
python.exe microsoft_doc_converter.py

# Same as above, except as a Dryrun. In Dryrun mode the files to be converted are just listed,
# not converted.
python.exe microsoft_doc_converter.py --dryrun

# Convert from Word '.doc' to '.docx' format, scan starting from the C:\\foo directory.
python.exe microsoft_doc_converter.py --basedir "C:\\foo"

# Convert from Word '.docx' to '.pdf' format, scan starting from the C:\\foo directory.
python.exe microsoft_doc_converter.py -b "C:\\foo" -s "docx" -d "pdf" -f 17

# Convert from Excel '.xls' to '.xlsx' format, scan starting from the C:\\bar directory.
python.exe microsoft_doc_converter.py -c "Excel.application" -b "C:\\bar" -s "xls" -d "xlsx" -f 51

#--------------------------------------------------------------------------------------------------
## !! BIG FAT WARNING !!
File conversions can result in changed/lost content and formatting.
As always, backups and lots of testing is advised.
#--------------------------------------------------------------------------------------------------
# Briefly tested, May 3rd 2020, using,
 - Anaconda3-2020.02-Windows-x86_64.exe (Python 3.7.6)
 - Windows 10 Enterprise 1909
 - Microsoft Office 365 ProPlus, Version 1908
#--------------------------------------------------------------------------------------------------
'''

#--------------------------------------------------------------------------------------------------

__author__     = 'Dave Coutts'
__license__    = 'Apache'
__version__    = '1.0.0'
__maintainer__ = 'https://github.com/davecoutts'
__status__     = 'Production'

#--------------------------------------------------------------------------------------------------

import argparse
from pathlib import Path

try:
    import win32com.client
except ImportError:
    # pywin32 is only needed to drive Office; '--help' and '--dryrun' must still work without it.
    win32com = None

#--------------------------------------------------------------------------------------------------

# Lower-cased COM object name -> attribute of the application object whose Open() loads a file.
_OPEN_COLLECTIONS = {
    'word.application': 'Documents',
    'excel.application': 'Workbooks',
}


def _pending_conversions(dirPath, sourceExtension, destinationExtension):
    """Return the sorted (source, destination) path pairs that still need converting.

    A source file is pending when no regular file already exists at its destination path
    (the same path with the destination extension). Extensions may be given with or without
    a leading dot. Directories whose name happens to match the pattern are ignored.
    """
    sourceSuffix = sourceExtension.lstrip('.')
    destinationSuffix = destinationExtension.lstrip('.')

    # The Office application resolves relative paths against its own working directory,
    # not this script's, so only absolute paths are handed over.
    baseDir = Path(dirPath).resolve()

    pending = []
    for sourceFile in sorted(baseDir.rglob(f'*.{sourceSuffix}')):
        if not sourceFile.is_file():
            continue
        destinationFile = sourceFile.with_suffix(f'.{destinationSuffix}')
        if not destinationFile.is_file():
            pending.append((sourceFile, destinationFile))
    return pending


def converter(comObject, dirPath, sourceExtension, destinationExtension, fileFormat, dryRun=False):
    """Convert every matching file under dirPath by automating Word or Excel.

    Args:
        comObject: COM object name, 'Word.application' or 'Excel.application' (case-insensitive).
        dirPath: Directory from which the recursive scan starts.
        sourceExtension: Extension of the files to convert, e.g. 'doc'.
        destinationExtension: Extension given to the converted files, e.g. 'docx'.
        fileFormat: Number passed as the FileFormat argument of SaveAs.
        dryRun: When true, only print the files that would be converted.

    Returns:
        None. Progress and per-file failures are printed to stdout.

    Raises:
        ValueError: comObject is not one of the supported applications.
        ImportError: a real conversion is required but pywin32 is not installed.
    """
    try:
        collectionName = _OPEN_COLLECTIONS[comObject.lower()]
    except KeyError:
        raise ValueError(
            f"Unsupported COM object '{comObject}'. "
            "Use 'Word.application' or 'Excel.application'."
        ) from None

    pending = _pending_conversions(dirPath, sourceExtension, destinationExtension)

    # Never launch the Office application when there is nothing for it to do.
    if dryRun or not pending:
        for sourceFile, _ in pending:
            print(f'Converting: {sourceFile}')
        return

    if win32com is None:
        raise ImportError('The pywin32 package (win32com) is required to convert files.')

    msApp = win32com.client.Dispatch(comObject)
    try:
        for sourceFile, destinationFile in pending:
            print(f'Converting: {sourceFile}')
            try:
                doc = getattr(msApp, collectionName).Open(str(sourceFile))
                try:
                    doc.SaveAs(str(destinationFile), FileFormat=fileFormat)
                finally:
                    # Always release the file, even when SaveAs failed. SaveChanges=False
                    # so the source document is closed without being saved.
                    doc.Close(False)
            except Exception as e:
                # Best effort: report the failure and carry on with the remaining files.
                print(f'Failed to Convert: {sourceFile} : {e}')
    finally:
        # Quit even on an unexpected error or Ctrl-C so no Office process is left behind.
        msApp.Quit()

    return

#--------------------------------------------------------------------------------------------------

def main():
    """Parse the command line arguments and run the conversion."""
    parser = argparse.ArgumentParser(
        epilog=EPILOG,
        description=DESCRIPTION,
        formatter_class=argparse.RawTextHelpFormatter
        )

    parser.add_argument('-c', '--comobject',
        dest='comobject',
        type=str,
        default='Word.application',
        help="COM Object name of the application to be called. 'Word.application' or 'Excel.application'. Default, 'Word.application'."
        )

    parser.add_argument('-b', '--basedir',
        dest='basedir',
        type=Path,
        default=Path.home(),
        help='Directory to start the recursive scan from. Default, users HOME directory'
        )

    parser.add_argument('-s', '--srcext',
        dest='sourceextension',
        type=str,
        default='doc',
        help="File extension of the source files to be converted. Default, 'doc'."
        )

    parser.add_argument('-d', '--destext',
        dest='destinationextension',
        type=str,
        default='docx',
        help="File extension of the resulting converted file. Default, 'docx'."
        )

    parser.add_argument('-f', '--filefmt',
        dest='fileformat',
        type=int,
        default=16,
        help="Microsoft file format number of the output format. Default, 16."
        )

    parser.add_argument('--dryrun',
        dest='dryrun',
        action="store_true",
        default=False,
        help='Print out all files to be converted but do not carry out the actual conversion.'
        )

    args = parser.parse_args()

    try:
        converter(
            comObject=args.comobject,
            dirPath=args.basedir,
            sourceExtension=args.sourceextension,
            destinationExtension=args.destinationextension,
            fileFormat=args.fileformat,
            dryRun=args.dryrun
            )
    except ValueError as err:
        # Bad argument values are reported as a usage error rather than a traceback.
        parser.error(str(err))

#--------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    main()

#--------------------------------------------------------------------------------------------------