Last active
April 29, 2026 10:41
-
-
Save StoneLabs/3a413a2d851f73aedbbbe177137853df to your computer and use it in GitHub Desktop.
ODT to custom md
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import subprocess
import sys
import tempfile
import zipfile
from pathlib import Path
# Matches a self-closing <text:p .../> element only. The lookahead requires
# whitespace or "/" right after the tag name, so other elements whose names
# merely start with "text:p" (e.g. <text:page-count/>) are left untouched.
EMPTY_PARA_RE = re.compile(r'<text:p(?=[\s/])[^>]*/>')

# Replacement paragraph carrying a marker that is stripped again after the
# pandoc conversion.
EMPTY_PARA_REPLACEMENT = (
    '<text:p text:style-name="Text_20_Body">EMPTY_PARA_PLACEHOLDER</text:p>'
)


def patch_empty_paragraphs(content: str) -> tuple:
    """Replace every empty paragraph in *content* with a placeholder paragraph.

    Args:
        content: The text of an ODT ``content.xml``.

    Returns:
        A ``(patched_content, count)`` tuple, where ``count`` is the number
        of empty paragraphs that were replaced.
    """
    return EMPTY_PARA_RE.subn(EMPTY_PARA_REPLACEMENT, content)


def write_patched_odt(input_path, patched_path, content: str) -> None:
    """Copy the ODT at *input_path* to *patched_path* with a new content.xml.

    Entries are written in their original order and each one reuses its
    original ``ZipInfo``; only the payload of ``content.xml`` is swapped
    for *content*.
    """
    with zipfile.ZipFile(input_path) as zin, \
            zipfile.ZipFile(patched_path, 'w') as zout:
        for item in zin.infolist():
            if item.filename == 'content.xml':
                zout.writestr(item, content)
            else:
                zout.writestr(item, zin.read(item))


def run_pandoc(odt_path):
    """Run pandoc on *odt_path* and return the ``CompletedProcess``.

    Pandoc's streams are decoded as UTF-8 explicitly instead of relying on
    the locale's preferred encoding.

    Raises:
        FileNotFoundError: If the ``pandoc`` executable cannot be found.
    """
    return subprocess.run(
        ['pandoc', str(odt_path), '-t', 'markdown-smart', '--wrap=none'],
        capture_output=True, text=True, encoding='utf-8',
    )


def strip_markers(markdown: str) -> tuple:
    """Remove placeholder lines and cut the text at the EOF marker.

    Args:
        markdown: The Markdown text produced by pandoc.

    Returns:
        A ``(text, truncated)`` tuple. ``truncated`` is True when the
        ``EOFEOFEOF`` marker was present and everything from it onwards
        was dropped.
    """
    cleaned = markdown.replace('EMPTY_PARA_PLACEHOLDER\n', '')
    head, marker, _ = cleaned.partition('EOFEOFEOF')
    return head, bool(marker)


def main(argv=None) -> int:
    """Convert the ODT file named on the command line to Markdown.

    The result is written to ``md-export/<input stem>.md`` relative to the
    current working directory.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        The process exit code: 0 on success, 1 on any error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: python convert.py <input.odt>", file=sys.stderr)
        return 1

    input_path = argv[1]
    source = Path(input_path)
    if not source.exists():
        print(f"Error: file not found: {input_path}", file=sys.stderr)
        return 1
    if source.suffix.lower() != '.odt':
        print(f"Error: expected .odt file, got: {source.suffix}",
              file=sys.stderr)
        return 1

    print(f"Reading {input_path}...")
    try:
        with zipfile.ZipFile(source) as archive:
            content = archive.read('content.xml').decode('utf-8')
    except (zipfile.BadZipFile, KeyError) as exc:
        # Not a zip archive at all, or an archive without content.xml.
        print(f"Error: not a valid ODT file: {input_path} ({exc})",
              file=sys.stderr)
        return 1

    content, empty_paras = patch_empty_paragraphs(content)
    print(f" -> Found {empty_paras} empty paragraph(s), patching...")

    # A private temporary directory avoids a predictable shared path, works
    # on platforms without /tmp, and is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        patched = Path(tmp_dir) / 'input_patched.odt'
        print(f"Writing patched ODT to {patched}...")
        write_patched_odt(source, patched, content)

        print(f"Running pandoc on {patched}...")
        print(f" -> pandoc {patched} -t markdown-smart --wrap=none")
        try:
            result = run_pandoc(patched)
        except FileNotFoundError:
            print("Error: pandoc executable not found on PATH",
                  file=sys.stderr)
            return 1

    if result.returncode != 0:
        print(f"Error: pandoc failed:\n{result.stderr}", file=sys.stderr)
        return 1

    print(" -> removing patched markers")
    output, truncated = strip_markers(result.stdout)
    if truncated:
        print(" -> Truncated output at eof marker")

    print("Outputting")
    output_dir = Path('md-export')
    print(f" -> Creating output folder {output_dir.resolve()}")
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / source.with_suffix('.md').name
    print(f" -> Writing output to {output_path}...")
    # Write UTF-8 explicitly so non-ASCII text does not depend on the locale.
    output_path.write_text(output, encoding='utf-8')
    print(f"Done. {len(output.splitlines())} lines written to {output_path}")
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment