#!/usr/bin/env python3
|
""" |
|
Google Docs Export and Markdown Conversion Tool |
|
|
|
Single-command solution that exports Google Documents to JSON format and converts |
|
everything to clean Markdown format with direct image links. |
|
|
|
Prerequisites: |
|
pip install google-auth google-api-python-client |
|
|
|
Quick Usage: |
|
python export-and-convert.py --document-id YOUR_DOC_ID |
|
|
|
Supported Google Docs Elements: |
|
✅ Text formatting: bold, italic, strikethrough, underline, superscript, subscript |
|
✅ Text styling: colors, fonts, font sizes, small caps |
|
✅ Headings: H1-H6 with proper Markdown conversion |
|
✅ Links: URLs and rich links |
|
✅ Images: inline images with direct URL linking |
|
✅ Tables: full table support with images in cells |
|
✅ Lists: bullet points and numbered lists with nesting |
|
✅ Breaks: page breaks, column breaks, section breaks, horizontal rules |
|
✅ Special elements: footnotes, equations, auto-text, @ mentions |
|
✅ Document structure: tabs, table of contents |
|
|
|
Features: |
|
- Direct image linking (no downloads required) |
|
- Professional logging with performance metrics |
|
- CLI interface with full customization options |
|
- Comprehensive error handling |
|
|
|
Output: |
|
- output/document.json: Complete document structure with image references |
|
- output/document.md: Clean Markdown version with direct image links |
|
- output/export.log: Detailed execution log |
|
|
|
For complete documentation, usage examples, troubleshooting, and advanced configuration |
|
options, see USAGE_GUIDE.md and README.md. |
|
""" |
|
|
|
import argparse
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Optional

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
|
|
|
# Configure logging - will be set up dynamically in main().
# setup_logging() installs the handlers/level later; this is only the
# module-level logger instance shared by all classes below.
logger = logging.getLogger(__name__)
|
|
|
@dataclass
class Config:
    """Configuration class for the export and conversion tool."""
    # ID of the Google Document to export (the long token in the doc URL).
    document_id: str
    # Path to the service-account JSON key used for API authentication.
    service_account_file: str = 'service-account-key.json'
    # Directory that receives all generated files (JSON, Markdown, log).
    output_base_dir: str = 'output'
    # Filename (inside output_base_dir) for the exported document JSON.
    output_json: str = 'document.json'
    # Filename (inside output_base_dir) for the converted Markdown.
    output_markdown: str = 'document.md'
|
|
|
# Default configuration
# OAuth scopes: read-only Docs access for the document body, and read-only
# Drive access (used when resolving embedded object content).
SCOPES = [
    'https://www.googleapis.com/auth/documents.readonly',
    'https://www.googleapis.com/auth/drive.readonly'
]
|
|
|
class MarkdownConverter:
    """Handles conversion from Google Docs JSON to Markdown format.

    Images are emitted as direct Markdown links using the
    object_id -> contentUri mappings supplied at construction time;
    nothing is downloaded.
    """

    def __init__(self, image_mappings: Dict[str, str]):
        """
        Args:
            image_mappings: Maps inline object IDs to direct image URLs
                (the ``contentUri`` from the document's inline objects).
        """
        self.image_mappings = image_mappings

    def convert_text_style_to_markdown(self, text: str, text_style: Dict) -> str:
        """Apply a Google Docs textStyle to *text* and return Markdown/HTML.

        Formatting is applied inside-out; a link (if present) is applied
        last so that it wraps the fully formatted text.
        """
        # Capture the link target first; it is applied last.
        link_url = None
        if 'link' in text_style:
            link_url = text_style['link'].get('url', '')

        result = text

        # Superscript/subscript have no Markdown syntax; use HTML.
        baseline_offset = text_style.get('baselineOffset', '')
        if baseline_offset == 'SUPERSCRIPT':
            result = f"<sup>{result}</sup>"
        elif baseline_offset == 'SUBSCRIPT':
            result = f"<sub>{result}</sub>"

        if text_style.get('bold'):
            result = f"**{result}**"

        if text_style.get('italic'):
            result = f"*{result}*"

        if text_style.get('strikethrough'):
            result = f"~~{result}~~"

        # Underline is not standard Markdown; fall back to HTML.
        if text_style.get('underline'):
            result = f"<u>{result}</u>"

        # Small caps only exist as CSS.
        if text_style.get('smallCaps'):
            result = f"<span style=\"font-variant: small-caps\">{result}</span>"

        # Foreground/background colors become inline CSS on a span.
        fg_color = text_style.get('foregroundColor', {}).get('color', {})
        bg_color = text_style.get('backgroundColor', {}).get('color', {})

        if fg_color or bg_color:
            style_parts = []
            if fg_color and 'rgbColor' in fg_color:
                style_parts.append(self._rgb_css('color', fg_color['rgbColor']))
            if bg_color and 'rgbColor' in bg_color:
                style_parts.append(self._rgb_css('background-color', bg_color['rgbColor']))

            if style_parts:
                style_attr = "; ".join(style_parts)
                result = f"<span style=\"{style_attr}\">{result}</span>"

        # Non-default font size (Docs' default body size is 11pt).
        font_size = text_style.get('fontSize', {})
        if font_size and 'magnitude' in font_size:
            size = font_size['magnitude']
            if size != 11:  # Default size, skip if normal
                result = f"<span style=\"font-size: {size}pt\">{result}</span>"

        # Explicit font family.
        font_family = text_style.get('weightedFontFamily', {})
        if font_family and 'fontFamily' in font_family:
            family = font_family['fontFamily']
            result = f"<span style=\"font-family: {family}\">{result}</span>"

        # Apply link formatting LAST, wrapping the formatted text.
        if link_url:
            clean_text = self._clean_text_for_link(result)
            result = f"[{clean_text}]({link_url})"

        return result

    @staticmethod
    def _rgb_css(css_property: str, rgb: Dict) -> str:
        """Build a ``prop: rgb(r,g,b)`` CSS declaration from an API rgbColor.

        The API expresses channels as floats in [0, 1]; missing channels
        default to 0.
        """
        r = int(rgb.get('red', 0) * 255)
        g = int(rgb.get('green', 0) * 255)
        b = int(rgb.get('blue', 0) * 255)
        return f"{css_property}: rgb({r},{g},{b})"

    def _clean_text_for_link(self, text: str) -> str:
        """Clean up text content for use in markdown links.

        Complex HTML styling inside link text renders poorly, so spans are
        unwrapped and remaining tags stripped while basic Markdown markers
        (bold/italic) are preserved.
        """
        # Remove complex style spans but keep the inner text.
        text = re.sub(r'<span style="[^"]*">([^<]*)</span>', r'\1', text)

        # Convert HTML formatting to simple equivalents for link text.
        text = re.sub(r'<u>([^<]*)</u>', r'\1', text)  # Remove underline (redundant in links)
        text = re.sub(r'<sup>([^<]*)</sup>', r'^\1', text)  # Superscript
        text = re.sub(r'<sub>([^<]*)</sub>', r'_\1', text)  # Subscript

        # Clean up any remaining HTML tags (keep text content only).
        text = re.sub(r'<[^>]+>', '', text)

        return text.strip()

    def get_heading_level(self, named_style_type: str) -> str:
        """Convert Google Docs heading style to a Markdown heading prefix.

        Returns an empty string for non-heading styles (e.g. NORMAL_TEXT).
        """
        heading_map = {
            'HEADING_1': '#',
            'HEADING_2': '##',
            'HEADING_3': '###',
            'HEADING_4': '####',
            'HEADING_5': '#####',
            'HEADING_6': '######'
        }
        return heading_map.get(named_style_type, '')

    def _element_to_markdown(self, element: Dict, inline_objects: Dict) -> str:
        """Convert one paragraph element to its Markdown text fragment."""
        if 'textRun' in element:
            text_run = element['textRun']
            content = text_run.get('content', '')
            text_style = text_run.get('textStyle', {})
            return self.convert_text_style_to_markdown(content, text_style)

        if 'inlineObjectElement' in element:
            # Images become direct Markdown links when a mapping exists.
            inline_object_id = element['inlineObjectElement'].get('inlineObjectId', '')
            if inline_object_id in self.image_mappings:
                # FIX: the URL was previously dropped (f-string had no
                # placeholder); emit a real Markdown image.
                image_url = self.image_mappings[inline_object_id]
                return f"\n![Image]({image_url})\n"
            return f"\n[Image: {inline_object_id}]\n"

        if 'pageBreak' in element:
            return "\n\n---\n\n"

        if 'columnBreak' in element:
            return "\n\n<!-- Column Break -->\n\n"

        if 'footnoteReference' in element:
            footnote_ref = element['footnoteReference']
            # FIX: use the footnote ID so the reference matches the
            # "[^id]: ..." definitions emitted by convert_document_to_markdown
            # (the number never matched). Fall back to the number if absent.
            label = footnote_ref.get('footnoteId') or footnote_ref.get('footnoteNumber', '?')
            return f"[^{label}]"

        if 'horizontalRule' in element:
            return "\n\n---\n\n"

        if 'equation' in element:
            # The API does not expose equation contents; emit a placeholder.
            return "$$[Equation]$$"

        if 'autoText' in element:
            # Auto text (page numbers, dates, etc.).
            auto_text_type = element['autoText'].get('type', '')
            return f"[{auto_text_type}]"

        if 'richLink' in element:
            rich_link_properties = element['richLink'].get('richLinkProperties', {})
            url = rich_link_properties.get('uri', '')
            # Prefer the suggested title; otherwise use a generic label.
            display_text = rich_link_properties.get('title', '') or "Rich Link"
            if url:
                return f"[{display_text}]({url})"
            return display_text

        if 'person' in element:
            # @ mentions.
            person_properties = element['person'].get('personProperties', {})
            name = person_properties.get('name', 'Unknown')
            email = person_properties.get('email', '')
            return f"@{name} ({email})" if email else f"@{name}"

        if 'paragraph' in element:
            # Nested paragraph (e.g. inside a list item).
            return self.process_paragraph(element['paragraph'], inline_objects)

        return ""

    def _apply_list_format(self, text: str, bullet_style: Dict) -> str:
        """Prefix *text* with a Markdown list marker derived from bullet info."""
        nesting_level = bullet_style.get('nestingLevel', 0)
        indent = "  " * nesting_level

        glyph_type = bullet_style.get('glyphType', '')
        glyph_format = bullet_style.get('glyphFormat', '')

        if glyph_type in ('DECIMAL', 'ALPHA', 'ROMAN'):
            # Ordered list: Markdown renderers renumber, so "1." suffices.
            return f"{indent}1. {text}"
        if glyph_format and any(ph in glyph_format for ph in ('%0', '%1', '%2')):
            # Numbered list identified by its glyph format placeholders.
            return f"{indent}1. {text}"
        # Bullet list (default).
        return f"{indent}- {text}"

    def process_paragraph(self, paragraph: Dict, inline_objects: Dict) -> str:
        """Process a paragraph element and convert to Markdown.

        Handles headings, list items and all supported inline element
        types (text runs, images, breaks, footnote references, equations,
        auto-text, rich links, person mentions).
        """
        elements = paragraph.get('elements', [])
        paragraph_style = paragraph.get('paragraphStyle', {})

        # Check if this is a heading.
        named_style_type = paragraph_style.get('namedStyleType', '')
        heading_prefix = self.get_heading_level(named_style_type)

        # Convert each element and stitch the fragments together.
        text_parts = [self._element_to_markdown(element, inline_objects)
                      for element in elements]
        combined_text = ''.join(text_parts).rstrip('\n')

        if heading_prefix and combined_text:
            combined_text = f"{heading_prefix} {combined_text}"

        # FIX: in the Docs API the bullet lives on the Paragraph object,
        # not on paragraphStyle — the old code never found it. Check the
        # old location too for robustness against saved JSON variants.
        bullet_style = paragraph.get('bullet') or paragraph_style.get('bullet', {})
        if bullet_style:
            combined_text = self._apply_list_format(combined_text, bullet_style)

        return combined_text

    def process_table(self, table: Dict) -> str:
        """Process a table element and convert to a Markdown pipe table.

        Cell contents are flattened to a single line; images inside cells
        become direct Markdown image links. The first row is treated as
        the header (Markdown tables require a separator after it).
        """
        rows = table.get('tableRows', [])
        if not rows:
            return ""

        markdown_rows = []

        for row_index, row in enumerate(rows):
            cell_contents = []

            for cell in row.get('tableCells', []):
                cell_text = ""
                for item in cell.get('content', []):
                    if 'paragraph' not in item:
                        continue
                    paragraph_text = ""
                    for element in item['paragraph'].get('elements', []):
                        if 'textRun' in element:
                            paragraph_text += element['textRun'].get('content', '')
                        elif 'inlineObjectElement' in element:
                            inline_object_id = element['inlineObjectElement'].get('inlineObjectId', '')
                            if inline_object_id in self.image_mappings:
                                # FIX: the URL was previously dropped here too.
                                paragraph_text += f"![Image]({self.image_mappings[inline_object_id]})"
                            else:
                                paragraph_text += f"[Image: {inline_object_id}]"
                    cell_text += paragraph_text.strip()

                # Newlines would break the pipe-table row.
                cell_contents.append(cell_text.replace('\n', ' ').strip())

            markdown_rows.append("| " + " | ".join(cell_contents) + " |")

            # Header separator after the first row.
            if row_index == 0:
                markdown_rows.append("| " + " | ".join(["---"] * len(cell_contents)) + " |")

        return "\n".join(markdown_rows)

    def convert_document_to_markdown(self, document: Dict) -> str:
        """Convert the entire document (all tabs) to Markdown.

        The document title becomes an H1; each tab's title becomes an H2
        when the document has more than one tab.
        """
        title = document.get('title', 'Untitled Document')
        markdown_lines = [f"# {title}\n"]

        tabs = document.get('tabs', [])

        for tab in tabs:
            document_tab = tab.get('documentTab', {})
            body_content = document_tab.get('body', {}).get('content', [])
            inline_objects = document_tab.get('inlineObjects', {})

            # Label tabs only when there are several.
            if len(tabs) > 1:
                tab_title = tab.get('tabProperties', {}).get('title', 'Tab')
                markdown_lines.append(f"\n## {tab_title}\n")

            for item in body_content:
                if 'paragraph' in item:
                    paragraph_text = self.process_paragraph(item['paragraph'], inline_objects)
                    if paragraph_text.strip():
                        markdown_lines.append(paragraph_text)
                        markdown_lines.append("")  # Blank line after paragraph

                elif 'table' in item:
                    table_text = self.process_table(item['table'])
                    if table_text.strip():
                        markdown_lines.append(table_text)
                        markdown_lines.append("")  # Blank line after table

                elif 'sectionBreak' in item:
                    markdown_lines.append("---\n")

                elif 'pageBreak' in item:
                    markdown_lines.append("\n---\n")

                elif 'tableOfContents' in item:
                    # No portable Markdown equivalent; leave a marker.
                    markdown_lines.append("<!-- Table of Contents -->\n")

                elif 'footnote' in item:
                    # Footnote definitions, keyed by their ID.
                    footnote = item['footnote']
                    footnote_id = footnote.get('footnoteId', '')
                    footnote_text = ""
                    for footnote_item in footnote.get('content', []):
                        if 'paragraph' in footnote_item:
                            footnote_text += self.process_paragraph(footnote_item['paragraph'], inline_objects)
                    markdown_lines.append(f"[^{footnote_id}]: {footnote_text.strip()}\n")

        # Collapse runs of 3+ newlines into paragraph breaks.
        result = "\n".join(markdown_lines)
        result = re.sub(r'\n{3,}', '\n\n', result)

        return result
|
|
|
class CompleteGoogleDocsProcessor:
    """Complete processor that handles export and markdown conversion with direct image links.

    Pipeline: authenticate -> fetch the document JSON -> map inline images
    to their direct contentUri URLs -> save enhanced JSON -> convert to
    Markdown -> save Markdown. All outputs go to ``config.output_base_dir``.
    """

    def __init__(self, config: Config):
        """Authenticate and build the Docs API client.

        Raises:
            FileNotFoundError: If the service account key file is missing.
        """
        self.config = config
        self.credentials = self._get_credentials()
        self.docs_service = build('docs', 'v1', credentials=self.credentials)
        # object_id -> direct image URL; populated during export.
        self.image_mappings: Dict[str, str] = {}

    def _get_credentials(self) -> Credentials:
        """Authenticate using service account credentials with validation."""
        try:
            if not os.path.exists(self.config.service_account_file):
                raise FileNotFoundError(f"Service account file not found: {self.config.service_account_file}")

            credentials = Credentials.from_service_account_file(
                self.config.service_account_file, scopes=SCOPES
            )
            logger.info("Successfully authenticated with Google APIs")
            return credentials
        except Exception as e:
            # logger.exception preserves the traceback in the log file.
            logger.exception(f"Authentication failed: {e}")
            raise

    def _ensure_output_directory(self) -> None:
        """Create output directory if it doesn't exist."""
        base_dir = Path(self.config.output_base_dir)
        base_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output base directory ready: {base_dir}")

    def _extract_inline_objects(self, document: Dict) -> Dict[str, Dict]:
        """Extract all inline objects from document structure.

        Checks each tab's documentTab first; falls back to the
        document-level ``inlineObjects`` used by the older, tab-less format.
        """
        inline_objects: Dict[str, Dict] = {}

        for tab in document.get('tabs', []):
            document_tab = tab.get('documentTab', {})
            inline_objects.update(document_tab.get('inlineObjects', {}))

        # Fallback: check document level (older format).
        if not inline_objects:
            inline_objects = document.get('inlineObjects', {})

        return inline_objects

    def _create_image_mappings(self, inline_objects: Dict) -> Dict[str, str]:
        """Create mappings from inline object IDs to their contentUri for direct linking."""
        image_mappings: Dict[str, str] = {}

        for object_id, inline_object in inline_objects.items():
            embedded_object = inline_object.get('inlineObjectProperties', {}).get('embeddedObject', {})
            if 'imageProperties' in embedded_object:
                content_uri = embedded_object['imageProperties'].get('contentUri')
                if content_uri:
                    image_mappings[object_id] = content_uri
                    logger.info(f"Mapped image {object_id} to direct URL")

        if not image_mappings:
            logger.info("No images found in document")
        else:
            logger.info(f"Created direct links for {len(image_mappings)} images")

        return image_mappings

    def _enhance_document_with_mappings(self, document: Dict, image_mappings: Dict[str, str]) -> None:
        """Add image mappings to document JSON (mutates *document* in place)."""
        # Top-level mapping table for consumers of the JSON export.
        document['imageMappings'] = image_mappings

        # Also annotate each inline object with its resolved direct URL.
        inline_objects = self._extract_inline_objects(document)
        for object_id, inline_object in inline_objects.items():
            if object_id in image_mappings:
                embedded_object = inline_object.get('inlineObjectProperties', {}).get('embeddedObject', {})
                if 'imageProperties' in embedded_object:
                    embedded_object['imageProperties']['directUrl'] = image_mappings[object_id]

    def export_and_convert_document(self) -> bool:
        """Main function that exports the document and converts it to Markdown.

        Returns:
            True on success; False on any handled failure (errors are logged).
        """
        try:
            self._ensure_output_directory()

            # Fetch document.
            logger.info(f"Fetching document: {self.config.document_id}")
            start_time = time.time()

            # includeTabsContent=True returns the multi-tab document layout.
            document = self.docs_service.documents().get(
                documentId=self.config.document_id,
                includeTabsContent=True
            ).execute()

            fetch_time = time.time() - start_time
            logger.info(f"Document fetched successfully in {fetch_time:.2f} seconds")

            # Extract images and create direct link mappings.
            inline_objects = self._extract_inline_objects(document)
            logger.info(f"Found {len(inline_objects)} inline objects")

            mapping_start = time.time()
            self.image_mappings = self._create_image_mappings(inline_objects)
            mapping_time = time.time() - mapping_start

            self._enhance_document_with_mappings(document, self.image_mappings)

            # Save enhanced JSON.
            json_start = time.time()
            output_json_path = Path(self.config.output_base_dir) / self.config.output_json
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(document, f, indent=2, ensure_ascii=False)
            json_time = time.time() - json_start

            # Convert to Markdown and save.
            markdown_start = time.time()
            converter = MarkdownConverter(self.image_mappings)
            markdown_content = converter.convert_document_to_markdown(document)

            output_md_path = Path(self.config.output_base_dir) / self.config.output_markdown
            with open(output_md_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            markdown_time = time.time() - markdown_start

            # Summary metrics.
            total_time = time.time() - start_time
            linked_images = len(self.image_mappings)
            total_images = len(inline_objects)
            markdown_lines = len(markdown_content.split('\n'))

            logger.info("Export and conversion completed successfully!")
            logger.info(f"Total time: {total_time:.2f} seconds")
            logger.info(f"Document fetch: {fetch_time:.2f} seconds")
            logger.info(f"Image mapping: {mapping_time:.2f} seconds")
            logger.info(f"JSON export: {json_time:.2f} seconds")
            logger.info(f"Markdown conversion: {markdown_time:.2f} seconds")
            logger.info(f"Images linked: {linked_images}/{total_images}")
            logger.info(f"Generated {markdown_lines} lines of Markdown")
            logger.info(f"Output JSON: {output_json_path}")
            logger.info(f"Output Markdown: {output_md_path}")
            logger.info(f"Log file: {Path(self.config.output_base_dir) / 'export.log'}")

            if linked_images < total_images:
                logger.warning(f"Some images could not be linked ({total_images - linked_images} failed)")

            return True

        except HttpError as e:
            # logger.exception keeps the traceback for API failures.
            logger.exception(f"Google API error: {e}")
            return False
        except Exception as e:
            logger.exception(f"Unexpected error during export: {e}")
            return False
|
|
|
def setup_logging(output_base_dir: str):
    """Initialise logging: INFO level, console plus export.log in *output_base_dir*."""
    out_dir = Path(output_base_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # One file handler (inside the output directory) and one console handler.
    handlers = [
        logging.FileHandler(out_dir / 'export.log'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=handlers,
        force=True,  # replace any previously installed configuration
    )
|
|
|
def create_config_from_args() -> Config:
    """Build a Config from command-line arguments (--document-id is required)."""
    parser = argparse.ArgumentParser(
        description='Export Google Docs and convert to Markdown')
    parser.add_argument('--document-id', required=True,
                        help='Google Document ID')
    parser.add_argument('--service-account', default='service-account-key.json',
                        help='Path to service account JSON file')
    parser.add_argument('--output-base-dir', default='output',
                        help='Base output directory for all files')
    parser.add_argument('--output-json', default='document.json',
                        help='Output JSON filename')
    parser.add_argument('--output-markdown', default='document.md',
                        help='Output Markdown filename')

    parsed = parser.parse_args()

    return Config(
        document_id=parsed.document_id,
        service_account_file=parsed.service_account,
        output_base_dir=parsed.output_base_dir,
        output_json=parsed.output_json,
        output_markdown=parsed.output_markdown,
    )
|
|
|
def main():
    """Main function with complete export and conversion workflow.

    Returns:
        Process exit code: 0 on success, 1 on any failure.
    """
    try:
        # Backward compatibility: with no CLI arguments, fall back to a
        # hard-coded demo document instead of requiring --document-id.
        # (sys is now a module-level import rather than a function-local one.)
        if len(sys.argv) == 1:
            config = Config(
                document_id='12cWQaGSWtjTImNPQtVAuqLwg579vZoc4gFWjHCTz8Aw'
            )
        else:
            config = create_config_from_args()

        # Logging must be configured before the processor starts emitting.
        setup_logging(config.output_base_dir)

        # Run complete export and conversion.
        processor = CompleteGoogleDocsProcessor(config)
        success = processor.export_and_convert_document()

        if success:
            logger.info("Export and conversion completed successfully!")
            return 0
        logger.error("Export and conversion failed!")
        return 1

    except KeyboardInterrupt:
        logger.info("Export cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: log with traceback, convert to exit code.
        logger.exception(f"Fatal error: {e}")
        return 1
|
|
|
if __name__ == '__main__':
    # Use sys.exit: the builtin exit() is injected by the site module and
    # is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(main())
# Document used in experiment:
# https://docs.google.com/document/d/12cWQaGSWtjTImNPQtVAuqLwg579vZoc4gFWjHCTz8Aw/edit?tab=t.0#heading=h.1wbj37e80099
# Example run:
#   python export-and-convert.py --document-id 12cWQaGSWtjTImNPQtVAuqLwg579vZoc4gFWjHCTz8Aw