#!/usr/bin/env python3
|
""" |
|
Google Docs Export and Markdown Conversion Tool |
|
|
|
Single-command solution that exports Google Documents to JSON format and converts |
|
everything to clean Markdown format with direct image links. |
|
|
|
Prerequisites: |
|
pip install google-auth google-api-python-client |
|
|
|
Quick Usage: |
|
python export-and-convert.py --document-id YOUR_DOC_ID |
|
|
|
Supported Google Docs Elements: |
|
✅ Text formatting: bold, italic, strikethrough, underline, superscript, subscript |
|
✅ Text styling: colors, fonts, font sizes, small caps |
|
✅ Headings: H1-H6 with proper Markdown conversion |
|
✅ Links: URLs and rich links |
|
✅ Images: inline images with direct URL linking |
|
✅ Tables: full table support with images in cells |
|
✅ Lists: bullet points and numbered lists with nesting |
|
✅ Breaks: page breaks, column breaks, section breaks, horizontal rules |
|
✅ Special elements: footnotes, equations, auto-text, @ mentions |
|
✅ Document structure: tabs, table of contents |
|
|
|
Features: |
|
- Direct image linking (no downloads required) |
|
- Professional logging with performance metrics |
|
- CLI interface with full customization options |
|
- Comprehensive error handling |
|
|
|
Output: |
|
- output/document.json: Complete document structure with image references |
|
- output/document.md: Clean Markdown version with direct image links |
|
- output/export.log: Detailed execution log |
|
|
|
For complete documentation, usage examples, troubleshooting, and advanced configuration |
|
options, see USAGE_GUIDE.md and README.md. |
|
""" |
|
|
|
import argparse
import json
import logging
import os
import re
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Optional

from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
|
|
|
# Configure logging - will be set up dynamically in main().
# setup_logging() installs the handlers/level later; this is only the
# module-level logger instance shared by all classes below.
logger = logging.getLogger(__name__)
|
|
|
@dataclass
class Config:
    """Configuration class for the export and conversion tool."""
    # ID of the Google Document to export (the long token in the doc URL).
    document_id: str
    # Path to the service-account JSON key used for API authentication.
    service_account_file: str = 'service-account-key.json'
    # Directory that receives all generated files (JSON, Markdown, log).
    output_base_dir: str = 'output'
    # Filename (inside output_base_dir) for the exported document JSON.
    output_json: str = 'document.json'
    # Filename (inside output_base_dir) for the converted Markdown.
    output_markdown: str = 'document.md'
|
|
|
# Default configuration
# OAuth scopes: read-only Docs access for the document body, and read-only
# Drive access (used when resolving embedded object content).
SCOPES = [
    'https://www.googleapis.com/auth/documents.readonly',
    'https://www.googleapis.com/auth/drive.readonly'
]
|
|
|
class MarkdownConverter:
    """Handles conversion from Google Docs JSON to Markdown format.

    Images are emitted as direct Markdown links using the
    object_id -> contentUri mappings supplied at construction time;
    nothing is downloaded.
    """

    def __init__(self, image_mappings: Dict[str, str]):
        """
        Args:
            image_mappings: Maps inline object IDs to direct image URLs
                (the ``contentUri`` from the document's inline objects).
        """
        self.image_mappings = image_mappings

    def convert_text_style_to_markdown(self, text: str, text_style: Dict) -> str:
        """Apply a Google Docs textStyle to *text* and return Markdown/HTML.

        Formatting is applied inside-out; a link (if present) is applied
        last so that it wraps the fully formatted text.
        """
        # Capture the link target first; it is applied last.
        link_url = None
        if 'link' in text_style:
            link_url = text_style['link'].get('url', '')

        result = text

        # Superscript/subscript have no Markdown syntax; use HTML.
        baseline_offset = text_style.get('baselineOffset', '')
        if baseline_offset == 'SUPERSCRIPT':
            result = f"<sup>{result}</sup>"
        elif baseline_offset == 'SUBSCRIPT':
            result = f"<sub>{result}</sub>"

        if text_style.get('bold'):
            result = f"**{result}**"

        if text_style.get('italic'):
            result = f"*{result}*"

        if text_style.get('strikethrough'):
            result = f"~~{result}~~"

        # Underline is not standard Markdown; fall back to HTML.
        if text_style.get('underline'):
            result = f"<u>{result}</u>"

        # Small caps only exist as CSS.
        if text_style.get('smallCaps'):
            result = f"<span style=\"font-variant: small-caps\">{result}</span>"

        # Foreground/background colors become inline CSS on a span.
        fg_color = text_style.get('foregroundColor', {}).get('color', {})
        bg_color = text_style.get('backgroundColor', {}).get('color', {})

        if fg_color or bg_color:
            style_parts = []
            if fg_color and 'rgbColor' in fg_color:
                style_parts.append(self._rgb_css('color', fg_color['rgbColor']))
            if bg_color and 'rgbColor' in bg_color:
                style_parts.append(self._rgb_css('background-color', bg_color['rgbColor']))

            if style_parts:
                style_attr = "; ".join(style_parts)
                result = f"<span style=\"{style_attr}\">{result}</span>"

        # Non-default font size (Docs' default body size is 11pt).
        font_size = text_style.get('fontSize', {})
        if font_size and 'magnitude' in font_size:
            size = font_size['magnitude']
            if size != 11:  # Default size, skip if normal
                result = f"<span style=\"font-size: {size}pt\">{result}</span>"

        # Explicit font family.
        font_family = text_style.get('weightedFontFamily', {})
        if font_family and 'fontFamily' in font_family:
            family = font_family['fontFamily']
            result = f"<span style=\"font-family: {family}\">{result}</span>"

        # Apply link formatting LAST, wrapping the formatted text.
        if link_url:
            clean_text = self._clean_text_for_link(result)
            result = f"[{clean_text}]({link_url})"

        return result

    @staticmethod
    def _rgb_css(css_property: str, rgb: Dict) -> str:
        """Build a ``prop: rgb(r,g,b)`` CSS declaration from an API rgbColor.

        The API expresses channels as floats in [0, 1]; missing channels
        default to 0.
        """
        r = int(rgb.get('red', 0) * 255)
        g = int(rgb.get('green', 0) * 255)
        b = int(rgb.get('blue', 0) * 255)
        return f"{css_property}: rgb({r},{g},{b})"

    def _clean_text_for_link(self, text: str) -> str:
        """Clean up text content for use in markdown links.

        Complex HTML styling inside link text renders poorly, so spans are
        unwrapped and remaining tags stripped while basic Markdown markers
        (bold/italic) are preserved.
        """
        # Remove complex style spans but keep the inner text.
        text = re.sub(r'<span style="[^"]*">([^<]*)</span>', r'\1', text)

        # Convert HTML formatting to simple equivalents for link text.
        text = re.sub(r'<u>([^<]*)</u>', r'\1', text)  # Remove underline (redundant in links)
        text = re.sub(r'<sup>([^<]*)</sup>', r'^\1', text)  # Superscript
        text = re.sub(r'<sub>([^<]*)</sub>', r'_\1', text)  # Subscript

        # Clean up any remaining HTML tags (keep text content only).
        text = re.sub(r'<[^>]+>', '', text)

        return text.strip()

    def get_heading_level(self, named_style_type: str) -> str:
        """Convert Google Docs heading style to a Markdown heading prefix.

        Returns an empty string for non-heading styles (e.g. NORMAL_TEXT).
        """
        heading_map = {
            'HEADING_1': '#',
            'HEADING_2': '##',
            'HEADING_3': '###',
            'HEADING_4': '####',
            'HEADING_5': '#####',
            'HEADING_6': '######'
        }
        return heading_map.get(named_style_type, '')

    def _element_to_markdown(self, element: Dict, inline_objects: Dict) -> str:
        """Convert one paragraph element to its Markdown text fragment."""
        if 'textRun' in element:
            text_run = element['textRun']
            content = text_run.get('content', '')
            text_style = text_run.get('textStyle', {})
            return self.convert_text_style_to_markdown(content, text_style)

        if 'inlineObjectElement' in element:
            # Images become direct Markdown links when a mapping exists.
            inline_object_id = element['inlineObjectElement'].get('inlineObjectId', '')
            if inline_object_id in self.image_mappings:
                # FIX: the URL was previously dropped (f-string had no
                # placeholder); emit a real Markdown image.
                image_url = self.image_mappings[inline_object_id]
                return f"\n![Image]({image_url})\n"
            return f"\n[Image: {inline_object_id}]\n"

        if 'pageBreak' in element:
            return "\n\n---\n\n"

        if 'columnBreak' in element:
            return "\n\n<!-- Column Break -->\n\n"

        if 'footnoteReference' in element:
            footnote_ref = element['footnoteReference']
            # FIX: use the footnote ID so the reference matches the
            # "[^id]: ..." definitions emitted by convert_document_to_markdown
            # (the number never matched). Fall back to the number if absent.
            label = footnote_ref.get('footnoteId') or footnote_ref.get('footnoteNumber', '?')
            return f"[^{label}]"

        if 'horizontalRule' in element:
            return "\n\n---\n\n"

        if 'equation' in element:
            # The API does not expose equation contents; emit a placeholder.
            return "$$[Equation]$$"

        if 'autoText' in element:
            # Auto text (page numbers, dates, etc.).
            auto_text_type = element['autoText'].get('type', '')
            return f"[{auto_text_type}]"

        if 'richLink' in element:
            rich_link_properties = element['richLink'].get('richLinkProperties', {})
            url = rich_link_properties.get('uri', '')
            # Prefer the suggested title; otherwise use a generic label.
            display_text = rich_link_properties.get('title', '') or "Rich Link"
            if url:
                return f"[{display_text}]({url})"
            return display_text

        if 'person' in element:
            # @ mentions.
            person_properties = element['person'].get('personProperties', {})
            name = person_properties.get('name', 'Unknown')
            email = person_properties.get('email', '')
            return f"@{name} ({email})" if email else f"@{name}"

        if 'paragraph' in element:
            # Nested paragraph (e.g. inside a list item).
            return self.process_paragraph(element['paragraph'], inline_objects)

        return ""

    def _apply_list_format(self, text: str, bullet_style: Dict) -> str:
        """Prefix *text* with a Markdown list marker derived from bullet info."""
        nesting_level = bullet_style.get('nestingLevel', 0)
        indent = "  " * nesting_level

        glyph_type = bullet_style.get('glyphType', '')
        glyph_format = bullet_style.get('glyphFormat', '')

        if glyph_type in ('DECIMAL', 'ALPHA', 'ROMAN'):
            # Ordered list: Markdown renderers renumber, so "1." suffices.
            return f"{indent}1. {text}"
        if glyph_format and any(ph in glyph_format for ph in ('%0', '%1', '%2')):
            # Numbered list identified by its glyph format placeholders.
            return f"{indent}1. {text}"
        # Bullet list (default).
        return f"{indent}- {text}"

    def process_paragraph(self, paragraph: Dict, inline_objects: Dict) -> str:
        """Process a paragraph element and convert to Markdown.

        Handles headings, list items and all supported inline element
        types (text runs, images, breaks, footnote references, equations,
        auto-text, rich links, person mentions).
        """
        elements = paragraph.get('elements', [])
        paragraph_style = paragraph.get('paragraphStyle', {})

        # Check if this is a heading.
        named_style_type = paragraph_style.get('namedStyleType', '')
        heading_prefix = self.get_heading_level(named_style_type)

        # Convert each element and stitch the fragments together.
        text_parts = [self._element_to_markdown(element, inline_objects)
                      for element in elements]
        combined_text = ''.join(text_parts).rstrip('\n')

        if heading_prefix and combined_text:
            combined_text = f"{heading_prefix} {combined_text}"

        # FIX: in the Docs API the bullet lives on the Paragraph object,
        # not on paragraphStyle — the old code never found it. Check the
        # old location too for robustness against saved JSON variants.
        bullet_style = paragraph.get('bullet') or paragraph_style.get('bullet', {})
        if bullet_style:
            combined_text = self._apply_list_format(combined_text, bullet_style)

        return combined_text

    def process_table(self, table: Dict) -> str:
        """Process a table element and convert to a Markdown pipe table.

        Cell contents are flattened to a single line; images inside cells
        become direct Markdown image links. The first row is treated as
        the header (Markdown tables require a separator after it).
        """
        rows = table.get('tableRows', [])
        if not rows:
            return ""

        markdown_rows = []

        for row_index, row in enumerate(rows):
            cell_contents = []

            for cell in row.get('tableCells', []):
                cell_text = ""
                for item in cell.get('content', []):
                    if 'paragraph' not in item:
                        continue
                    paragraph_text = ""
                    for element in item['paragraph'].get('elements', []):
                        if 'textRun' in element:
                            paragraph_text += element['textRun'].get('content', '')
                        elif 'inlineObjectElement' in element:
                            inline_object_id = element['inlineObjectElement'].get('inlineObjectId', '')
                            if inline_object_id in self.image_mappings:
                                # FIX: the URL was previously dropped here too.
                                paragraph_text += f"![Image]({self.image_mappings[inline_object_id]})"
                            else:
                                paragraph_text += f"[Image: {inline_object_id}]"
                    cell_text += paragraph_text.strip()

                # Newlines would break the pipe-table row.
                cell_contents.append(cell_text.replace('\n', ' ').strip())

            markdown_rows.append("| " + " | ".join(cell_contents) + " |")

            # Header separator after the first row.
            if row_index == 0:
                markdown_rows.append("| " + " | ".join(["---"] * len(cell_contents)) + " |")

        return "\n".join(markdown_rows)

    def convert_document_to_markdown(self, document: Dict) -> str:
        """Convert the entire document (all tabs) to Markdown.

        The document title becomes an H1; each tab's title becomes an H2
        when the document has more than one tab.
        """
        title = document.get('title', 'Untitled Document')
        markdown_lines = [f"# {title}\n"]

        tabs = document.get('tabs', [])

        for tab in tabs:
            document_tab = tab.get('documentTab', {})
            body_content = document_tab.get('body', {}).get('content', [])
            inline_objects = document_tab.get('inlineObjects', {})

            # Label tabs only when there are several.
            if len(tabs) > 1:
                tab_title = tab.get('tabProperties', {}).get('title', 'Tab')
                markdown_lines.append(f"\n## {tab_title}\n")

            for item in body_content:
                if 'paragraph' in item:
                    paragraph_text = self.process_paragraph(item['paragraph'], inline_objects)
                    if paragraph_text.strip():
                        markdown_lines.append(paragraph_text)
                        markdown_lines.append("")  # Blank line after paragraph

                elif 'table' in item:
                    table_text = self.process_table(item['table'])
                    if table_text.strip():
                        markdown_lines.append(table_text)
                        markdown_lines.append("")  # Blank line after table

                elif 'sectionBreak' in item:
                    markdown_lines.append("---\n")

                elif 'pageBreak' in item:
                    markdown_lines.append("\n---\n")

                elif 'tableOfContents' in item:
                    # No portable Markdown equivalent; leave a marker.
                    markdown_lines.append("<!-- Table of Contents -->\n")

                elif 'footnote' in item:
                    # Footnote definitions, keyed by their ID.
                    footnote = item['footnote']
                    footnote_id = footnote.get('footnoteId', '')
                    footnote_text = ""
                    for footnote_item in footnote.get('content', []):
                        if 'paragraph' in footnote_item:
                            footnote_text += self.process_paragraph(footnote_item['paragraph'], inline_objects)
                    markdown_lines.append(f"[^{footnote_id}]: {footnote_text.strip()}\n")

        # Collapse runs of 3+ newlines into paragraph breaks.
        result = "\n".join(markdown_lines)
        result = re.sub(r'\n{3,}', '\n\n', result)

        return result
|
|
|
class CompleteGoogleDocsProcessor:
    """Complete processor that handles export and markdown conversion with direct image links.

    Pipeline: authenticate -> fetch the document JSON -> map inline images
    to their direct contentUri URLs -> save enhanced JSON -> convert to
    Markdown -> save Markdown. All outputs go to ``config.output_base_dir``.
    """

    def __init__(self, config: Config):
        """Authenticate and build the Docs API client.

        Raises:
            FileNotFoundError: If the service account key file is missing.
        """
        self.config = config
        self.credentials = self._get_credentials()
        self.docs_service = build('docs', 'v1', credentials=self.credentials)
        # object_id -> direct image URL; populated during export.
        self.image_mappings: Dict[str, str] = {}

    def _get_credentials(self) -> Credentials:
        """Authenticate using service account credentials with validation."""
        try:
            if not os.path.exists(self.config.service_account_file):
                raise FileNotFoundError(f"Service account file not found: {self.config.service_account_file}")

            credentials = Credentials.from_service_account_file(
                self.config.service_account_file, scopes=SCOPES
            )
            logger.info("Successfully authenticated with Google APIs")
            return credentials
        except Exception as e:
            # logger.exception preserves the traceback in the log file.
            logger.exception(f"Authentication failed: {e}")
            raise

    def _ensure_output_directory(self) -> None:
        """Create output directory if it doesn't exist."""
        base_dir = Path(self.config.output_base_dir)
        base_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output base directory ready: {base_dir}")

    def _extract_inline_objects(self, document: Dict) -> Dict[str, Dict]:
        """Extract all inline objects from document structure.

        Checks each tab's documentTab first; falls back to the
        document-level ``inlineObjects`` used by the older, tab-less format.
        """
        inline_objects: Dict[str, Dict] = {}

        for tab in document.get('tabs', []):
            document_tab = tab.get('documentTab', {})
            inline_objects.update(document_tab.get('inlineObjects', {}))

        # Fallback: check document level (older format).
        if not inline_objects:
            inline_objects = document.get('inlineObjects', {})

        return inline_objects

    def _create_image_mappings(self, inline_objects: Dict) -> Dict[str, str]:
        """Create mappings from inline object IDs to their contentUri for direct linking."""
        image_mappings: Dict[str, str] = {}

        for object_id, inline_object in inline_objects.items():
            embedded_object = inline_object.get('inlineObjectProperties', {}).get('embeddedObject', {})
            if 'imageProperties' in embedded_object:
                content_uri = embedded_object['imageProperties'].get('contentUri')
                if content_uri:
                    image_mappings[object_id] = content_uri
                    logger.info(f"Mapped image {object_id} to direct URL")

        if not image_mappings:
            logger.info("No images found in document")
        else:
            logger.info(f"Created direct links for {len(image_mappings)} images")

        return image_mappings

    def _enhance_document_with_mappings(self, document: Dict, image_mappings: Dict[str, str]) -> None:
        """Add image mappings to document JSON (mutates *document* in place)."""
        # Top-level mapping table for consumers of the JSON export.
        document['imageMappings'] = image_mappings

        # Also annotate each inline object with its resolved direct URL.
        inline_objects = self._extract_inline_objects(document)
        for object_id, inline_object in inline_objects.items():
            if object_id in image_mappings:
                embedded_object = inline_object.get('inlineObjectProperties', {}).get('embeddedObject', {})
                if 'imageProperties' in embedded_object:
                    embedded_object['imageProperties']['directUrl'] = image_mappings[object_id]

    def export_and_convert_document(self) -> bool:
        """Main function that exports the document and converts it to Markdown.

        Returns:
            True on success; False on any handled failure (errors are logged).
        """
        try:
            self._ensure_output_directory()

            # Fetch document.
            logger.info(f"Fetching document: {self.config.document_id}")
            start_time = time.time()

            # includeTabsContent=True returns the multi-tab document layout.
            document = self.docs_service.documents().get(
                documentId=self.config.document_id,
                includeTabsContent=True
            ).execute()

            fetch_time = time.time() - start_time
            logger.info(f"Document fetched successfully in {fetch_time:.2f} seconds")

            # Extract images and create direct link mappings.
            inline_objects = self._extract_inline_objects(document)
            logger.info(f"Found {len(inline_objects)} inline objects")

            mapping_start = time.time()
            self.image_mappings = self._create_image_mappings(inline_objects)
            mapping_time = time.time() - mapping_start

            self._enhance_document_with_mappings(document, self.image_mappings)

            # Save enhanced JSON.
            json_start = time.time()
            output_json_path = Path(self.config.output_base_dir) / self.config.output_json
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(document, f, indent=2, ensure_ascii=False)
            json_time = time.time() - json_start

            # Convert to Markdown and save.
            markdown_start = time.time()
            converter = MarkdownConverter(self.image_mappings)
            markdown_content = converter.convert_document_to_markdown(document)

            output_md_path = Path(self.config.output_base_dir) / self.config.output_markdown
            with open(output_md_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            markdown_time = time.time() - markdown_start

            # Summary metrics.
            total_time = time.time() - start_time
            linked_images = len(self.image_mappings)
            total_images = len(inline_objects)
            markdown_lines = len(markdown_content.split('\n'))

            logger.info("Export and conversion completed successfully!")
            logger.info(f"Total time: {total_time:.2f} seconds")
            logger.info(f"Document fetch: {fetch_time:.2f} seconds")
            logger.info(f"Image mapping: {mapping_time:.2f} seconds")
            logger.info(f"JSON export: {json_time:.2f} seconds")
            logger.info(f"Markdown conversion: {markdown_time:.2f} seconds")
            logger.info(f"Images linked: {linked_images}/{total_images}")
            logger.info(f"Generated {markdown_lines} lines of Markdown")
            logger.info(f"Output JSON: {output_json_path}")
            logger.info(f"Output Markdown: {output_md_path}")
            logger.info(f"Log file: {Path(self.config.output_base_dir) / 'export.log'}")

            if linked_images < total_images:
                logger.warning(f"Some images could not be linked ({total_images - linked_images} failed)")

            return True

        except HttpError as e:
            # logger.exception keeps the traceback for API failures.
            logger.exception(f"Google API error: {e}")
            return False
        except Exception as e:
            logger.exception(f"Unexpected error during export: {e}")
            return False
|
|
|
def setup_logging(output_base_dir: str):
    """Initialise logging: INFO level, console plus export.log in *output_base_dir*."""
    out_dir = Path(output_base_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # One file handler (inside the output directory) and one console handler.
    handlers = [
        logging.FileHandler(out_dir / 'export.log'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=handlers,
        force=True,  # replace any previously installed configuration
    )
|
|
|
def create_config_from_args() -> Config:
    """Build a Config from command-line arguments (--document-id is required)."""
    parser = argparse.ArgumentParser(
        description='Export Google Docs and convert to Markdown')
    parser.add_argument('--document-id', required=True,
                        help='Google Document ID')
    parser.add_argument('--service-account', default='service-account-key.json',
                        help='Path to service account JSON file')
    parser.add_argument('--output-base-dir', default='output',
                        help='Base output directory for all files')
    parser.add_argument('--output-json', default='document.json',
                        help='Output JSON filename')
    parser.add_argument('--output-markdown', default='document.md',
                        help='Output Markdown filename')

    parsed = parser.parse_args()

    return Config(
        document_id=parsed.document_id,
        service_account_file=parsed.service_account,
        output_base_dir=parsed.output_base_dir,
        output_json=parsed.output_json,
        output_markdown=parsed.output_markdown,
    )
|
|
|
def main():
    """Main function with complete export and conversion workflow.

    Returns:
        Process exit code: 0 on success, 1 on any failure.
    """
    try:
        # Backward compatibility: with no CLI arguments, fall back to a
        # hard-coded demo document instead of requiring --document-id.
        # (sys is now a module-level import rather than a function-local one.)
        if len(sys.argv) == 1:
            config = Config(
                document_id='12cWQaGSWtjTImNPQtVAuqLwg579vZoc4gFWjHCTz8Aw'
            )
        else:
            config = create_config_from_args()

        # Logging must be configured before the processor starts emitting.
        setup_logging(config.output_base_dir)

        # Run complete export and conversion.
        processor = CompleteGoogleDocsProcessor(config)
        success = processor.export_and_convert_document()

        if success:
            logger.info("Export and conversion completed successfully!")
            return 0
        logger.error("Export and conversion failed!")
        return 1

    except KeyboardInterrupt:
        logger.info("Export cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: log with traceback, convert to exit code.
        logger.exception(f"Fatal error: {e}")
        return 1
|
|
|
if __name__ == '__main__':
    # Use sys.exit: the builtin exit() is injected by the site module and
    # is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(main())
# Document used in experiment:
# https://docs.google.com/document/d/12cWQaGSWtjTImNPQtVAuqLwg579vZoc4gFWjHCTz8Aw/edit?tab=t.0#heading=h.1wbj37e80099
# Example run:
#   python export-and-convert.py --document-id 12cWQaGSWtjTImNPQtVAuqLwg579vZoc4gFWjHCTz8Aw