#!/usr/bin/env python3
"""
Math Problem Solver CLI

This script extracts mathematical problems from images using vision-language models,
solves them using specialized math models, and generates beautiful HTML output
with MathJax rendering.

Dependencies:
    - mlx: Apple's machine learning framework
    - mlx_vlm: Vision-language model support for MLX
    - mlx_lm: Language model support for MLX

[dependencies]
python = ">=3.13.7,<3.14"

[pypi-dependencies]
mlx = ">=0.29.1, <0.30"
mlx-vlm = ">=0.3.3, <0.4"

Usage:
    python3 math_solver_cli.py [image_path]

If no image path is provided, defaults to 'ss.png' in the current directory.

Output:
    Generates an HTML file with the extracted problem, step-by-step solution,
    and summary, all properly rendered with MathJax for mathematical notation.
"""
import os
import sys
import time
from pathlib import Path

# ML imports
import mlx.core as mx
from mlx_vlm import load, generate, stream_generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config
from mlx_lm import (
    load as lm_load,
    generate as lm_generate,
    stream_generate as lm_stream_generate,
)


class MathSolver:
    """
    A class that orchestrates the math problem solving pipeline.

    This class handles:
        1. Loading and managing three different ML models
        2. Extracting mathematical problems from images
        3. Solving the extracted problems
        4. Summarizing solutions
        5. Generating HTML output with MathJax rendering

    Attributes:
        model: Vision-language model for image analysis (Pixtral)
        processor: Image processor for the vision model
        config: Configuration for the vision model
        answer_model: Specialized math solving model (Qwen2.5-Math)
        answer_tokenizer: Tokenizer for the math model
        summarizer_model: Model for summarizing solutions (Llama-3.2)
        summarizer_tokenizer: Tokenizer for the summarizer
    """

    def __init__(self):
        """
        Initialize the MathSolver by loading all required models.

        This loads three models:
            1. Pixtral-12B for vision-language tasks (OCR and LaTeX extraction)
            2. Qwen2.5-Math-7B for solving mathematical problems
            3. Llama-3.2-3B for summarizing solutions in plain language

        All models are loaded in 4-bit quantized versions for efficiency.
        """
        print("Loading models...")

        # Load vision-language model for OCR and problem extraction
        # Pixtral is chosen for its strong performance on mathematical notation
        model_path = "mlx-community/pixtral-12b-4bit"
        self.model, self.processor = load(model_path)
        self.config = load_config(model_path)

        # Load math solving model
        # Qwen2.5-Math is specifically fine-tuned for mathematical reasoning
        self.answer_model, self.answer_tokenizer = lm_load(
            "mlx-community/Qwen2.5-Math-7B-Instruct-4bit"
        )

        # Load summarizer model
        # Llama-3.2 is used for its strong instruction-following capabilities
        self.summarizer_model, self.summarizer_tokenizer = lm_load(
            "mlx-community/Llama-3.2-3B-Instruct-4bit"
        )

        print("Models loaded successfully!")
    def extract_problem(self, image_path):
        """
        Extract mathematical text and problems from an image using the vision-language model.

        This method uses Pixtral to analyze the image and extract any mathematical
        content, formatting it as LaTeX. The model is instructed to extract problems
        without solving them.

        Args:
            image_path (str): Path to the image file containing math problems

        Returns:
            str: Extracted mathematical content in LaTeX format, or text containing
                '\\skip' if no math content is found

        Note:
            The extraction process streams tokens for real-time feedback,
            displaying dots as progress indicators.
        """
        print("Analyzing image...")

        # Craft a specific prompt for mathematical extraction
        # The prompt explicitly asks for LaTeX formatting and problem extraction only
        prompt = "Extract all text, problem statements, and math notation from the image, and format the result in LaTeX. If there is a problem statement, you *must* extract it. Do NOT attempt to solve the problem. If the image doesn't contain anything math-related, output the latex \\skip."

        # Format the prompt according to the model's chat template
        formatted_prompt = apply_chat_template(
            self.processor, self.config, prompt, num_images=1
        )

        output = ""
        # Stream generate for real-time feedback during extraction
        for result in stream_generate(
            self.model,
            self.processor,
            formatted_prompt,
            image=image_path,
            verbose=False,
            max_tokens=4096,  # Allow for long mathematical content
        ):
            output += result.text
            print(".", end="", flush=True)  # Progress indicator

        print("\n")
        return output
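
    # Sketch (illustrative only): once the models are loaded, extraction can be
    # exercised on its own;
    #
    #     solver = MathSolver()
    #     latex = solver.extract_problem("ss.png")
    #     print(latex)  # the model sometimes wraps this in a ```latex ... ``` fence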
    def solve_problem(self, problem_text):
        """
        Solve the extracted mathematical problem using the specialized math model.

        This method uses Qwen2.5-Math to solve the problem step by step,
        emphasizing clear reasoning and simplified final answers.

        Args:
            problem_text (str): The mathematical problem in LaTeX format

        Returns:
            str: Detailed step-by-step solution with mathematical reasoning

        Note:
            The model is prompted to think step by step and simplify the final answer.
            Progress is shown via dots during generation.
        """
        print("Solving problem...")

        # Create a prompt that encourages step-by-step reasoning
        prompt = f"Solve the following problem - ensure you simplify your answer as much as possible. Think step by step to get to your conclusion. The problem is below: \n{problem_text}"

        # Format as a chat conversation
        messages = [{"role": "user", "content": prompt}]
        prompt = self.answer_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        output = ""
        # Stream generate the solution for real-time feedback
        for result in lm_stream_generate(
            self.answer_model,
            self.answer_tokenizer,
            prompt,
            max_tokens=4096,  # Allow for detailed solutions
        ):
            output += result.text
            print(".", end="", flush=True)  # Progress indicator

        print("\n")
        return output
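
    # Note (typical behaviour, not guaranteed): Qwen2.5-Math-style instruct models
    # tend to end with the result in \boxed{...}; summarize_solution() below
    # restates that result in plain language.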
    def summarize_solution(self, solution):
        """
        Create a concise summary of the mathematical solution.

        This method uses Llama-3.2 to extract just the final answer from
        the detailed solution, formatting it in plain language suitable
        for quick understanding.

        Args:
            solution (str): The detailed step-by-step solution

        Returns:
            str: A concise summary of the answer in plain language

        Note:
            The system prompt instructs the model to write out negative numbers
            in words (e.g., "negative 2" instead of "-2") for clarity.
        """
        print("Summarizing solution...")

        # Create a prompt asking for just the answer
        summarize_prompt = f"Below is a solved math problem - write out the answer presented without any fancy formatting: \n{solution}"

        # Use a system prompt to ensure proper formatting of negative numbers
        formatted_summarize_prompt = self.summarizer_tokenizer.apply_chat_template(
            [
                {
                    "role": "system",
                    "content": "You are an AI that summarizes solutions. When doing so, you write negative numbers as 'negative 2' instead of '-2'.",
                },
                {"role": "user", "content": summarize_prompt},
            ],
            tokenize=False,
            add_generation_prompt=True,
        )

        output = ""
        # Stream generate the summary
        for result in lm_stream_generate(
            self.summarizer_model,
            self.summarizer_tokenizer,
            formatted_summarize_prompt,
            max_tokens=4096,  # Usually much shorter, but allow flexibility
        ):
            output += result.text
            print(".", end="", flush=True)  # Progress indicator

        print("\n")
        return output
    def generate_html(self, problem, solution, summary, output_path):
        """
        Generate a beautifully formatted HTML file with MathJax for LaTeX rendering.

        This method creates an HTML document that properly displays mathematical
        notation using MathJax. It includes custom styling for different sections
        and handles cleaning of markdown artifacts from the extracted content.

        Args:
            problem (str): The extracted problem text (may contain markdown artifacts)
            solution (str): The detailed step-by-step solution
            summary (str): The concise summary of the answer
            output_path (str): Path where the HTML file will be saved

        Note:
            The method automatically removes markdown code fences (```latex...```)
            that may be present in the extracted problem text to ensure proper
            MathJax rendering.
        """
        # Clean the problem text by removing markdown code fences
        # This is necessary because the vision model sometimes wraps LaTeX in markdown
        if problem.startswith("```latex"):
            problem = problem[8:]  # Remove the leading "```latex"
        if problem.endswith("```"):
            problem = problem[:-3]  # Remove closing "```"
        problem = problem.strip()

        # Generate the HTML content with embedded styles and MathJax configuration
        # (doubled braces {{ }} produce literal braces inside the f-string)
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Math Problem Solution</title>
    <!-- Polyfill for older browsers -->
    <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
    <!-- MathJax for rendering LaTeX mathematics -->
    <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
    <style>
        /* Global styles */
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }}
        /* Main container with shadow effect */
        .container {{
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }}
        /* Header styling */
        h1 {{
            color: #333;
            border-bottom: 2px solid #4CAF50;
            padding-bottom: 10px;
        }}
        h2 {{
            color: #666;
            margin-top: 30px;
        }}
        /* Problem section with blue accent */
        .problem-section {{
            background-color: #f9f9f9;
            padding: 20px;
            border-left: 4px solid #2196F3;
            margin: 20px 0;
            border-radius: 5px;
        }}
        /* Solution section with green accent */
        .solution-section {{
            background-color: #f0f8ff;
            padding: 20px;
            border-left: 4px solid #4CAF50;
            margin: 20px 0;
            border-radius: 5px;
        }}
        /* Summary section with orange accent */
        .summary-section {{
            background-color: #fff9e6;
            padding: 20px;
            border-left: 4px solid #FFA726;
            margin: 20px 0;
            border-radius: 5px;
            font-weight: bold;
        }}
        /* Timestamp styling */
        .timestamp {{
            color: #999;
            font-size: 0.9em;
            text-align: right;
            margin-top: 20px;
        }}
        /* Math content container with overflow handling */
        .math-content {{
            overflow-x: auto;
            padding: 10px 0;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>Math Problem Solution</h1>
        <div class="problem-section">
            <h2>Problem</h2>
            <div class="math-content">
                {problem}
            </div>
        </div>
        <div class="solution-section">
            <h2>Solution</h2>
            <div class="math-content">
                {solution}
            </div>
        </div>
        <div class="summary-section">
            <h2>Summary</h2>
            <p>{summary}</p>
        </div>
        <div class="timestamp">
            Generated on: {time.strftime("%Y-%m-%d %H:%M:%S")}
        </div>
    </div>
</body>
</html>"""

        # Write the HTML content to file
        with open(output_path, "w") as f:
            f.write(html_content)

        print(f"HTML output saved to: {output_path}")
    def process_image(self, image_path):
        """
        Main processing pipeline that coordinates the entire math solving workflow.

        This method orchestrates the complete process from image analysis to
        HTML generation, including error handling and progress reporting.

        Args:
            image_path (str): Path to the image file to process

        Returns:
            str or None: The filename of the generated HTML output, or None if
                no math content was found or an error occurred

        Workflow:
            1. Verify the image file exists
            2. Extract mathematical content from the image
            3. Check if math content was found
            4. Solve the extracted problem
            5. Generate a summary of the solution
            6. Create an HTML file with all results
        """
        # Validate that the image file exists
        if not os.path.exists(image_path):
            print(f"Error: Image file not found: {image_path}")
            return

        # Step 1: Extract problem from image
        problem = self.extract_problem(image_path)

        # Check if the image contains mathematical content
        # The vision model outputs '\skip' when no math is detected
        if "\\skip" in problem:
            print("No math content found in the image.")
            return

        # Display the extracted problem for verification
        print("\n=== Extracted Problem ===")
        print(problem)

        # Step 2: Solve the problem using the math model
        solution = self.solve_problem(problem)
        print("\n=== Solution ===")
        print(solution)

        # Step 3: Generate a concise summary
        summary = self.summarize_solution(solution)
        print("\n=== Summary ===")
        print(summary)

        # Step 4: Generate HTML output with timestamp in filename
        output_filename = f"math_solution_{time.strftime('%Y%m%d_%H%M%S')}.html"
        self.generate_html(problem, solution, summary, output_filename)

        return output_filename
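
# Programmatic use (sketch, bypassing the CLI entry point below):
#
#     solver = MathSolver()
#     html_file = solver.process_image("ss.png")
#     if html_file:
#         print(f"Wrote {html_file}")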


def main():
    """
    Main entry point for the Math Solver CLI application.

    This function handles command-line argument parsing, initializes the
    MathSolver instance, and processes the specified image.

    Command-line usage:
        python3 math_solver_cli.py [image_path]

    If no image path is provided, defaults to 'ss.png' in the current directory.

    The function provides user feedback throughout the process and reports
    the final output filename upon successful completion.
    """
    # Parse command-line arguments
    if len(sys.argv) > 1:
        image_path = sys.argv[1]
    else:
        # Default to ss.png if no argument provided
        image_path = "ss.png"

    # Initialize the solver (loads all ML models)
    solver = MathSolver()

    # Process the image and generate output
    output_file = solver.process_image(image_path)

    # Report success if an output file was generated
    if output_file:
        print(f"\nSuccess! Open {output_file} in your browser to view the results.")


if __name__ == "__main__":
    # Only run main() if this script is executed directly
    main()