Skip to content

Instantly share code, notes, and snippets.

@abunimeh
Forked from N8python/math-assist-mlx.py
Last active September 24, 2025 04:47
Show Gist options
  • Save abunimeh/5b34f395087299cfea9ff848234da61a to your computer and use it in GitHub Desktop.
Save abunimeh/5b34f395087299cfea9ff848234da61a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Math Problem Solver CLI
This script extracts mathematical problems from images using vision-language models,
solves them using specialized math models, and generates beautiful HTML output
with MathJax rendering.
Dependencies:
- mlx: Apple's machine learning framework
- mlx_vlm: Vision-language model support for MLX
- mlx_lm: Language model support for MLX
[dependencies]
python = ">=3.13.7,<3.14"
[pypi-dependencies]
mlx = ">=0.29.1, <0.30"
mlx-vlm = ">=0.3.3, <0.4"
Usage:
python3 math_solver_cli.py [image_path]
If no image path is provided, defaults to 'ss.png' in the current directory.
Output:
Generates an HTML file with the extracted problem, step-by-step solution,
and summary, all properly rendered with MathJax for mathematical notation.
"""
import html
import os
import sys
import time
from pathlib import Path

# ML imports
import mlx.core as mx
from mlx_vlm import load, generate, stream_generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config
from mlx_lm import (
    load as lm_load,
    generate as lm_generate,
    stream_generate as lm_stream_generate,
)
class MathSolver:
"""
A class that orchestrates the math problem solving pipeline.
This class handles:
1. Loading and managing three different ML models
2. Extracting mathematical problems from images
3. Solving the extracted problems
4. Summarizing solutions
5. Generating HTML output with MathJax rendering
Attributes:
model: Vision-language model for image analysis (Pixtral)
processor: Image processor for the vision model
config: Configuration for the vision model
answer_model: Specialized math solving model (Qwen2.5-Math)
answer_tokenizer: Tokenizer for the math model
summarizer_model: Model for summarizing solutions (Llama-3.2)
summarizer_tokenizer: Tokenizer for the summarizer
"""
def __init__(self):
"""
Initialize the MathSolver by loading all required models.
This loads three models:
1. Pixtral-12B for vision-language tasks (OCR and LaTeX extraction)
2. Qwen2.5-Math-7B for solving mathematical problems
3. Llama-3.2-3B for summarizing solutions in plain language
All models are loaded in 4-bit quantized versions for efficiency.
"""
print("Loading models...")
# Load vision-language model for OCR and problem extraction
# Pixtral is chosen for its strong performance on mathematical notation
model_path = "mlx-community/pixtral-12b-4bit"
self.model, self.processor = load(model_path)
self.config = load_config(model_path)
# Load math solving model
# Qwen2.5-Math is specifically fine-tuned for mathematical reasoning
self.answer_model, self.answer_tokenizer = lm_load(
"mlx-community/Qwen2.5-Math-7B-Instruct-4bit"
)
# Load summarizer model
# Llama-3.2 is used for its strong instruction-following capabilities
self.summarizer_model, self.summarizer_tokenizer = lm_load(
"mlx-community/Llama-3.2-3B-Instruct-4bit"
)
print("Models loaded successfully!")
def extract_problem(self, image_path):
"""
Extract mathematical text and problems from an image using vision-language model.
This method uses Pixtral to analyze the image and extract any mathematical
content, formatting it as LaTeX. The model is instructed to extract problems
without solving them.
Args:
image_path (str): Path to the image file containing math problems
Returns:
str: Extracted mathematical content in LaTeX format, or text containing
'\\skip' if no math content is found
Note:
The extraction process streams tokens for real-time feedback,
displaying dots as progress indicators.
"""
print("Analyzing image...")
# Craft a specific prompt for mathematical extraction
# The prompt explicitly asks for LaTeX formatting and problem extraction only
prompt = "Extract all text, problem statements, and math notation from the image, and format the result in LaTeX. If there is a problem statement, you *must* extract it. Do NOT attempt to solve the problem. If the image doesn't contain anything math-related, output the latex \\skip."
# Format the prompt according to the model's chat template
formatted_prompt = apply_chat_template(
self.processor, self.config, prompt, num_images=1
)
output = ""
# Stream generate for real-time feedback during extraction
for result in stream_generate(
self.model,
self.processor,
formatted_prompt,
image=image_path,
verbose=False,
max_tokens=4096, # Allow for long mathematical content
):
output += result.text
print(".", end="", flush=True) # Progress indicator
print("\n")
return output
def solve_problem(self, problem_text):
"""
Solve the extracted mathematical problem using the specialized math model.
This method uses Qwen2.5-Math to solve the problem step-by-step,
emphasizing clear reasoning and simplified final answers.
Args:
problem_text (str): The mathematical problem in LaTeX format
Returns:
str: Detailed step-by-step solution with mathematical reasoning
Note:
The model is prompted to think step-by-step and simplify the final answer.
Progress is shown via dots during generation.
"""
print("Solving problem...")
# Create a prompt that encourages step-by-step reasoning
prompt = f"Solve the following problem - ensure you simplify your answer as much as possible. Think step by step to get to your conclusion. The problem is below: \n{problem_text}"
# Format as a chat conversation
messages = [{"role": "user", "content": prompt}]
prompt = self.answer_tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
output = ""
# Stream generate the solution for real-time feedback
for result in lm_stream_generate(
self.answer_model,
self.answer_tokenizer,
prompt,
max_tokens=4096, # Allow for detailed solutions
):
output += result.text
print(".", end="", flush=True) # Progress indicator
print("\n")
return output
def summarize_solution(self, solution):
"""
Create a concise summary of the mathematical solution.
This method uses Llama-3.2 to extract just the final answer from
the detailed solution, formatting it in plain language suitable
for quick understanding.
Args:
solution (str): The detailed step-by-step solution
Returns:
str: A concise summary of the answer in plain language
Note:
The system prompt instructs the model to write out negative numbers
in words (e.g., "negative 2" instead of "-2") for clarity.
"""
print("Summarizing solution...")
# Create a prompt asking for just the answer
summarize_prompt = f"Below is a solved math problem - write out the answer presented without any fancy formatting: \n{solution}"
# Use a system prompt to ensure proper formatting of negative numbers
formatted_summarize_prompt = self.summarizer_tokenizer.apply_chat_template(
[
{
"role": "system",
"content": "You are an AI that summarizes solutions. When doing so, you write negative numbers as 'negative 2' instead of '-2'.",
},
{"role": "user", "content": summarize_prompt},
],
tokenize=False,
add_generation_prompt=True,
)
output = ""
# Stream generate the summary
for result in lm_stream_generate(
self.summarizer_model,
self.summarizer_tokenizer,
formatted_summarize_prompt,
max_tokens=4096, # Usually much shorter, but allow flexibility
):
output += result.text
print(".", end="", flush=True) # Progress indicator
print("\n")
return output
def generate_html(self, problem, solution, summary, output_path):
"""
Generate a beautifully formatted HTML file with MathJax for LaTeX rendering.
This method creates an HTML document that properly displays mathematical
notation using MathJax. It includes custom styling for different sections
and handles cleaning of markdown artifacts from the extracted content.
Args:
problem (str): The extracted problem text (may contain markdown artifacts)
solution (str): The detailed step-by-step solution
summary (str): The concise summary of the answer
output_path (str): Path where the HTML file will be saved
Note:
The method automatically removes markdown code fences (```latex...```)
that may be present in the extracted problem text to ensure proper
MathJax rendering.
"""
# Clean the problem text by removing markdown code fences
# This is necessary because the vision model sometimes wraps LaTeX in markdown
if problem.startswith("```latex"):
problem = problem[8:] # Remove "```latex\n"
if problem.endswith("```"):
problem = problem[:-3] # Remove closing "```"
problem = problem.strip()
# Generate the HTML content with embedded styles and MathJax configuration
html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Math Problem Solution</title>
<!-- Polyfill for older browsers -->
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<!-- MathJax for rendering LaTeX mathematics -->
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<style>
/* Global styles */
body {{
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}}
/* Main container with shadow effect */
.container {{
background-color: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}}
/* Header styling */
h1 {{
color: #333;
border-bottom: 2px solid #4CAF50;
padding-bottom: 10px;
}}
h2 {{
color: #666;
margin-top: 30px;
}}
/* Problem section with blue accent */
.problem-section {{
background-color: #f9f9f9;
padding: 20px;
border-left: 4px solid #2196F3;
margin: 20px 0;
border-radius: 5px;
}}
/* Solution section with green accent */
.solution-section {{
background-color: #f0f8ff;
padding: 20px;
border-left: 4px solid #4CAF50;
margin: 20px 0;
border-radius: 5px;
}}
/* Summary section with orange accent */
.summary-section {{
background-color: #fff9e6;
padding: 20px;
border-left: 4px solid #FFA726;
margin: 20px 0;
border-radius: 5px;
font-weight: bold;
}}
/* Timestamp styling */
.timestamp {{
color: #999;
font-size: 0.9em;
text-align: right;
margin-top: 20px;
}}
/* Math content container with overflow handling */
.math-content {{
overflow-x: auto;
padding: 10px 0;
}}
</style>
</head>
<body>
<div class="container">
<h1>Math Problem Solution</h1>
<div class="problem-section">
<h2>Problem</h2>
<div class="math-content">
{problem}
</div>
</div>
<div class="solution-section">
<h2>Solution</h2>
<div class="math-content">
{solution}
</div>
</div>
<div class="summary-section">
<h2>Summary</h2>
<p>{summary}</p>
</div>
<div class="timestamp">
Generated on: {time.strftime("%Y-%m-%d %H:%M:%S")}
</div>
</div>
</body>
</html>"""
# Write the HTML content to file
with open(output_path, "w") as f:
f.write(html_content)
print(f"HTML output saved to: {output_path}")
def process_image(self, image_path):
"""
Main processing pipeline that coordinates the entire math solving workflow.
This method orchestrates the complete process from image analysis to
HTML generation, including error handling and progress reporting.
Args:
image_path (str): Path to the image file to process
Returns:
str or None: The filename of the generated HTML output, or None if
no math content was found or an error occurred
Workflow:
1. Verify the image file exists
2. Extract mathematical content from the image
3. Check if math content was found
4. Solve the extracted problem
5. Generate a summary of the solution
6. Create an HTML file with all results
"""
# Validate that the image file exists
if not os.path.exists(image_path):
print(f"Error: Image file not found: {image_path}")
return
# Step 1: Extract problem from image
problem = self.extract_problem(image_path)
# Check if the image contains mathematical content
# The vision model outputs '\skip' when no math is detected
if "\\skip" in problem:
print("No math content found in the image.")
return
# Display the extracted problem for verification
print("\n=== Extracted Problem ===")
print(problem)
# Step 2: Solve the problem using the math model
solution = self.solve_problem(problem)
print("\n=== Solution ===")
print(solution)
# Step 3: Generate a concise summary
summary = self.summarize_solution(solution)
print("\n=== Summary ===")
print(summary)
# Step 4: Generate HTML output with timestamp in filename
output_filename = f"math_solution_{time.strftime('%Y%m%d_%H%M%S')}.html"
self.generate_html(problem, solution, summary, output_filename)
return output_filename
def main():
    """
    Main entry point for the Math Solver CLI application.

    Reads the image path from the command line, checks that it points to an
    existing file, and only then loads the models and runs the pipeline.

    Command-line usage:
        python3 math_solver_cli.py [image_path]

    If no image path is provided, defaults to 'ss.png' in the current directory.
    On success, the name of the generated HTML file is reported.
    """
    # Parse command-line arguments; default to ss.png if none was given
    image_path = sys.argv[1] if len(sys.argv) > 1 else "ss.png"

    # Validate the path *before* constructing MathSolver: its constructor
    # loads three models, which is far too expensive a way to find a typo.
    if not os.path.isfile(image_path):
        print(f"Error: Image file not found: {image_path}")
        return

    # Initialize the solver (loads all ML models)
    solver = MathSolver()

    # Process the image and generate output
    output_file = solver.process_image(image_path)

    # Report success if an output file was generated
    if output_file:
        print(f"\nSuccess! Open {output_file} in your browser to view the results.")


if __name__ == "__main__":
    # Only run main() if this script is executed directly
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment