Skip to content

Instantly share code, notes, and snippets.

@danielhavir
Forked from danielgross/mathpix2gpt.py
Created August 1, 2023 00:17
Show Gist options
  • Save danielhavir/bfed8277446747050054d4681dbfe898 to your computer and use it in GitHub Desktop.
Save danielhavir/bfed8277446747050054d4681dbfe898 to your computer and use it in GitHub Desktop.
mathpix2gpt.py
import json
import os
import re
import sys
import time

import openai
import requests
import tiktoken
from termcolor import colored
# Assumes you have a file called ~/.openai whose content is your OpenAI API key.
# Read through a context manager so the file handle is closed deterministically.
with open(os.path.expanduser('~/.openai')) as key_file:
    openai.api_key = key_file.read().strip()

# Switch between the 32k-context GPT-4 model and the smaller GPT-3.5 model.
USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
# Maximum number of document tokens sent to the model.
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
# When True, the Mathpix Markdown is kept as-is (LaTeX escaping included);
# when False, main() strips images and some LaTeX-style markup.
IS_SCIENTIFIC = True

# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
# the second line containing your app_key
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()
def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload a PDF to the Mathpix PDF endpoint and return its ``pdf_id``.

    Args:
        file_path: Path of the PDF file to upload.
        output_format: Conversion format to request from Mathpix.

    Returns:
        The ``pdf_id`` from the response, or ``None`` when the response does
        not contain one (an error message is printed in that case).
    """
    url = 'https://api.mathpix.com/v3/pdf'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    # Serialize with json.dumps rather than string interpolation so the
    # format name is always correctly quoted/escaped.
    options = {
        'options_json': json.dumps({'conversion_formats': {output_format: True}})
    }
    with open(file_path, 'rb') as file:
        print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
        response = requests.post(url, headers=headers,
                                 files={'file': file}, data=options)

    try:
        response_data = response.json()
    except ValueError:
        # The body was not JSON (e.g. a gateway error page).
        response_data = {}

    if isinstance(response_data, dict) and 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id

    print("Error: Unable to send PDF to Mathpix")
    # Surface what the server actually said so the failure can be diagnosed.
    print(f"HTTP {response.status_code}: {response.text[:500]}")
    return None
def wait_for_processing(pdf_id, poll_interval=5, max_wait=None):
    """Poll Mathpix until the PDF identified by *pdf_id* has been processed.

    Args:
        pdf_id: Identifier returned by the upload request.
        poll_interval: Seconds to sleep between two status checks.
        max_wait: Optional upper bound, in seconds, on the total waiting
            time. ``None`` (the default) waits indefinitely.

    Returns:
        ``True`` once the reported status is ``'completed'``; ``False`` when
        the status is ``'error'``, when the request is rejected with a client
        error, or when *max_wait* elapses.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    deadline = None if max_wait is None else time.monotonic() + max_wait

    while True:
        response = requests.get(url, headers=headers)

        # A client error (bad credentials, unknown id, ...) will not fix
        # itself, so polling again would loop forever. 429 (rate limiting)
        # is transient and therefore still retried.
        if 400 <= response.status_code < 500 and response.status_code != 429:
            print(f"Error: Unable to process PDF (HTTP {response.status_code})")
            return False

        try:
            response_data = response.json()
        except ValueError:
            # Non-JSON body; treat as "no status yet" and retry.
            response_data = {}
        status = response_data.get('status', None) if isinstance(response_data, dict) else None

        if status == 'completed':
            print("Processing complete")
            return True
        if status == 'error':
            print("Error: Unable to process PDF")
            return False

        print(f"Status: {status}, waiting for processing to complete")
        if deadline is not None and time.monotonic() >= deadline:
            print("Error: Timed out waiting for Mathpix to process the PDF")
            return False
        time.sleep(poll_interval)
def download_processed_file(pdf_id, file_format, output_path):
    """Download a converted document from Mathpix and save it to disk.

    Args:
        pdf_id: Identifier of the processed PDF.
        file_format: Extension of the conversion to fetch (e.g. ``'mmd'``).
        output_path: Destination path; the response body is written as bytes.

    Returns:
        ``True`` when the file was written, ``False`` when the server
        answered with an HTTP error (nothing is written in that case).
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    response = requests.get(url, headers=headers)
    # Do not persist an error body: the caller treats an existing output
    # file as a valid cached conversion.
    if not response.ok:
        print(f"Error: Unable to download processed file (HTTP {response.status_code})")
        return False
    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")
    return True
def clear_terminal():
    """Clear the screen by running the platform's clear command in a shell."""
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)
def print_messages(messages):
    """Print every chat message as ``Role: content``, coloured by speaker.

    Assistant messages are printed in blue, every other role in white.

    Args:
        messages: Iterable of dicts with ``'role'`` and ``'content'`` keys.
    """
    # The position of a message is never used, so iterate directly.
    for message in messages:
        color = 'blue' if message['role'] == 'assistant' else 'white'
        print(
            colored(f"{message['role'].capitalize()}: {message['content']}", color))
def chat_gpt(messages):
    """Send the conversation to the chat model and record its reply.

    The reply of the first returned choice is appended to *messages* in
    place as an ``assistant`` message, and the same list is returned.
    """
    completion = openai.ChatCompletion.create(model=DEFAULT_MODEL, messages=messages)
    first_choice = completion.choices[0]
    reply = {"role": "assistant", "content": first_choice.message.content}
    messages.append(reply)
    return messages
def start_question_answering(input_path):
    """Run an interactive question/answer session about a Markdown file.

    The file content is truncated to ``TOKEN_LIMIT`` tokens, sent to the
    model as the first user message, and then questions are read from
    standard input until the user types ``exit`` (or closes the input /
    presses Ctrl-C).

    Args:
        input_path: Path of the Markdown file to discuss.
    """
    print("Using model: %s" % DEFAULT_MODEL)
    with open(input_path, encoding='utf-8') as fh:
        text = fh.read().strip()

    # Truncate on token boundaries so the document fits the model context.
    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])
    text = '\n' + '-' * 50 + '\n' + text + '\n' + '-' * 50
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)

    while True:
        clear_terminal()
        print_messages(messages)
        try:
            question = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session cleanly instead of dumping a traceback.
            print()
            break
        if question.lower() == 'exit':
            break
        if not question:
            # Nothing to ask; do not spend an API call on an empty message.
            continue
        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)
def _derive_output_paths(input_pdf_path):
    """Return ``(mmd_path, simple_md_path)`` located next to the input file."""
    # splitext only touches the final extension, so directories containing
    # ".pdf" are left alone and inputs without a ".pdf" suffix (e.g. ".PDF")
    # no longer map onto the input file itself.
    stem, _extension = os.path.splitext(input_pdf_path)
    return stem + '.md', stem + '.simple.md'


def _simplify_mmd(mmd):
    """Strip image lines and some LaTeX-style markup from Mathpix Markdown."""
    # Drop lines that start with an image reference.
    mmd = '\n'.join(
        line for line in mmd.split('\n') if not line.startswith('![]'))
    # replace \section{Title} with # Title (only the braces of the section
    # command are removed, not every closing brace in the document).
    mmd = re.sub(r'\\section\{([^{}]*)\}', r'# \1', mmd)
    # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    for escaped, plain in ((r'\$', '$'), (r'\%', '%'), (r'\(', '('), (r'\)', ')')):
        mmd = mmd.replace(escaped, plain)
    return mmd


def main():
    """Convert the PDF given on the command line and start a chat about it.

    The Mathpix Markdown (``<name>.md``) and its simplified variant
    (``<name>.simple.md``) are cached next to the input file and reused on
    later runs. Prints a usage message and returns when no path is given.
    """
    if len(sys.argv) < 2:
        script_name = os.path.basename(sys.argv[0]) if sys.argv else 'mathpix2gpt.py'
        print(f"Usage: python {script_name} <input_pdf_path>")
        return

    input_pdf_path = sys.argv[1]
    output_mmd_path, output_simplemd_path = _derive_output_paths(input_pdf_path)

    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        if pdf_id and wait_for_processing(pdf_id):
            download_processed_file(pdf_id, 'mmd', output_mmd_path)

    if not os.path.exists(output_simplemd_path):
        if not os.path.exists(output_mmd_path):
            # Upload, processing or download failed; nothing to chat about.
            print("Error: No Markdown was produced for the PDF, aborting")
            return
        with open(output_mmd_path, 'r', encoding='utf-8') as mmd_file:
            mmd = mmd_file.read()
        if not IS_SCIENTIFIC:
            # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
            # Keep it if the paper is a scientific paper.
            mmd = _simplify_mmd(mmd)
        with open(output_simplemd_path, 'w', encoding='utf-8') as simplemd_file:
            simplemd_file.write(mmd)

    start_question_answering(output_simplemd_path)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment