-
-
Save danielhavir/bfed8277446747050054d4681dbfe898 to your computer and use it in GitHub Desktop.
mathpix2gpt.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import time | |
import os | |
import sys | |
import openai | |
import tiktoken | |
from termcolor import colored | |
# Assumes you have a file called ~/.openai whose contents are your OpenAI API key.
# Read it inside a context manager so the file handle is closed deterministically.
with open(os.path.expanduser('~/.openai')) as f:
    openai.api_key = f.read().strip()

# Model selection: the 32k-context GPT-4 model when enabled, otherwise GPT-3.5.
USE_GPT_4_32K = True
DEFAULT_MODEL = "gpt-4-32k" if USE_GPT_4_32K else "gpt-3.5-turbo-0301"
# Maximum number of document tokens sent to the model.
TOKEN_LIMIT = 31000 if USE_GPT_4_32K else 3096  # Leave some room for the chat.
# When False, main() simplifies the Mathpix Markdown before it is sent to the model.
IS_SCIENTIFIC = True

# Assumes you have a file called ~/.mathpix with the first line containing your app_id and
# the second line containing your app_key
with open(os.path.expanduser('~/.mathpix')) as f:
    APP_ID = f.readline().strip()
    APP_KEY = f.readline().strip()
def send_pdf_to_mathpix(file_path, output_format='mmd'):
    """Upload a PDF to the Mathpix PDF endpoint and return its ``pdf_id``.

    Args:
        file_path: Path of the PDF file to upload.
        output_format: Conversion format to request (key of
            ``conversion_formats`` in the request options).

    Returns:
        The ``pdf_id`` string from the response, or ``None`` when the
        response does not contain one (or is not valid JSON).
    """
    # Local import keeps this block self-contained.
    import json

    url = 'https://api.mathpix.com/v3/pdf'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    # Serialize with json.dumps rather than string formatting so that any
    # format name is quoted/escaped correctly.
    options = {
        'options_json': json.dumps({'conversion_formats': {output_format: True}})
    }
    with open(file_path, 'rb') as file:
        files = {'file': file}
        print(f"Sending {os.path.getsize(file_path) / 1000} kb to Mathpix")
        response = requests.post(url, headers=headers,
                                 files=files, data=options)

    try:
        response_data = response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error page); report it as a
        # failed upload instead of crashing.
        print("Error: Unable to send PDF to Mathpix")
        print(f"Unexpected non-JSON response (HTTP {response.status_code})")
        return None

    if 'pdf_id' in response_data:
        pdf_id = response_data['pdf_id']
        print(f"PDF ID: {pdf_id}")
        return pdf_id

    print("Error: Unable to send PDF to Mathpix")
    # Show what the server actually said so the failure can be diagnosed.
    print(f"Mathpix response: {response_data}")
    return None
def wait_for_processing(pdf_id, poll_interval=5, timeout=None):
    """Poll Mathpix until the PDF identified by ``pdf_id`` finishes processing.

    Args:
        pdf_id: Identifier returned when the PDF was uploaded.
        poll_interval: Seconds to sleep between status requests.
        timeout: Maximum number of seconds to keep polling, or ``None``
            (the default) to poll indefinitely.

    Returns:
        ``True`` when the reported status is ``'completed'``; ``False`` when
        it is ``'error'`` or when ``timeout`` elapses first.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    # A monotonic clock is unaffected by wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        response = requests.get(url, headers=headers)
        response_data = response.json()
        status = response_data.get('status', None)
        if status == 'completed':
            print("Processing complete")
            return True
        if status == 'error':
            print("Error: Unable to process PDF")
            return False
        if deadline is not None and time.monotonic() >= deadline:
            print(f"Error: Timed out after {timeout} seconds waiting for PDF processing")
            return False
        print(f"Status: {status}, waiting for processing to complete")
        time.sleep(poll_interval)
def download_processed_file(pdf_id, file_format, output_path):
    """Download a processed Mathpix result and write it to ``output_path``.

    Args:
        pdf_id: Identifier of the processed PDF.
        file_format: Extension of the conversion to fetch (appended to the
            URL as ``{pdf_id}.{file_format}``).
        output_path: Destination file; written in binary mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is written in that case, so an error payload is never
            left behind at ``output_path`` to be mistaken for a real result.
    """
    url = f'https://api.mathpix.com/v3/pdf/{pdf_id}.{file_format}'
    headers = {
        'app_id': APP_ID,
        'app_key': APP_KEY
    }
    response = requests.get(url, headers=headers)
    # Fail before touching the output file: callers treat the file's
    # existence as "already converted".
    response.raise_for_status()
    with open(output_path, 'wb') as output_file:
        output_file.write(response.content)
    print(f"File downloaded to {output_path}")
def clear_terminal():
    """Clear the terminal by running the platform's clear-screen command."""
    # 'nt' gets 'cls'; every other platform gets 'clear'.
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)
def print_messages(messages):
    """Print each chat message as ``Role: content``, colored by role.

    Assistant messages are printed in blue, all other roles in white.
    """
    for entry in messages:
        role = entry['role']
        if role == 'assistant':
            shade = 'blue'
        else:
            shade = 'white'
        line = f"{role.capitalize()}: {entry['content']}"
        print(colored(line, shade))
def chat_gpt(messages):
    """Send the conversation to the chat model and append its reply.

    The assistant's answer is appended to ``messages`` in place, and the
    same list is returned.
    """
    completion = openai.ChatCompletion.create(model=DEFAULT_MODEL, messages=messages)
    reply = completion.choices[0].message.content
    messages.append(dict(role="assistant", content=reply))
    return messages
def start_question_answering(input_path):
    """Run an interactive question-answering chat about a Markdown file.

    The file's text is truncated to ``TOKEN_LIMIT`` tokens, sent to the model
    as the opening user message, and then the user is prompted for questions
    in a loop. Typing ``exit`` (case-insensitive, surrounding whitespace
    ignored) or sending end-of-file at the prompt ends the session.

    Args:
        input_path: Path of the Markdown file to discuss.
    """
    print(f"Using model: {DEFAULT_MODEL}")
    with open(input_path) as fh:
        text = fh.read().strip()

    # Truncate at the token level so the document fits the model's context.
    tokenizer = tiktoken.encoding_for_model(DEFAULT_MODEL)
    text = tokenizer.decode(tokenizer.encode(text)[:TOKEN_LIMIT])

    # Fence the document between dashed rules to set it apart from the prompt.
    separator = '-' * 50
    text = f"\n{separator}\n{text}\n{separator}"

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Here is some content from a PDF I extracted to Markdown. %s" % text},
    ]
    messages = chat_gpt(messages)

    while True:
        clear_terminal()
        print_messages(messages)
        try:
            question = input("User: ")
        except EOFError:
            # Ctrl-D / closed stdin: leave quietly instead of with a traceback.
            print()
            break
        if question.strip().lower() == 'exit':
            break
        messages.append({"role": "user", "content": question})
        messages = chat_gpt(messages)
def _simplify_mmd(mmd):
    """Return ``mmd`` with image lines, ``\\section{}`` markup and some escapes removed."""
    import re

    # Drop lines that start with an image reference.
    mmd = '\n'.join(
        line for line in mmd.split('\n') if not line.startswith('![]')
    )
    # replace \section{Title} with # Title
    # Only the brace that closes the section title is removed; closing braces
    # elsewhere in the document are left untouched.
    mmd = re.sub(r'\\section\{([^}]*)\}', r'# \1', mmd)
    # replace the "\" slash that Mathpix adds to escape $, %, (, etc.
    for escaped, plain in (('\\$', '$'), ('\\%', '%'), ('\\(', '('), ('\\)', ')')):
        mmd = mmd.replace(escaped, plain)
    return mmd


def main():
    """Convert the PDF named on the command line and start a chat about it.

    Steps:
      1. Convert ``<name>.pdf`` to ``<name>.md`` through Mathpix, unless that
         file already exists.
      2. Write ``<name>.simple.md`` (simplified only when ``IS_SCIENTIFIC``
         is false), unless that file already exists.
      3. Start the interactive question-answering session on the result.

    NOTE(review): indentation was lost in the copy this was restored from;
    all three simplification steps are assumed to be gated on
    ``IS_SCIENTIFIC`` - confirm against the original.
    """
    if len(sys.argv) < 2:
        print(f"Usage: python {os.path.basename(sys.argv[0])} <input_pdf_path>")
        return

    input_pdf_path = sys.argv[1]
    # Swap only the final extension; str.replace would also rewrite '.pdf'
    # appearing elsewhere in the path and would leave e.g. '.PDF' unchanged.
    base_path, _ = os.path.splitext(input_pdf_path)
    output_mmd_path = base_path + '.md'
    output_simplemd_path = base_path + '.simple.md'

    if not os.path.exists(output_mmd_path):
        pdf_id = send_pdf_to_mathpix(input_pdf_path)
        if not (pdf_id and wait_for_processing(pdf_id)):
            # Without a converted file there is nothing to read below.
            print("Error: Mathpix conversion failed; aborting")
            return
        download_processed_file(pdf_id, 'mmd', output_mmd_path)

    if not os.path.exists(output_simplemd_path):
        with open(output_mmd_path, 'r') as mmd_file:
            mmd = mmd_file.read()
        if not IS_SCIENTIFIC:
            # There's too much LaTeX style escaping for most PDFs in my view, so remove some of it.
            # Keep it if the paper is a scientific paper.
            mmd = _simplify_mmd(mmd)
        with open(output_simplemd_path, 'w') as simplemd_file:
            simplemd_file.write(mmd)

    start_question_answering(output_simplemd_path)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment