Last active
June 21, 2023 12:01
-
-
Save 0187773933/766f7fb825dfc2f5acc39fd285841f5b to your computer and use it in GitHub Desktop.
Markdown + KaTeX + mhchem to JPEG Converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# brew install wkhtmltopdf | |
# pip install imgkit markdown | |
import os | |
import imgkit | |
import markdown | |
import asyncio | |
import nest_asyncio | |
import time | |
from pyppeteer import launch | |
import sys | |
from pathlib import Path | |
from natsort import humansorted | |
from PIL import Image , ImageOps | |
import numpy as np | |
def run_sync(page_action):
    """Run an awaitable to completion on the current event loop.

    Args:
        page_action: The coroutine / awaitable to drive synchronously.

    Returns:
        Whatever the awaitable resolves to.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # Raised when no current event loop is set (newer Python versions,
        # or any non-main thread).
        loop = None
    if loop is None or loop.is_closed():
        # Install a fresh loop so that later calls keep sharing one loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(page_action)
def write_text(file_path, text_lines_list):
    """Write the given strings to ``file_path`` as UTF-8, replacing any existing file.

    Args:
        file_path: Destination path.
        text_lines_list: Iterable of strings, written back to back with no
            separator added between them.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        for chunk in text_lines_list:
            handle.write(chunk)
def read_text(file_path):
    """Return the entire contents of ``file_path`` decoded as UTF-8.

    The encoding is given explicitly so that reading agrees with
    ``write_text`` and does not depend on the platform's default locale.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, encoding='utf-8') as f:
        return f.read()
def get_md_files_in_base_path(base_path):
    """Collect every ``.md`` file below ``base_path`` (recursively), human-sorted.

    Args:
        base_path: A ``pathlib.Path`` directory to search.

    Returns:
        The matching paths, ordered by ``natsort.humansorted``.
    """
    markdown_files = [
        entry
        for entry in base_path.glob('**/*')
        if entry.is_file() and entry.suffix == ".md"
    ]
    return humansorted(markdown_files)
def get_padded_bbox(bbox, image_size, padding_percent=3):
    """Grow a bounding box by a percentage of its own size, clamped to the image.

    Args:
        bbox: ``(left, top, right, bottom)`` box to enlarge.
        image_size: ``(width, height)`` of the image the box lives in.
        padding_percent: Padding added on each side, as a percentage of the
            box's width (horizontally) and height (vertically).

    Returns:
        The padded ``(left, top, right, bottom)`` tuple; it never extends
        below 0 or beyond ``image_size``.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    # Padding is truncated to whole pixels.
    pad_x = int((right - left) * padding_percent / 100)
    pad_y = int((bottom - top) * padding_percent / 100)
    padded_left = max(0, left - pad_x)
    padded_top = max(0, top - pad_y)
    padded_right = min(image_size[0], right + pad_x)
    padded_bottom = min(image_size[1], bottom + pad_y)
    return (padded_left, padded_top, padded_right, padded_bottom)
def auto_crop_image(image_path):
    """Crop the white margins off an image file and overwrite it in place.

    The image is converted to greyscale and inverted so that white becomes
    zero; ``getbbox()`` then yields the bounding box of everything that is
    not pure white. That box is padded via ``get_padded_bbox`` before
    cropping.

    Args:
        image_path: Path of the image to crop; the cropped result is saved
            back to this same path.

    Returns:
        The cropped image. If the image is entirely white there is nothing
        to crop: the file is left untouched and a copy of the image is
        returned.
    """
    # The context manager releases the file handle before the path is rewritten.
    with Image.open(image_path) as image:
        inverted = ImageOps.invert(image.convert("L"))
        bbox = inverted.getbbox()
        if bbox is None:
            # Completely blank image: getbbox() has no content to report.
            return image.copy()
        cropped = image.crop(get_padded_bbox(bbox, image.size))
    cropped.save(image_path)
    return cropped
def find_split_point(image, axis, search_range=1000):
    """Find a blank line near the middle of ``image`` at which to cut it.

    A line is "blank" when every value in it equals 255. Among the blank
    lines within ``search_range`` pixels of the midpoint, the one closest to
    the midpoint is chosen (the lower index wins a tie), which keeps the two
    resulting pieces as balanced as possible.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and
            convertible to a NumPy array (e.g. a PIL image).
        axis: 0 to look for a blank column (cut along the width), 1 to look
            for a blank row (cut along the height).
        search_range: How many pixels on either side of the midpoint to
            inspect.

    Returns:
        The index of the chosen blank line as an ``int``, or the midpoint if
        no blank line lies in the search window. Index 0 is never returned
        for a blank line, because cutting there would produce an empty piece.
    """
    pixels = np.asarray(image)
    length = image.size[axis]
    midpoint = length // 2

    # Start at 1 at the earliest: a cut at index 0 would not divide the image.
    start = max(1, midpoint - search_range)
    end = min(length, midpoint + search_range)
    if start >= end:
        return midpoint

    # image.size is (width, height) but the array is indexed [row, column],
    # so size-axis 0 (width) corresponds to array axis 1 and vice versa.
    if axis == 0:
        window = pixels[:, start:end]
        scan_axis = 1
    else:
        window = pixels[start:end]
        scan_axis = 0

    # Collapse every dimension except the scanned one (this also covers a
    # trailing colour-channel dimension when there is one).
    other_axes = tuple(dim for dim in range(window.ndim) if dim != scan_axis)
    blank_positions = np.flatnonzero(np.all(window == 255, axis=other_axes)) + start
    if blank_positions.size == 0:
        return midpoint

    nearest = blank_positions[np.argmin(np.abs(blank_positions - midpoint))]
    return int(nearest)
# def split_image(image, max_size=(8192, 4096), padding_percent=3): | |
# width, height = image.size | |
# images = [] | |
# if width > max_size[0]: | |
# # Split the image vertically | |
# for i in range(0, width, max_size[0]): | |
# bbox = (i, 0, min(i + max_size[0], width), height) | |
# cropped = image.crop(bbox) | |
# images.append(cropped) | |
# elif height > max_size[1]: | |
# # Split the image horizontally | |
# for i in range(0, height, max_size[1]): | |
# bbox = (0, i, width, min(i + max_size[1], height)) | |
# cropped = image.crop(bbox) | |
# images.append(cropped) | |
# else: | |
# # No need to split the image | |
# images.append(image) | |
# # # Add padding if possible | |
# # for i, img in enumerate(images): | |
# # bbox = img.getbbox() | |
# # padded_bbox = get_padded_bbox(bbox, img.size, padding_percent) | |
# # images[i] = img.crop(padded_bbox) | |
# return images | |
def split_image(image, max_size):
    """Recursively cut ``image`` into pieces that fit inside ``max_size``.

    The image is cut along its width when it is too wide, otherwise along
    its height. The cut position comes from ``find_split_point``; each half
    is then split again if it is still too large.

    Args:
        image: A PIL image (anything exposing ``size`` and ``crop``).
        max_size: ``(max_width, max_height)`` limit for every piece.

    Returns:
        A list of images in left-to-right / top-to-bottom order. An image
        that already fits is returned as a single-element list.
    """
    width, height = image.size[0], image.size[1]
    axis = 0 if width > max_size[0] else 1
    length = image.size[axis]

    # Nothing to do when the image already fits along the chosen axis.
    if length <= max_size[axis]:
        return [image]

    split_point = find_split_point(image, axis)
    # A cut at 0 or at the far edge would leave one piece identical to the
    # input and recurse forever, so fall back to the midpoint in that case.
    if split_point is None or not 0 < split_point < length:
        split_point = length // 2
    if not 0 < split_point < length:
        # Too small to be divided at all along this axis; stop here.
        return [image]

    if axis == 0:
        first_box = (0, 0, split_point, height)
        second_box = (split_point, 0, width, height)
    else:
        first_box = (0, 0, width, split_point)
        second_box = (0, split_point, width, height)

    # Each half may still exceed the limit on either axis, so recurse.
    return (
        split_image(image.crop(first_box), max_size)
        + split_image(image.crop(second_box), max_size)
    )
def miro_process_image(image_path, max_size=(8190, 4094), padding_percent=3):
    """Split an oversized image file into numbered parts that fit ``max_size``.

    When the image exceeds ``max_size`` in either dimension, it is divided
    with ``split_image``, each piece is saved next to the original as
    ``"<name> - Part <n><ext>"``, and the original file is deleted. An image
    that already fits is left untouched.

    Args:
        image_path: Path (string) of the image file to process.
        max_size: ``(max_width, max_height)`` allowed for each output file.
        padding_percent: Currently unused; retained so existing callers that
            pass it keep working.
    """
    # The handle is closed before the original file is removed.
    with Image.open(image_path) as image:
        fits = image.size[0] <= max_size[0] and image.size[1] <= max_size[1]
        if fits:
            return
        parts = split_image(image, max_size)
        base, ext = os.path.splitext(image_path)
        for part_number, part in enumerate(parts, start=1):
            part.save(f"{base} - Part {part_number}{ext}")
    os.remove(image_path)
def get_browser():
    """Launch a headless pyppeteer browser and return it.

    ``nest_asyncio.apply()`` is called first, before ``run_sync`` drives the
    launch coroutine.
    """
    nest_asyncio.apply()
    launch_options = {"headless": True}
    return run_sync(launch(launch_options))
def get_page(browser):
    """Open a new page in ``browser`` with the screenshot viewport applied.

    Args:
        browser: Browser object as returned by ``get_browser``.

    Returns:
        The new page, set to a 1280x1520 viewport with a device scale factor
        of 3.
    """
    # A 760px-high viewport was used previously; 1520 is the current setting.
    viewport = {"width": 1280, "height": 1520, "deviceScaleFactor": 3}
    new_page = run_sync(browser.newPage())
    run_sync(new_page.setViewport(viewport))
    return new_page
def _load_markdown_lines(md_file_path):
    """Read a markdown file and return its lines with the tail cleaned up.

    Trailing blank lines are removed, and if the last remaining line is a
    heading (starts with ``#``) that dangling heading is removed as well.
    """
    md_lines = read_text(str(md_file_path)).splitlines()
    # splitlines() strips the newline characters, so a trailing blank line
    # shows up as an empty (or whitespace-only) string, never as "\n".
    while md_lines and not md_lines[-1].strip():
        md_lines.pop()
    if md_lines and md_lines[-1].startswith("#"):
        md_lines.pop()
    return md_lines


def _build_render_html(md_text):
    """Return the HTML page that renders ``md_text`` with marked, KaTeX and mhchem.

    The markdown is embedded verbatim inside ``<div id="source">``; the
    page's own script reads it back via ``innerHTML``, decodes the HTML
    entities the browser introduced, and hands the text to ``marked``.
    """
    # NOTE(review): the "[email protected]" segments in the CDN URLs below look like
    # package@version specifiers that were obfuscated when this file was
    # copied from a web page. They are kept exactly as found; restore the
    # real versions (they must match the integrity hashes) before running.
    return f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>MD + Katex Render Test</title>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/lib/marked.umd.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-3UiQGuEI4TTMaFmGIZumfRPtfKQ3trwQE2JgosJxCnGmQpL/lJdjpcHkaaFwHlcI" crossorigin="anonymous">
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-G0zcxDFp5LWZtDuRMnBkk3EphCK1lhEf4UEyEM693ka574TZGwo4IWwS6QLzM/2t" crossorigin="anonymous"></script>
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/mhchem.min.js" integrity="sha256-8MoD3xlLjD1gF/9FXbag75iFeQVmP6MRps3teIsVNAs=" crossorigin="anonymous"></script>
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous" ></script>
</head>
<body>
<!-- <div id="source">$H_2O$</div> -->
<div id="source">
{md_text}
</div>
<script type="text/javascript">
function render_md( element_id ) {{
let element = document.getElementById( element_id );
let text = replaceHTMLElementsInString( element.innerHTML );
let md_lines = marked.parse( text );
element.innerHTML = md_lines;
}}
function render_math( element_id ) {{
let element = document.getElementById( element_id );
// let text = element.innerHTML;
// let escaped_text = replaceInString( text );
// console.log( escaped_text );
// element.textContent = text;
console.log( element );
renderMathInElement( element , {{
strict: "ignore" ,
delimiters: [
{{ left: "$$" , right: "$$" , display: true }} ,
{{ left: "$" , right: "$" , display: false }}
],
throwOnError : false
}});
}}
function replaceInString(str) {{
str = str.replace(/<[\\/]?pre[^>]*>/gi, "");
str = str.replace(/<br\\s*[\\/]?[^>]*>/gi, "\\n");
str = str.replace(/<div[^>]*>/gi, "\\n");
// Thanks Graham A!
str = str.replace(/<[\\/]?span[^>]*>/gi, "");
str = str.replace(/<\\/div[^>]*>/g, "\\n");
return replaceHTMLElementsInString( str );
}}
function replaceHTMLElementsInString(str) {{
str = str.replace(/&nbsp;/gi, " ");
str = str.replace(/&tab;/gi, " ");
str = str.replace(/&gt;/gi, ">");
str = str.replace(/&lt;/gi, "<");
return str.replace(/&amp;/gi, "&");
}}
function sleep( ms ) {{ return new Promise( resolve => setTimeout( resolve , ms ) ); }}
function ready() {{
return new Promise( function( resolve , reject ) {{
try {{
if (document.readyState === 'loading') {{ // If document is still loading, wait for it to complete
document.addEventListener('DOMContentLoaded', resolve);
}} else {{ // If document is already loaded, resolve immediately
resolve();
}}
}}
catch( error ) {{ console.log( error ); reject( error ); return; }}
}});
}}
</script>
</body>
</html>
"""


def convert_md_file_to_jpeg(browser, md_file_path):
    """Render one markdown file to a JPEG saved next to it.

    The markdown (minus trailing blank lines and a dangling final heading)
    is embedded in an HTML page, rendered in a fresh browser page,
    screenshotted to ``<stem>.jpeg`` in the same directory, then auto-cropped
    and, if oversized, split into parts by ``miro_process_image``.

    Args:
        browser: Browser object as returned by ``get_browser``.
        md_file_path: ``pathlib.Path`` of the markdown file.

    Returns:
        None. A file with nothing left to render is skipped without
        producing any output.
    """
    output_path = md_file_path.parent.joinpath(f"{md_file_path.stem}.jpeg")

    md_lines = _load_markdown_lines(md_file_path)
    if not md_lines:
        # Empty (or heading-only) note: there is nothing to screenshot.
        return

    md_text = "\n".join(md_lines) + "\n<br>"
    html = _build_render_html(md_text)

    page = get_page(browser)
    try:
        run_sync(page.setContent(html))
        time.sleep(1)  # probably good enough
        run_sync(page.evaluate('''async () => { await ready(); render_md( "source" ); render_math( "source" ); window.scrollTo( 0 ,document.body.scrollHeight ); await sleep( 500 ); return true }'''))
        run_sync(page.screenshot({'path': str(output_path), "fullPage": True, "quality": 100}))
    finally:
        # Always release the page, even when rendering fails part-way.
        run_sync(page.close())

    auto_crop_image(str(output_path))
    miro_process_image(str(output_path))
    return
if __name__ == "__main__":
    # Convert every markdown note under a directory to JPEG.
    # Usage: script.py [NOTES_DIRECTORY]
    # Without an argument the default directory below is used; pass another
    # one (e.g. ".../MCAT/CLEANED/General Chemistry") to convert that instead.
    nest_asyncio.apply()
    default_base_path = "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/Physics"
    base_path = Path(sys.argv[1] if len(sys.argv) > 1 else default_base_path)
    md_files = get_md_files_in_base_path(base_path)
    total_md_files = len(md_files)
    browser = get_browser()
    try:
        for index, md_file in enumerate(md_files):
            print(f"[ { index + 1 } ] of {total_md_files}")
            convert_md_file_to_jpeg(browser, md_file)
    finally:
        # Shut the browser down even if one of the conversions raises.
        run_sync(browser.close())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment