Skip to content

Instantly share code, notes, and snippets.

@0187773933
Last active June 21, 2023 12:01
Show Gist options
  • Save 0187773933/766f7fb825dfc2f5acc39fd285841f5b to your computer and use it in GitHub Desktop.
Save 0187773933/766f7fb825dfc2f5acc39fd285841f5b to your computer and use it in GitHub Desktop.
Markdown + KaTeX + mhchem to JPEG Converter
#!/usr/bin/env python3
# brew install wkhtmltopdf
# pip install imgkit markdown nest_asyncio pyppeteer natsort Pillow numpy
import os
import imgkit
import markdown
import asyncio
import nest_asyncio
import time
from pyppeteer import launch
import sys
from pathlib import Path
from natsort import humansorted
from PIL import Image , ImageOps
import numpy as np
def run_sync( page_action ):
    """Run the awaitable *page_action* to completion on the current event loop.

    Returns whatever the awaitable resolves to, so async browser calls can be
    used from ordinary synchronous code.
    """
    event_loop = asyncio.get_event_loop()
    return event_loop.run_until_complete( page_action )
def write_text( file_path , text_lines_list ):
    """Write the strings in *text_lines_list* to *file_path* as UTF-8.

    The file is truncated first. No line separators are added; each item is
    written exactly as given.
    """
    with open( file_path , mode='w' , encoding='utf-8' ) as out_file:
        out_file.writelines( text_lines_list )
def read_text( file_path ):
    """Return the entire contents of *file_path* decoded as UTF-8.

    The encoding is given explicitly so the result does not depend on the
    platform's default locale, and so it matches what write_text() produces.
    """
    with open( file_path , encoding='utf-8' ) as f:
        return f.read()
def get_md_files_in_base_path( base_path ):
    """Collect every regular file with a ".md" suffix anywhere under *base_path*.

    *base_path* is searched recursively with glob( '**/*' ). The matches are
    returned in the order produced by humansorted().
    """
    markdown_files = [
        entry
        for entry in base_path.glob( '**/*' )
        if entry.is_file() and entry.suffix == ".md"
    ]
    return humansorted( markdown_files )
def get_padded_bbox( bbox , image_size , padding_percent=3 ):
    """Grow *bbox* by a percentage of its own size, clamped to the image.

    bbox            -- ( left , top , right , bottom )
    image_size      -- ( width , height ) used as the upper clamp
    padding_percent -- margin per side, as a percentage of the box's own
                       width (horizontally) and height (vertically)

    Returns the padded ( left , top , right , bottom ) tuple.
    """
    left , top , right , bottom = bbox[ 0 ] , bbox[ 1 ] , bbox[ 2 ] , bbox[ 3 ]
    limit_x , limit_y = image_size[ 0 ] , image_size[ 1 ]
    # Margins are truncated to whole pixels.
    pad_x = int( ( right - left ) * padding_percent / 100 )
    pad_y = int( ( bottom - top ) * padding_percent / 100 )
    return (
        max( 0 , left - pad_x ) ,
        max( 0 , top - pad_y ) ,
        min( limit_x , right + pad_x ) ,
        min( limit_y , bottom + pad_y ) ,
    )
def auto_crop_image( image_path ):
    """Crop the image at *image_path* to its non-white content plus padding.

    The image is converted to greyscale and inverted so that pure white
    becomes 0; getbbox() then reports the extent of everything that is not
    pure white. The padded crop overwrites the file in place.

    Returns the cropped image. If the picture is entirely white there is
    nothing to crop to: the file is left untouched and an unmodified copy of
    the image is returned.
    """
    # The context manager releases the file handle before the file is rewritten.
    with Image.open( image_path ) as image:
        inverted = ImageOps.invert( image.convert( "L" ) )
        bbox = inverted.getbbox()
        if bbox is None:
            # getbbox() returns None for an all-zero (i.e. all-white) image;
            # passing that on would fail when the bbox is indexed.
            return image.copy()
        cropped = image.crop( get_padded_bbox( bbox , image.size ) )
    # cropped.show()
    cropped.save( image_path )
    return cropped
def find_split_point(image, axis, search_range=1000):
    """Find a position near the middle of *image* where it can be cut cleanly.

    image        -- object exposing ``size`` as (width, height) and convertible
                    with ``np.array`` (the array is indexed [row, column], so
                    this assumes the usual PIL layout -- confirm for other types)
    axis         -- 0 to look for a blank pixel column (cut along the width),
                    1 to look for a blank pixel row (cut along the height)
    search_range -- how many pixels on either side of the midpoint to examine

    Returns the first index in the search window whose whole line of pixels
    equals 255, or the midpoint if the window has no such line.
    """
    # Convert the image to a numpy array
    image_array = np.array(image)
    length = image.size[axis]
    # Default split point is the middle of the image
    default_split_point = length // 2
    # Never start at index 0: a split there leaves an empty first part and an
    # unchanged second part, so the caller would make no progress.
    start = max(1, default_split_point - search_range)
    end = min(length, default_split_point + search_range)
    for i in range(start, end):
        # Column i when cutting along the width, row i when cutting along the height
        line = image_array[:, i] if axis == 0 else image_array[i, :]
        # A line consisting solely of 255 carries no content, so cutting here is safe
        if np.all(line == 255):
            return i
    # No completely blank line was found in the window
    return default_split_point
# def split_image(image, max_size=(8192, 4096), padding_percent=3):
# width, height = image.size
# images = []
# if width > max_size[0]:
# # Split the image vertically
# for i in range(0, width, max_size[0]):
# bbox = (i, 0, min(i + max_size[0], width), height)
# cropped = image.crop(bbox)
# images.append(cropped)
# elif height > max_size[1]:
# # Split the image horizontally
# for i in range(0, height, max_size[1]):
# bbox = (0, i, width, min(i + max_size[1], height))
# cropped = image.crop(bbox)
# images.append(cropped)
# else:
# # No need to split the image
# images.append(image)
# # # Add padding if possible
# # for i, img in enumerate(images):
# # bbox = img.getbbox()
# # padded_bbox = get_padded_bbox(bbox, img.size, padding_percent)
# # images[i] = img.crop(padded_bbox)
# return images
def split_image(image, max_size):
    """Recursively cut *image* until every piece fits within *max_size*.

    image    -- image object exposing ``size`` as (width, height) and ``crop``
    max_size -- (max_width, max_height)

    The width is handled first; once it fits, the height is handled. Each cut
    is placed where find_split_point() suggests. Returns the pieces as a list
    in left-to-right / top-to-bottom order; an image that already fits is
    returned as a one-element list.
    """
    # Determine the axis along which to split the image
    axis = 0 if image.size[0] > max_size[0] else 1
    length = image.size[axis]
    # Within the limit, or too small to be divided any further
    if length <= max_size[axis] or length < 2:
        return [image]
    split_point = find_split_point(image, axis)
    # A cut at 0 or at the far edge would hand the whole image back to the
    # recursion unchanged and never terminate, so fall back to the middle.
    if split_point is None or not 0 < split_point < length:
        split_point = length // 2
    width, height = image.size[0], image.size[1]
    if axis == 0:
        first_box = (0, 0, split_point, height)
        second_box = (split_point, 0, width, height)
    else:
        first_box = (0, 0, width, split_point)
        second_box = (0, split_point, width, height)
    # Recursively split the pieces if they are still too large
    return (
        split_image(image.crop(first_box), max_size)
        + split_image(image.crop(second_box), max_size)
    )
def miro_process_image( image_path , max_size=( 8190 , 4094 ) , padding_percent=3 ):
    """Replace an oversized image file with numbered parts that fit *max_size*.

    image_path      -- path of the image to check
    max_size        -- ( max_width , max_height ) allowed for a single image
    padding_percent -- currently unused; kept so existing callers keep working

    If the image fits within *max_size* nothing happens. Otherwise it is split
    with split_image(), each piece is saved next to the original as
    "<name> - Part <n><ext>", and the original file is deleted.
    """
    # Close the source file before it is removed below.
    with Image.open( image_path ) as image:
        if image.size[ 0 ] <= max_size[ 0 ] and image.size[ 1 ] <= max_size[ 1 ]:
            return
        parts = split_image( image , max_size )
        base , ext = os.path.splitext( image_path )
        for number , part in enumerate( parts , start=1 ):
            part.save( f"{base} - Part {number}{ext}" )
    os.remove( image_path )
def get_browser():
    """Launch a headless browser and return it.

    nest_asyncio is applied first, then the launch coroutine is driven to
    completion with run_sync().
    """
    nest_asyncio.apply()
    launch_options = { "headless": True }
    return run_sync( launch( launch_options ) )
def get_page( browser ):
    """Open a new page in *browser*, set its viewport and return the page."""
    new_page = run_sync( browser.newPage() )
    # Previously tried: { "width": 1280 , "height": 760 , "deviceScaleFactor": 3 }
    viewport = { "width": 1280 , "height": 1520 , "deviceScaleFactor": 3 }
    run_sync( new_page.setViewport( viewport ) )
    return new_page
def _trim_markdown_tail( md_text ):
    """Return *md_text* without trailing blank lines or a dangling final "#" line."""
    md_lines = md_text.splitlines()
    # splitlines() never yields "\n", so blank tail lines are detected by content.
    while md_lines and not md_lines[ -1 ].strip():
        md_lines.pop()
    # A last line starting with "#" has nothing after it, so it is dropped.
    if md_lines and md_lines[ -1 ].startswith( "#" ):
        md_lines.pop()
    return "\n".join( md_lines )

def _build_html_document( md_text ):
    """Return the HTML page that renders *md_text* with marked and KaTeX in the browser."""
    # The template is deliberately kept at column 0: {md_text} is substituted
    # verbatim and markdown treats leading indentation as significant.
    # NOTE(review): the CDN URLs below contain "[email protected]" where a
    # "package@version" specifier is expected. This looks like e-mail
    # obfuscation introduced when the file was copied from a web page; restore
    # the real package versions (they must match the integrity hashes) before
    # running.
    html = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>MD + Katex Render Test</title>
<script src="https://cdn.jsdelivr.net/npm/[email protected]/lib/marked.umd.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-3UiQGuEI4TTMaFmGIZumfRPtfKQ3trwQE2JgosJxCnGmQpL/lJdjpcHkaaFwHlcI" crossorigin="anonymous">
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-G0zcxDFp5LWZtDuRMnBkk3EphCK1lhEf4UEyEM693ka574TZGwo4IWwS6QLzM/2t" crossorigin="anonymous"></script>
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/mhchem.min.js" integrity="sha256-8MoD3xlLjD1gF/9FXbag75iFeQVmP6MRps3teIsVNAs=" crossorigin="anonymous"></script>
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous" ></script>
</head>
<body>
<!-- <div id="source">$H_2O$</div> -->
<div id="source">
{md_text}
</div>
<script type="text/javascript">
function render_md( element_id ) {{
let element = document.getElementById( element_id );
let text = replaceHTMLElementsInString( element.innerHTML );
let md_lines = marked.parse( text );
element.innerHTML = md_lines;
}}
function render_math( element_id ) {{
let element = document.getElementById( element_id );
// let text = element.innerHTML;
// let escaped_text = replaceInString( text );
// console.log( escaped_text );
// element.textContent = text;
console.log( element );
renderMathInElement( element , {{
strict: "ignore" ,
delimiters: [
{{ left: "$$" , right: "$$" , display: true }} ,
{{ left: "$" , right: "$" , display: false }}
],
throwOnError : false
}});
}}
function replaceInString(str) {{
str = str.replace(/<[\\/]?pre[^>]*>/gi, "");
str = str.replace(/<br\\s*[\\/]?[^>]*>/gi, "\\n");
str = str.replace(/<div[^>]*>/gi, "\\n");
// Thanks Graham A!
str = str.replace(/<[\\/]?span[^>]*>/gi, "")
str.replace(/<\\/div[^>]*>/g, "\\n");
return replaceHTMLElementsInString( str );
}}
function replaceHTMLElementsInString(str) {{
str = str.replace(/&nbsp;/gi, " ");
str = str.replace(/&tab;/gi, " ");
str = str.replace(/&gt;/gi, ">");
str = str.replace(/&lt;/gi, "<");
return str.replace(/&amp;/gi, "&");
}}
function sleep( ms ) {{ return new Promise( resolve => setTimeout( resolve , ms ) ); }}
function ready() {{
return new Promise( function( resolve , reject ) {{
try {{
if (document.readyState === 'loading') {{ // If document is still loading, wait for it to complete
document.addEventListener('DOMContentLoaded', resolve);
}} else {{ // If document is already loaded, resolve immediately
resolve();
}}
}}
catch( error ) {{ console.log( error ); reject( error ); return; }}
}});
}}
</script>
</body>
</html>
"""
    return html

def convert_md_file_to_jpeg( browser , md_file_path ):
    """Render one markdown file to a JPEG saved next to it.

    browser      -- browser object as returned by get_browser()
    md_file_path -- path object of the markdown file; the output is written
                    to the same directory as "<stem>.jpeg"

    The markdown is embedded in an HTML page, rendered in a fresh browser page
    (markdown first, then math), captured as a full-page screenshot, then
    passed to auto_crop_image() and miro_process_image().

    A file with no content left after trimming is skipped with a printed
    notice. Returns None.
    """
    output_path = md_file_path.parent.joinpath( f"{md_file_path.stem}.jpeg" )
    md_text = _trim_markdown_tail( read_text( str( md_file_path ) ) )
    if not md_text.strip():
        # An empty render has no content to crop to, so skip it.
        print( f"Skipping {md_file_path}: nothing to render" )
        return
    html = _build_html_document( md_text + "\n<br>" )
    page = get_page( browser )
    try:
        run_sync( page.setContent( html ) )
        time.sleep( 1 ) # probably good enough
        run_sync( page.evaluate( '''async () => { await ready(); render_md( "source" ); render_math( "source" ); window.scrollTo( 0 ,document.body.scrollHeight ); await sleep( 500 ); return true }''' ) )
        run_sync( page.screenshot( { 'path': str( output_path ) , "fullPage": True , "quality": 100 } ) )
    finally:
        # Close the page even if rendering failed, so pages do not pile up.
        run_sync( page.close() )
    auto_crop_image( str( output_path ) )
    miro_process_image( str( output_path ) )
    return
if __name__ == "__main__":
    # Usage: script.py [BASE_DIRECTORY]
    # Converts every markdown file under BASE_DIRECTORY to JPEG. With no
    # argument the directory below is used, as before.
    nest_asyncio.apply()
    # default_base_path = "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/General Chemistry"
    default_base_path = "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/Physics"
    base_path = Path( sys.argv[ 1 ] if len( sys.argv ) > 1 else default_base_path )
    md_files = get_md_files_in_base_path( base_path )
    total_md_files = len( md_files )
    browser = get_browser()
    try:
        for position , file in enumerate( md_files , start=1 ):
            print( f"[ { position } ] of {total_md_files}" )
            convert_md_file_to_jpeg( browser , file )
    finally:
        # Shut the browser down even when a conversion raises.
        run_sync( browser.close() )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment