0187773933 · June 21, 2023 12:01
diff --git a/MDToJPEG.py b/MDToJPEG.py
 #!/usr/bin/env python3
 # brew install wkhtmltopdf
 # pip install imgkit markdown nest_asyncio pyppeteer natsort Pillow numpy
 import os
 import imgkit
 import markdown
 import asyncio
 import nest_asyncio
 import time
 from pyppeteer import launch
 import sys
 from pathlib import Path
 from natsort import humansorted
 from PIL import Image , ImageOps
 import numpy as np

 def run_sync( page_action ):
     """Run an awaitable to completion and return its result.

     The current event loop is used when one exists.  When none is set (newer
     Python versions raise ``RuntimeError`` instead of creating one implicitly)
     or the current one has been closed, a new loop is created and installed as
     the current loop so that later calls keep using the same one.

     Args:
         page_action: Coroutine or other awaitable to drive to completion.

     Returns:
         Whatever the awaitable evaluates to.
     """
     try:
         loop = asyncio.get_event_loop()
     except RuntimeError:
         loop = None
     if loop is None or loop.is_closed():
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop( loop )
     return loop.run_until_complete( page_action )

 def write_text( file_path , text_lines_list ):
     """Overwrite ``file_path`` with the strings in ``text_lines_list``.

     The file is opened in text mode as UTF-8; the strings are written back to
     back, so any line terminators must already be part of them.
     """
     with open( file_path , mode='w' , encoding='utf-8' ) as out_file:
         out_file.writelines( text_lines_list )

 def read_text( file_path ):
     """Return the whole content of ``file_path`` as a string.

     The file is decoded as UTF-8, matching ``write_text`` and the
     ``<meta charset="utf-8">`` of the generated HTML, instead of depending on
     the platform's default encoding.
     """
     with open( file_path , encoding='utf-8' ) as f:
         return f.read()

 def get_md_files_in_base_path( base_path ):
     """Collect the Markdown files found anywhere below ``base_path``.

     Every entry matched by the recursive ``**/*`` glob is kept when it is a
     regular file whose suffix is exactly ``.md``; the result is ordered with
     ``humansorted``.
     """
     markdown_files = [
         entry
         for entry in base_path.glob( '**/*' )
         if entry.is_file() and entry.suffix == ".md"
     ]
     return humansorted( markdown_files )

 def get_padded_bbox( bbox , image_size , padding_percent=3 ):
     """Grow a bounding box by a percentage of its own size, clamped to the image.

     Args:
         bbox: ``(left, top, right, bottom)`` box to enlarge.
         image_size: ``(width, height)`` limit the padded box must stay inside.
         padding_percent: Padding per side, as a percentage of the box's width
             (horizontally) and height (vertically); truncated to whole pixels.

     Returns:
         The padded ``(left, top, right, bottom)`` tuple.
     """
     left , top , right , bottom = bbox[ 0 ] , bbox[ 1 ] , bbox[ 2 ] , bbox[ 3 ]
     pad_x = int( ( right - left ) * padding_percent / 100 )
     pad_y = int( ( bottom - top ) * padding_percent / 100 )
     return (
         max( 0 , left - pad_x ) ,
         max( 0 , top - pad_y ) ,
         min( image_size[ 0 ] , right + pad_x ) ,
         min( image_size[ 1 ] , bottom + pad_y ) ,
     )

 def auto_crop_image( image_path ):
     """Crop the image at ``image_path`` to its non-white content and save it in place.

     The image is converted to greyscale and inverted so that pure white becomes
     zero; the bounding box of the remaining non-zero pixels, padded via
     ``get_padded_bbox``, is the crop region.

     Args:
         image_path: Path of the image file; it is overwritten with the crop.

     Returns:
         The cropped image.  When the image is entirely white there is nothing
         to crop to, so the file is left untouched and an unmodified copy of the
         image is returned.
     """
     with Image.open( image_path ) as image:
         greyscale = ImageOps.invert( image.convert( "L" ) )
         bbox = greyscale.getbbox()
         if bbox is None:
             # getbbox() reports None for an all-zero image; indexing it would raise.
             return image.copy()
         cropped = image.crop( get_padded_bbox( bbox , image.size ) )
     # Saved after the source handle is closed because the same path is overwritten.
     cropped.save( image_path )
     return cropped

 def find_split_point(image, axis, search_range=1000):
     """Find a position near the middle of ``image`` where it can be cut cleanly.

     A position qualifies when every value on that line equals 255.  Positions
     are examined in increasing order inside a window of ``search_range`` pixels
     on either side of the midpoint, and the first qualifying one wins.

     Args:
         image: Object with a ``size`` of ``(width, height)`` that converts to a
             NumPy array laid out as ``(rows, columns[, channels])``.
         axis: ``0`` to look for a blank column (cut along the width), ``1`` to
             look for a blank row (cut along the height).
         search_range: Half-width of the search window, in pixels.

     Returns:
         The split position as an ``int``; the midpoint when no blank line lies
         inside the window.  Position 0 is never returned, because cutting there
         would leave one side empty and the other identical to the input.
     """
     pixels = np.asarray(image)
     extent = image.size[axis]
     default_split_point = extent // 2

     start = max(1, default_split_point - search_range)
     end = min(extent, default_split_point + search_range)
     if start >= end:
         return default_split_point

     # Columns live on array axis 1 and rows on array axis 0.
     window = pixels[:, start:end] if axis == 0 else pixels[start:end]
     is_white = window == 255
     if is_white.ndim > 2:
         # Collapse the channel axes: a pixel counts only if all channels are 255.
         is_white = is_white.all(axis=tuple(range(2, is_white.ndim)))
     blank_lines = is_white.all(axis=0 if axis == 0 else 1)

     hits = np.flatnonzero(blank_lines)
     if hits.size:
         return start + int(hits[0])
     return default_split_point

 # def split_image(image, max_size=(8192, 4096), padding_percent=3):
 # 	width, height = image.size
 # 	images = []

 # 	if width > max_size[0]:
 # 		# Split the image vertically
 # 		for i in range(0, width, max_size[0]):
 # 			bbox = (i, 0, min(i + max_size[0], width), height)
 # 			cropped = image.crop(bbox)
 # 			images.append(cropped)
 # 	elif height > max_size[1]:
 # 		# Split the image horizontally
 # 		for i in range(0, height, max_size[1]):
 # 			bbox = (0, i, width, min(i + max_size[1], height))
 # 			cropped = image.crop(bbox)
 # 			images.append(cropped)
 # 	else:
 # 		# No need to split the image
 # 		images.append(image)

 # 	# # Add padding if possible
 # 	# for i, img in enumerate(images):
 # 	# 	bbox = img.getbbox()
 # 	# 	padded_bbox = get_padded_bbox(bbox, img.size, padding_percent)
 # 	# 	images[i] = img.crop(padded_bbox)

 # 	return images

 def split_image(image, max_size):
     """Recursively cut ``image`` into pieces that fit inside ``max_size``.

     The width is handled first: while it exceeds ``max_size[0]`` the image is
     cut along the width, otherwise along the height.  Each cut is placed at the
     position chosen by ``find_split_point`` and both halves are processed again.

     Args:
         image: Image offering ``size`` and ``crop``.
         max_size: ``(max_width, max_height)`` limit for every returned piece.

     Returns:
         List of images in left-to-right / top-to-bottom order; ``[image]``
         itself when it already fits.
     """
     axis = 0 if image.size[0] > max_size[0] else 1
     extent = image.size[axis]

     # Nothing to do when it fits; a 1-pixel extent cannot be divided any further.
     if extent <= max_size[axis] or extent < 2:
         return [image]

     split_point = find_split_point(image, axis)

     # A cut at either edge would return the input unchanged as one of the
     # halves and recurse forever, so fall back to the exact middle.
     if split_point is None or not 0 < split_point < extent:
         split_point = extent // 2

     width, height = image.size
     if axis == 0:
         first_box = (0, 0, split_point, height)
         second_box = (split_point, 0, width, height)
     else:
         first_box = (0, 0, width, split_point)
         second_box = (0, split_point, width, height)

     first_half = image.crop(first_box)
     second_half = image.crop(second_box)
     return split_image(first_half, max_size) + split_image(second_half, max_size)

 def miro_process_image( image_path , max_size=( 8190 , 4094 ) , padding_percent=3 ):
     """Replace an oversized image file with numbered parts that fit ``max_size``.

     When the image exceeds ``max_size`` in either dimension it is divided with
     ``split_image``; the pieces are saved next to the original as
     ``"<name> - Part <n><ext>"`` (numbered from 1) and the original file is
     deleted.  An image that already fits is left alone.

     Args:
         image_path: Path of the image file to check.
         max_size: ``(max_width, max_height)`` allowed for a single file.
         padding_percent: Accepted for interface compatibility; not used.
     """
     base , ext = os.path.splitext( image_path )
     with Image.open( image_path ) as image:
         if image.size[ 0 ] <= max_size[ 0 ] and image.size[ 1 ] <= max_size[ 1 ]:
             return
         for i , img in enumerate( split_image( image , max_size ) ):
             img.save( f"{base} - Part {i+1}{ext}" )
     # Delete only after the source handle has been released.
     os.remove( image_path )


 def get_browser():
     """Start a headless browser through pyppeteer's ``launch`` and return it.

     ``nest_asyncio.apply()`` is called first, then the launch coroutine is
     driven to completion with ``run_sync``.
     """
     nest_asyncio.apply()
     launch_options = { "headless": True }
     return run_sync( launch( launch_options ) )

 def get_page( browser ):
     """Open a new page on ``browser``, set its viewport and return it.

     The viewport is 1280 x 1520 with a device scale factor of 3 (an earlier
     revision used a height of 760).
     """
     viewport = { "width": 1280 , "height": 1520 , "deviceScaleFactor": 3 }
     new_page = run_sync( browser.newPage() )
     run_sync( new_page.setViewport( viewport ) )
     return new_page

 def convert_md_file_to_jpeg( browser , md_file_path ):
 	"""Render one Markdown file to ``<stem>.jpeg`` in the same directory.

 	The Markdown text is embedded in an HTML page that loads marked and KaTeX
 	from a CDN, the page is rendered in a new browser page, a full-page
 	screenshot is written, and the result is passed through
 	``auto_crop_image`` and ``miro_process_image``.

 	Trailing blank lines and a heading sitting on the very last line are left
 	out of the render.  A file with nothing left after that is skipped.

 	Args:
 		browser: Browser object as returned by ``get_browser``.
 		md_file_path: ``pathlib.Path`` of the Markdown file.
 	"""
 	output_path = md_file_path.parent.joinpath( f"{md_file_path.stem}.jpeg" )
 	md_lines = read_text( str( md_file_path ) ).splitlines()
 	# splitlines() never yields "\n", so trailing blank lines are detected by content.
 	while md_lines and not md_lines[ -1 ].strip():
 		md_lines.pop()
 	# A heading on the last line has no body below it.
 	if md_lines and md_lines[ -1 ].startswith( "#" ):
 		md_lines.pop()
 	if not md_lines:
 		# Empty input: indexing the last line used to raise IndexError here.
 		return
 	md_text = "\n".join( md_lines )
 	md_text += "\n<br>"
 	# NOTE(review): the CDN URLs below contain the literal text "[email protected]",
 	# which looks like an e-mail-obfuscation artifact in place of "<package>@<version>".
 	# Confirm and restore the real versions; the template is otherwise kept verbatim.
 	html = f"""
 <!DOCTYPE html>
 <html>
 <head>
 	<meta charset="utf-8">
 	<meta name="viewport" content="width=device-width, initial-scale=1">
 	<title>MD + Katex Render Test</title>
 	<script src="https://cdn.jsdelivr.net/npm/[email protected]/lib/marked.umd.min.js"></script>
 	<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-3UiQGuEI4TTMaFmGIZumfRPtfKQ3trwQE2JgosJxCnGmQpL/lJdjpcHkaaFwHlcI" crossorigin="anonymous">
 	<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-G0zcxDFp5LWZtDuRMnBkk3EphCK1lhEf4UEyEM693ka574TZGwo4IWwS6QLzM/2t" crossorigin="anonymous"></script>
 	<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/mhchem.min.js" integrity="sha256-8MoD3xlLjD1gF/9FXbag75iFeQVmP6MRps3teIsVNAs=" crossorigin="anonymous"></script>
 	<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous" ></script>
 </head>
 <body>

 	<!-- <div id="source">$H_2O$</div> -->
 	<div id="source">
 {md_text}
 	</div>

 	<script type="text/javascript">
 		function render_md( element_id ) {{
 			let element = document.getElementById( element_id );
 			let text = replaceHTMLElementsInString( element.innerHTML );
 			let md_lines = marked.parse( text );
 			element.innerHTML = md_lines;
 		}}
 		function render_math( element_id ) {{
 			let element = document.getElementById( element_id );
 			// let text = element.innerHTML;
 			// let escaped_text = replaceInString( text );
 			// console.log( escaped_text );
 			// element.textContent = text;

 			console.log( element );
 			renderMathInElement( element , {{
 				strict: "ignore" ,
 				delimiters:  [
 					{{ left: "$$" , right: "$$" , display: true }} ,
 					{{ left: "$" , right: "$" , display: false }}
 				],
 				throwOnError : false
 			}});

 		}}
 		function replaceInString(str) {{
 			str = str.replace(/<[\\/]?pre[^>]*>/gi, "");
 			str = str.replace(/<br\\s*[\\/]?[^>]*>/gi, "\\n");
 			str = str.replace(/<div[^>]*>/gi, "\\n");
 			// Thanks Graham A!
 			str = str.replace(/<[\\/]?span[^>]*>/gi, "")
 			str.replace(/<\\/div[^>]*>/g, "\\n");
 			return replaceHTMLElementsInString( str );
 		}}

 		function replaceHTMLElementsInString(str) {{
 			str = str.replace(/&nbsp;/gi, " ");
 			str = str.replace(/&tab;/gi, "	");
 			str = str.replace(/&gt;/gi, ">");
 			str = str.replace(/&lt;/gi, "<");
 			return str.replace(/&amp;/gi, "&");
 		}}

 		function sleep( ms ) {{ return new Promise( resolve => setTimeout( resolve , ms ) ); }}

 		function ready() {{
 			return new Promise( function( resolve , reject ) {{
 				try {{
 					if (document.readyState === 'loading') {{  // If document is still loading, wait for it to complete
 						document.addEventListener('DOMContentLoaded', resolve);
 					}} else {{  // If document is already loaded, resolve immediately
 						resolve();
 					}}
 				}}
 				catch( error ) {{ console.log( error ); reject( error ); return; }}
 			}});
 		}}
 	</script>
 </body>
 </html>
 	"""

 	page = get_page( browser )
 	try:
 		run_sync( page.setContent( html ) )
 		# Fixed pause before touching the page; presumably covers loading of the CDN scripts.
 		time.sleep( 1 ) # probably good enough
 		run_sync( page.evaluate( '''async () => { await ready(); render_md( "source" ); render_math( "source" ); window.scrollTo( 0 ,document.body.scrollHeight ); await sleep( 500 ); return true }''' ) )
 		run_sync( page.screenshot( { 'path': str( output_path ) , "fullPage": True , "quality": 100 } ) )
 	finally:
 		# Close the page even when rendering fails so pages do not pile up in the browser.
 		run_sync( page.close() )
 	auto_crop_image( str( output_path ) )
 	miro_process_image( str( output_path ) )
 	return

 if __name__ == "__main__":
 	# Converts every Markdown file below a base directory.  The directory may be
 	# given as the first command-line argument; without one the path below is used.
 	nest_asyncio.apply()

 	# md_files = get_md_files_in_base_path( Path( "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/General Chemistry" ) )
 	default_base_path = "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/Physics"
 	base_path = Path( sys.argv[ 1 ] if len( sys.argv ) > 1 else default_base_path )
 	md_files = get_md_files_in_base_path( base_path )
 	total_md_files = len( md_files )
 	browser = get_browser()
 	try:
 		for index , file in enumerate( md_files ):
 			print( f"[ { index + 1 } ] of {total_md_files}" )
 			convert_md_file_to_jpeg( browser , file )
 	finally:
 		# Shut the browser down even when one of the conversions raises.
 		run_sync( browser.close() )
	#!/usr/bin/env python3
	# brew install wkhtmltopdf
	# pip install imgkit markdown
	import os
	import imgkit
	import markdown
	import asyncio
	import nest_asyncio
	import time
	from pyppeteer import launch
	import sys
	from pathlib import Path
	from natsort import humansorted
	from PIL import Image , ImageOps
	import numpy as np

	def run_sync( page_action ):
	    """Run an awaitable to completion and return its result.

	    The current event loop is used when one exists.  When none is set (newer
	    Python versions raise ``RuntimeError`` instead of creating one implicitly)
	    or the current one has been closed, a new loop is created and installed as
	    the current loop so that later calls keep using the same one.
	    """
	    try:
	        loop = asyncio.get_event_loop()
	    except RuntimeError:
	        loop = None
	    if loop is None or loop.is_closed():
	        loop = asyncio.new_event_loop()
	        asyncio.set_event_loop( loop )
	    return loop.run_until_complete( page_action )

	def write_text( file_path , text_lines_list ):
	    """Overwrite ``file_path`` with the strings in ``text_lines_list``.

	    The file is opened in text mode as UTF-8; the strings are written back to
	    back, so any line terminators must already be part of them.
	    """
	    # Nesting restored: the ``with`` body and the function body had lost their indentation.
	    with open( file_path , 'w', encoding='utf-8' ) as f:
	        f.writelines( text_lines_list )

	def read_text( file_path ):
	    """Return the whole content of ``file_path`` as a string.

	    The file is decoded as UTF-8, matching ``write_text``, instead of
	    depending on the platform's default encoding.
	    """
	    with open( file_path , encoding='utf-8' ) as f:
	        return f.read()

	def get_md_files_in_base_path( base_path ):
	    """Collect the Markdown files found anywhere below ``base_path``.

	    Every entry matched by the recursive ``**/*`` glob is kept when it is a
	    regular file whose suffix is exactly ``.md``; the result is ordered with
	    ``humansorted``.
	    """
	    # The pattern must be recursive: '*/' only matches directories one level
	    # down, which the is_file() filter then discards entirely.
	    markdown_files = [
	        entry
	        for entry in base_path.glob( '**/*' )
	        if entry.is_file() and entry.suffix == ".md"
	    ]
	    return humansorted( markdown_files )

	def get_padded_bbox( bbox , image_size , padding_percent=3 ):
	    """Grow a bounding box by a percentage of its own size, clamped to the image.

	    Args:
	        bbox: ``(left, top, right, bottom)`` box to enlarge.
	        image_size: ``(width, height)`` limit the padded box must stay inside.
	        padding_percent: Padding per side, as a percentage of the box's width
	            (horizontally) and height (vertically); truncated to whole pixels.

	    Returns:
	        The padded ``(left, top, right, bottom)`` tuple.
	    """
	    width = ( bbox[ 2 ] - bbox[ 0 ] )
	    height = ( bbox[ 3 ] - bbox[ 1 ] )
	    x_padding = int( width * padding_percent / 100 )
	    y_padding = int( height * padding_percent / 100 )
	    padded_bbox = (
	        max( 0 , bbox[ 0 ] - x_padding ),
	        max( 0 , bbox[ 1 ] - y_padding ),
	        min( image_size[ 0 ] , bbox[ 2 ] + x_padding ) ,
	        min( image_size[ 1 ] , bbox[ 3 ] + y_padding )
	    )
	    return padded_bbox

	def auto_crop_image( image_path ):
	    """Crop the image at ``image_path`` to its non-white content and save it in place.

	    The image is converted to greyscale and inverted so that pure white becomes
	    zero; the bounding box of the remaining non-zero pixels, padded via
	    ``get_padded_bbox``, is the crop region.

	    Returns:
	        The cropped image.  When the image is entirely white the file is left
	        untouched and an unmodified copy of the image is returned.
	    """
	    with Image.open( image_path ) as image:
	        greyscale = ImageOps.invert( image.convert( "L" ) )
	        bbox = greyscale.getbbox()
	        if bbox is None:
	            # getbbox() reports None for an all-zero image; indexing it would raise.
	            return image.copy()
	        cropped = image.crop( get_padded_bbox( bbox , image.size ) )
	    # Saved after the source handle is closed because the same path is overwritten.
	    cropped.save( image_path )
	    return cropped

	def find_split_point(image, axis, search_range=1000):
	    """Find a position near the middle of ``image`` where it can be cut cleanly.

	    A position qualifies when every value on that line equals 255.  Positions
	    are examined in increasing order inside a window of ``search_range`` pixels
	    on either side of the midpoint, and the first qualifying one wins.

	    Args:
	        image: Object with a ``size`` of ``(width, height)`` that converts to a
	            NumPy array laid out as ``(rows, columns[, channels])``.
	        axis: ``0`` to look for a blank column, ``1`` to look for a blank row.
	        search_range: Half-width of the search window, in pixels.

	    Returns:
	        The split position as an ``int``; the midpoint when no blank line lies
	        inside the window.  Position 0 is never returned, because cutting there
	        would leave one side empty and the other identical to the input.
	    """
	    pixels = np.asarray(image)
	    extent = image.size[axis]
	    default_split_point = extent // 2

	    start = max(1, default_split_point - search_range)
	    end = min(extent, default_split_point + search_range)
	    if start >= end:
	        return default_split_point

	    # Columns live on array axis 1 and rows on array axis 0.
	    window = pixels[:, start:end] if axis == 0 else pixels[start:end]
	    is_white = window == 255
	    if is_white.ndim > 2:
	        # Collapse the channel axes: a pixel counts only if all channels are 255.
	        is_white = is_white.all(axis=tuple(range(2, is_white.ndim)))
	    blank_lines = is_white.all(axis=0 if axis == 0 else 1)

	    hits = np.flatnonzero(blank_lines)
	    if hits.size:
	        return start + int(hits[0])
	    return default_split_point

	# def split_image(image, max_size=(8192, 4096), padding_percent=3):
	# width, height = image.size
	# images = []

	# if width > max_size[0]:
	# # Split the image vertically
	# for i in range(0, width, max_size[0]):
	# bbox = (i, 0, min(i + max_size[0], width), height)
	# cropped = image.crop(bbox)
	# images.append(cropped)
	# elif height > max_size[1]:
	# # Split the image horizontally
	# for i in range(0, height, max_size[1]):
	# bbox = (0, i, width, min(i + max_size[1], height))
	# cropped = image.crop(bbox)
	# images.append(cropped)
	# else:
	# # No need to split the image
	# images.append(image)

	# # # Add padding if possible
	# # for i, img in enumerate(images):
	# # bbox = img.getbbox()
	# # padded_bbox = get_padded_bbox(bbox, img.size, padding_percent)
	# # images[i] = img.crop(padded_bbox)

	# return images

	def split_image(image, max_size):
	    """Recursively cut ``image`` into pieces that fit inside ``max_size``.

	    The width is handled first: while it exceeds ``max_size[0]`` the image is
	    cut along the width, otherwise along the height.  Each cut is placed at the
	    position chosen by ``find_split_point`` and both halves are processed again.

	    Args:
	        image: Image offering ``size`` and ``crop``.
	        max_size: ``(max_width, max_height)`` limit for every returned piece.

	    Returns:
	        List of images in left-to-right / top-to-bottom order; ``[image]``
	        itself when it already fits.
	    """
	    axis = 0 if image.size[0] > max_size[0] else 1
	    extent = image.size[axis]

	    # Nothing to do when it fits; a 1-pixel extent cannot be divided any further.
	    if extent <= max_size[axis] or extent < 2:
	        return [image]

	    split_point = find_split_point(image, axis)

	    # A cut at either edge would return the input unchanged as one of the
	    # halves and recurse forever, so fall back to the exact middle.
	    if split_point is None or not 0 < split_point < extent:
	        split_point = extent // 2

	    width, height = image.size
	    if axis == 0:
	        first_box = (0, 0, split_point, height)
	        second_box = (split_point, 0, width, height)
	    else:
	        first_box = (0, 0, width, split_point)
	        second_box = (0, split_point, width, height)

	    first_half = image.crop(first_box)
	    second_half = image.crop(second_box)
	    return split_image(first_half, max_size) + split_image(second_half, max_size)

	def miro_process_image( image_path , max_size=( 8190 , 4094 ) , padding_percent=3 ):
	    """Replace an oversized image file with numbered parts that fit ``max_size``.

	    When the image exceeds ``max_size`` in either dimension it is divided with
	    ``split_image``; the pieces are saved next to the original as
	    ``"<name> - Part <n><ext>"`` (numbered from 1) and the original file is
	    deleted.  An image that already fits is left alone.

	    Args:
	        image_path: Path of the image file to check.
	        max_size: ``(max_width, max_height)`` allowed for a single file.
	        padding_percent: Accepted for interface compatibility; not used.
	    """
	    base , ext = os.path.splitext( image_path )
	    with Image.open( image_path ) as image:
	        if image.size[ 0 ] <= max_size[ 0 ] and image.size[ 1 ] <= max_size[ 1 ]:
	            return
	        for i , img in enumerate( split_image( image , max_size ) ):
	            img.save( f"{base} - Part {i+1}{ext}" )
	    # Delete only after the source handle has been released.
	    os.remove( image_path )


	def get_browser():
	    """Start a headless browser through pyppeteer's ``launch`` and return it.

	    ``nest_asyncio.apply()`` is called first, then the launch coroutine is
	    driven to completion with ``run_sync``.
	    """
	    # Function body re-indented; it had been flattened to the ``def`` level.
	    nest_asyncio.apply()
	    browser = run_sync( launch( { "headless": True } ) )
	    return browser

	def get_page( browser ):
	    """Open a new page on ``browser``, set its viewport and return it.

	    The viewport is 1280 x 1520 with a device scale factor of 3.
	    """
	    # Function body re-indented; it had been flattened to the ``def`` level.
	    page = run_sync( browser.newPage() )
	    # run_sync( page.setViewport( { "width": 1280 , "height": 760 , "deviceScaleFactor": 3 } ) )
	    run_sync( page.setViewport( { "width": 1280 , "height": 1520 , "deviceScaleFactor": 3 } ) )
	    return page

	def convert_md_file_to_jpeg( browser , md_file_path ):
	    """Render one Markdown file to ``<stem>.jpeg`` in the same directory.

	    The Markdown text is embedded in an HTML page that loads marked and KaTeX
	    from a CDN, the page is rendered in a new browser page, a full-page
	    screenshot is written, and the result is passed through
	    ``auto_crop_image`` and ``miro_process_image``.

	    Trailing blank lines and a heading sitting on the very last line are left
	    out of the render.  A file with nothing left after that is skipped.

	    Args:
	        browser: Browser object as returned by ``get_browser``.
	        md_file_path: ``pathlib.Path`` of the Markdown file.
	    """
	    output_path = md_file_path.parent.joinpath( f"{md_file_path.stem}.jpeg" )
	    md_lines = read_text( str( md_file_path ) ).splitlines()
	    # splitlines() never yields "\n", so trailing blank lines are detected by content.
	    while md_lines and not md_lines[ -1 ].strip():
	        md_lines.pop()
	    # A heading on the last line has no body below it.
	    if md_lines and md_lines[ -1 ].startswith( "#" ):
	        md_lines.pop()
	    if not md_lines:
	        # Empty input: indexing the last line used to raise IndexError here.
	        return
	    md_text = "\n".join( md_lines )
	    md_text += "\n<br>"
	    # The Markdown is spliced in with its own surrounding newlines so that its
	    # first line never inherits leading whitespace from the template (a leading
	    # tab would be read as an indented code block).
	    source_markup = "\n" + md_text + "\n"
	    # NOTE(review): the CDN URLs below contain the literal text "[email protected]",
	    # which looks like an e-mail-obfuscation artifact in place of "<package>@<version>".
	    # Confirm and restore the real versions.
	    html = f"""
	<!DOCTYPE html>
	<html>
	<head>
		<meta charset="utf-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">
		<title>MD + Katex Render Test</title>
		<script src="https://cdn.jsdelivr.net/npm/[email protected]/lib/marked.umd.min.js"></script>
		<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-3UiQGuEI4TTMaFmGIZumfRPtfKQ3trwQE2JgosJxCnGmQpL/lJdjpcHkaaFwHlcI" crossorigin="anonymous">
		<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-G0zcxDFp5LWZtDuRMnBkk3EphCK1lhEf4UEyEM693ka574TZGwo4IWwS6QLzM/2t" crossorigin="anonymous"></script>
		<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/mhchem.min.js" integrity="sha256-8MoD3xlLjD1gF/9FXbag75iFeQVmP6MRps3teIsVNAs=" crossorigin="anonymous"></script>
		<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous" ></script>
	</head>
	<body>

		<!-- <div id="source">$H_2O$</div> -->
		<div id="source">{source_markup}</div>

		<script type="text/javascript">
			function render_md( element_id ) {{
				let element = document.getElementById( element_id );
				let text = replaceHTMLElementsInString( element.innerHTML );
				let md_lines = marked.parse( text );
				element.innerHTML = md_lines;
			}}
			function render_math( element_id ) {{
				let element = document.getElementById( element_id );
				// let text = element.innerHTML;
				// let escaped_text = replaceInString( text );
				// console.log( escaped_text );
				// element.textContent = text;

				console.log( element );
				renderMathInElement( element , {{
					strict: "ignore" ,
					delimiters: [
						{{ left: "$$" , right: "$$" , display: true }} ,
						{{ left: "$" , right: "$" , display: false }}
					],
					throwOnError : false
				}});

			}}
			function replaceInString(str) {{
				str = str.replace(/<[\\/]?pre[^>]*>/gi, "");
				str = str.replace(/<br\\s*[\\/]?[^>]*>/gi, "\\n");
				str = str.replace(/<div[^>]*>/gi, "\\n");
				// Thanks Graham A!
				str = str.replace(/<[\\/]?span[^>]*>/gi, "")
				str.replace(/<\\/div[^>]*>/g, "\\n");
				return replaceHTMLElementsInString( str );
			}}

			function replaceHTMLElementsInString(str) {{
				str = str.replace(/&nbsp;/gi, " ");
				str = str.replace(/&tab;/gi, "	");
				str = str.replace(/&gt;/gi, ">");
				str = str.replace(/&lt;/gi, "<");
				return str.replace(/&amp;/gi, "&");
			}}

			function sleep( ms ) {{ return new Promise( resolve => setTimeout( resolve , ms ) ); }}

			function ready() {{
				return new Promise( function( resolve , reject ) {{
					try {{
						if (document.readyState === 'loading') {{ // If document is still loading, wait for it to complete
							document.addEventListener('DOMContentLoaded', resolve);
						}} else {{ // If document is already loaded, resolve immediately
							resolve();
						}}
					}}
					catch( error ) {{ console.log( error ); reject( error ); return; }}
				}});
			}}
		</script>
	</body>
	</html>
	"""

	    page = get_page( browser )
	    try:
	        run_sync( page.setContent( html ) )
	        # Fixed pause before touching the page; presumably covers loading of the CDN scripts.
	        time.sleep( 1 ) # probably good enough
	        run_sync( page.evaluate( '''async () => { await ready(); render_md( "source" ); render_math( "source" ); window.scrollTo( 0 ,document.body.scrollHeight ); await sleep( 500 ); return true }''' ) )
	        run_sync( page.screenshot( { 'path': str( output_path ) , "fullPage": True , "quality": 100 } ) )
	    finally:
	        # Close the page even when rendering fails so pages do not pile up in the browser.
	        run_sync( page.close() )
	    auto_crop_image( str( output_path ) )
	    miro_process_image( str( output_path ) )
	    return

	if __name__ == "__main__":
	    # Converts every Markdown file below a base directory.  The directory may be
	    # given as the first command-line argument; without one the path below is used.
	    nest_asyncio.apply()

	    # md_files = get_md_files_in_base_path( Path( "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/General Chemistry" ) )
	    default_base_path = "/Users/morpheous/Library/CloudStorage/Dropbox/Notes/MCAT/CLEANED/Physics"
	    base_path = Path( sys.argv[ 1 ] if len( sys.argv ) > 1 else default_base_path )
	    md_files = get_md_files_in_base_path( base_path )
	    total_md_files = len( md_files )
	    browser = get_browser()
	    try:
	        for index , file in enumerate( md_files ):
	            print( f"[ { index + 1 } ] of {total_md_files}" )
	            convert_md_file_to_jpeg( browser , file )
	    finally:
	        # Shut the browser down even when one of the conversions raises.
	        run_sync( browser.close() )