Created
March 11, 2012 21:15
-
-
Save fkuehnel/2018232 to your computer and use it in GitHub Desktop.
Tagging Math Expressions in PDF Documents with LuaLaTeX
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Element tree library, http://etree.luaforge.net/
-- (it is a bit buggy for the {decl = false} option).
-- The module is bound to the local name `etree` because that is the name the
-- code further down uses (etree.ElementTree, etree.StringBuffer); binding it
-- under a different, unused alias made those calls depend on `require`
-- happening to create a global of the same name.
local etree = require "etree"
-- nodelist visualization (optional), https://gist.github.com/556247
-- local viz = require "viznodelist"
--- Translate a math node list into a MathML element table.
-- Placeholder only: the actual conversion has not been written, so every
-- call yields the same stub element regardless of its argument.
-- @param head first node of the math list (currently ignored)
-- @return a fresh table whose `tag` field is "not implemented"
function convertToMathML(head)
  local placeholder = {}
  placeholder.tag = "not implemented"
  return placeholder
end
-- Create content MathML for every math formula.
--
-- Registered on the 'mlist_to_hlist' callback: for each math list the result
-- of convertToMathML(head) is wrapped in a <math> element, serialised with
-- etree, and stored in a pdf_annot whatsit that is inserted in front of the
-- math list. The (possibly extended) list is then handed to LuaTeX's own
-- node.mlist_to_hlist so that normal math typesetting still happens.
luatexbase.add_to_callback('mlist_to_hlist',
  function(head, display, penalty)
    texio.write_nl('NEW mathlist')
    -- `local` keeps these values from leaking into the global environment.
    local result = convertToMathML(head)
    if result ~= nil then
      local tree = etree.ElementTree({tag = "math", result}, {decl = false})
      local buffer = etree.StringBuffer()
      tree:write(buffer)
      -- The markup ends up inside a PDF literal string "( ... )". Backslashes
      -- and parentheses are special there, so each one is prefixed with a
      -- backslash; otherwise such a character in the MathML would end or
      -- corrupt the string. The extra parentheses around gsub drop its
      -- second return value (the substitution count).
      local contents = (string.gsub(tostring(buffer), '[\\()]', '\\%0'))
      local pdf = node.new("whatsit", "pdf_annot")
      -- The '/Subtype /MathML' prefix is what the bounding-box pass later in
      -- this file looks for when it searches for these annotations.
      pdf.data = '/Subtype /MathML /Contents (' .. contents .. ')'
      head = node.insert_before(head, head, pdf)
    end
    return node.mlist_to_hlist(head, display, penalty)
  end,
  "content MathML generator")
-- add content MathML as a PDF annotation
local whatsit = node.id('whatsit')
local hlist = node.id('hlist')
local vlist = node.id('vlist')
local math_node = node.id('math')
-- Numeric whatsit subtype of a PDF annotation, looked up by name instead of
-- being hard-coded. NOTE(review): the number may differ between LuaTeX
-- releases; 15 is the value this code used before and is kept as a fallback
-- in case the lookup yields nothing.
local pdf_annot = node.subtype('pdf_annot') or 15
-- Start of the annotation data that marks an annotation as one of ours.
local mathml_prefix = '/Subtype /MathML'

--- Give every MathML annotation in a node list the size of its formula.
-- Walks the list starting at `head`, descending into vlist and hlist nodes.
-- A pdf_annot whatsit whose data starts with '/Subtype /MathML' gets its
-- width/height/depth set from `hbox`. If the whatsit directly follows a math
-- node with subtype 0, `hbox` is first replaced by the dimensions measured
-- from that math node up to the next math node with subtype 1 (presumably
-- math-on / math-off -- confirm against the LuaTeX manual).
-- @param head first node of the list to scan (may be nil)
-- @param hbox table with `width`, `height` and `depth` used as the size when
--   no better measurement is available
local function add_size_to_annot(head, hbox)
  while head do
    local typ = head.id
    if typ == vlist then
      add_size_to_annot(head.head, hbox)
    elseif typ == hlist then
      -- Inside a horizontal box the box itself is the default size.
      add_size_to_annot(head.head,
        {width = head.width, height = head.height, depth = head.depth})
    elseif typ == whatsit and head.subtype == pdf_annot
        and string.sub(head.data, 1, #mathml_prefix) == mathml_prefix then
      local before = head.prev
      if before ~= nil and before.id == math_node and before.subtype == 0 then
        -- Find the math node (subtype 1) that closes this formula; if there
        -- is none, measure up to the annotation itself.
        local tail = head
        for test_node in node.traverse_id(math_node, head.next) do
          if test_node.subtype == 1 then
            tail = test_node
            break
          end
        end
        local w, h, d = node.dimensions(before, tail)
        -- Deliberately overwrites the parameter: the measured size stays in
        -- effect for the remaining nodes of this list, as it did before.
        hbox = {width = w, height = h, depth = d}
      end
      --texio.write_nl(string.format("add height %gpt, width %gpt, depth %gpt",hbox.height / 2^16, hbox.width / 2^16, hbox.depth / 2^16))
      head.width = hbox.width
      head.height = hbox.height
      head.depth = hbox.depth
    else
      -- texio.write_nl('found node '..node.type(head.id))
    end
    head = head.next
  end
end
-- Incremented once per filter run, starting at 1; only the (disabled)
-- visualizer call below reads it, to build its output file name.
local vpack_counter = 1

--- 'pre_output_filter' handler: size all MathML annotations in the list.
-- Annotations for which no enclosing box or formula is found get a zero
-- size. The list itself is returned unchanged.
-- @param page head of the node list passed in by the callback
-- @return the same head
local function find_math_bounding_box(page)
  local empty_box = {width = 0, height = 0, depth = 0}
  add_size_to_annot(page, empty_box)
  -- viz.nodelist_visualize(page, "vpack"..vpack_counter..".gv")
  vpack_counter = vpack_counter + 1
  return page
end

luatexbase.add_to_callback('pre_output_filter', find_math_bounding_box,
  "find math bounding box")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Test document for mathmltagger.lua; compile with LuaLaTeX.
\pdfcompresslevel=0 % to make everything visible in the pdf
\documentclass{article}
\usepackage{amssymb}
\usepackage{luacode}
% Load the tagger before any math is typeset so its callbacks are registered.
\directlua{dofile("mathmltagger.lua")}
%% a collection of Hans Hagen's MathML examples, and some additions
%% Each display below is one test formula; the %% comments name the
%% Content MathML elements the following formulas are meant to exercise.
\begin{document}
$$b \equiv b$$
$$\sqrt[1+n]{64}$$
$$1 + x \over 1 - x$$
$$-1$$
$$a+b+c$$
$$x \ge 4$$
$$a b$$
$$ x\in\mathbb{N}$$
$$ 1A2C_{16} + 0101_{16} = 1B2D_{16}$$
$$ 2+5i\in\mathbb{C}$$
%% eq, neq, gt, lt, geq, leq
$$ a\le b\le c$$
%% equivalent, approx, implies
$$ a+b \equiv b+a $$
$$ 3.14159 \approx \pi $$
%% minus, plus
$$37 -x$$
$$-37$$
$$-x+37$$
$$a+x$$
%% times
$$3p$$
%% divide
$$1-{1 \over 3}+{1\over 5}-{1\over 7}+\ldots = \frac{\pi}{4}$$
$${-b - \sqrt{a} \over (b-b) -\sqrt{a}}$$
%%$${-b - -b - \sqrt{a} \over (b-b)- -b -\sqrt{a}}$$
%% power
$$x^2 + \sin^2 x$$
%% root, degree
$$\sqrt[3]{64} = 4$$
%% sin, cos, tan, cot, csc, sec, ..
$$\sin(x+y)=\sin x \cos y + \cos x \sin y$$
$$\cos\pi = -1$$
%% log, ln, exp
$$\ln(e+2)\approx 1.55$$
$$e^2=7.3890560989307$$ %% is false!
%% quotient, rem
$$ \lfloor a/b \rfloor $$
%% factorial
$$ n! = n\times(n-1)\times(n-2)\times\cdots\times 1$$
%% min, max, gcd, lcm
$$z=\min\left\{(x+y),2x,{1\over y}\right\}$$
%% and, or, xor, not
$$1001_2 0101_2=0001$$
%% set, bvar
%% NOTE(review): the next two formulas look truncated (nothing after \neq,
%% nothing after the second <) -- confirm whether that is intentional.
$$ \left\{1,4,8\right\}\neq$$
$$ \left\{x | 2<x<\right\}$$
%% list
$$[1,1,3]$$
$$D_{1,1,3}f$$
%% union, intersect, ...
$$U\cup V$$
$$U\cap V$$
$$v\in V$$
$$u\notin V$$
%% interval
$$(a,b]$$
$$(a,x)$$
%% inverse
$$ \sin^{-1}x$$
%% sum, product, limit, lowlimit, uplimit, bvar
$$ \sum_{i=1}^{n} {1 \over x} $$
$$ \prod_{i} {1 \over x}$$
$$ \prod_{x\in\mathbb{R}}f(x)$$
$$ \lim_{x\rightarrow 0}\sin x$$
%% int, diff, partialdiff, bvar, degree
$${d \left(\int_p^q f(x,a)dx \right) \over da}$$
$$f^\prime$$
$${d^2f(x) \over dx^2}$$
$${d^4f \over x df^2}$$
$${d^kf(x,y) \over x df(x,y)^m}$$
$${d^{m+n}f(x,y) \over x df(x,y)^m}$$
%% fn
\end{document}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This example demonstrates how Lua(La)TeX can be used to create math expression annotations in PDF documents. Here, the purpose is to tag math expressions with a bounding box. Obviously, it would be quite valuable to annotate the (La)TeX math expressions with the proper Content MathML; however, this would go far beyond the simple program snippets presented here.
My own experience is that for simple LaTeX math formulas it is quite easy to generate the proper Content MathML equivalents. However, the approach of using context-free grammar parsers (e.g. lpeg) doesn't apply well to the breadth of LaTeX documents, in which the meaning of math expressions is rather context-sensitive!