Last active
August 29, 2015 14:10
-
-
Save oncomouse/2e3aa2c538c39e7bddca to your computer and use it in GitHub Desktop.
Ruby script to process LaTeX files for conversion to .docx using Pandoc as a transport.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
# => ruby pandoc.rb -i <file_name>(.tex) (options for pandoc)
# => minimum options for pandoc: --bibliography=<bibliography_file.bib> --csl=<mla or chicago>.csl
# Requires the BibTeX parser gem; install it by running 'gem install bibtex-ruby' (may need to prepend sudo)
require 'bibtex'

# Bibliography opened when no --bibliography=<file> option is given.
default_bibliography = "./transhumanism.bib" # change this to whatever

# Pull the "-i <file>" pair out of ARGV; whatever remains is forwarded to pandoc.
input_flag = ARGV.find_index("-i")
file = input_flag && ARGV[input_flag + 1]
raise "No Input File Provided" if file.nil?

ARGV.slice!(input_flag, 2)

# Pick up --bibliography=<file> so the script reads the same .bib file that
# pandoc will. The option is left in ARGV because pandoc needs it as well.
options = {}
ARGV.each do |argument|
  bibliography_option = argument.match(/\A--bibliography=(.+)\z/)
  options["bibliography"] = bibliography_option[1] if bibliography_option
end

bibliography = BibTeX.open(options.fetch("bibliography", default_bibliography))

# Accept the input name with or without its ".tex" extension.
if file.end_with?(".tex")
  options["file_name_base"] = file.sub(/\.tex\z/, "")
else
  options["file_name_base"] = file
  file = "#{file}.tex"
end

raise "File #{file} Not Found" unless File.exist?(file)

# Remaining command-line arguments, passed through to pandoc verbatim.
pandoc_options = ARGV.join(" ")
# Accumulates the fully expanded LaTeX source.
new_latex_content = ""
# Label key => { "number" => ..., "name" => ... }, filled in by scan_in_file.
$labels = {}
# Number of the chapter currently being scanned; read by scan_in_file.
$chapter = 1
# Recursively scan in a file for labels
#
# Reads "<file_name>.tex", replaces every nested \input{...} command with the
# scanned contents of that file, records each \label that directly follows a
# \command{...} line in the global $labels hash (removing the \label from the
# text), and returns the text with +indent+ inserted after every newline.
#
# file_name - path of the file to include, without the ".tex" extension.
#             Relative paths are resolved against the current directory;
#             absolute paths are used as given.
# indent    - string (the tabs that preceded the include command) used to
#             indent the included text.
#
# Reads $chapter for the chapter part of each label number; figure and section
# counters restart at 1 for every scanned file.
#
# Returns the indented text, or "" (after printing a notice) when the file
# does not exist.
def scan_in_file(file_name, indent)
  path = "#{file_name}.tex"
  unless File.exist?(path)
    puts "Could not find #{file_name}.tex"
    return ""
  end

  # Nothing else guarantees the label table exists before the first include.
  $labels ||= {}
  include_file = File.read(path)
  figure = 1
  section = 1

  # Pull in other included files
  include_file.gsub!(/^(\t*)\\input\{([^}]+)\}/) do
    scan_in_file(Regexp.last_match(2), Regexp.last_match(1))
  end

  # Collect label information. The label key stops at the first "}" so that
  # trailing text on the same line (e.g. a later \ref{...}) is not swallowed.
  include_file.gsub!(/\t*\\([^{]*)\{(.+)\}\n\s*\\label\{([^}]*)\}/) do |match|
    command, title, label = Regexp.last_match.captures
    entry = {}
    case command
    when "caption"
      entry["number"] = "#{$chapter}.#{figure}"
      figure += 1
    when "section"
      entry["number"] = "#{$chapter}.#{section}"
      section += 1
    else
      entry["number"] = $chapter.to_s
    end
    entry["name"] = title
    $labels[label] = entry
    # Keep the labelled command, drop the \label itself.
    match.sub("\\label{#{label}}", "")
  end

  "\n#{indent}#{include_file}".gsub(/\n/, "\n#{indent}")
end
# Read the main document line by line, replacing each \ChapterInput{...} or
# \input{...} line with the scanned contents of the referenced file.
main_tex = "#{options["file_name_base"]}.tex"
# Detect if the file we're processing is not in the same directory as the parser
main_dir = File.dirname(main_tex)
file_base_dir = main_dir == "." ? "" : "#{main_dir}/"
include_pattern = /^(\t*)\\(?:ChapterInput|input)\{([^}]+)\}/

expanded_lines = File.foreach(main_tex).map do |line|
  include_match = include_pattern.match(line)
  next line unless include_match

  expanded = scan_in_file(file_base_dir + include_match[2], include_match[1])
  # Every top-level include advances the chapter counter used for label numbers.
  $chapter += 1
  expanded
end
# Join once instead of growing the string with += on every line.
new_latex_content += expanded_lines.join
# Correctly insert label text: \ref{key} becomes the recorded label number and
# \nameref{key} the recorded label name; unknown keys get a visible placeholder.
# Fall back to an empty table in case $labels was never initialised.
known_labels = $labels || {}
new_latex_content.gsub!(/\\(name)?ref\{([^}]+)\}/) do
  wants_name = Regexp.last_match(1) == "name"
  label = known_labels[Regexp.last_match(2)]
  if label.nil?
    "LABEL NOT FOUND"
  elsif wants_name
    label["name"]
  else
    label["number"]
  end
end
#puts new_latex_content
# pandoc chokes on the starred list environments (enumerate*, itemize*), so
# drop the asterisk and keep only the plain environment name.
new_latex_content.gsub!(/(enumerate|itemize)\*/) { Regexp.last_match(1) }
# Look up titles: replace \citetitle{key} with the title of the matching
# bibliography entry, emphasised for books and in LaTeX quotes otherwise.
new_latex_content.gsub!(/\\citetitle\{([^}]+)\}/) do |citation|
  key = Regexp.last_match(1)
  entry = bibliography[key]
  if entry.nil?
    # Unknown key: report it and leave the command untouched rather than crash.
    warn "No bibliography entry found for \\citetitle{#{key}}"
    citation
  elsif entry.type == :book
    "\\emph{#{entry.title}}"
  else
    "``#{entry.title}''"
  end
end
# Fix wonky citation format for pandoc-citeproc: turn the bare page numbers in
# \cite[12, 15]{key} (also \citep and \autocite) into "p. 12, p. 15".
# The key capture stops at the first "}" so that two citations written back to
# back are each rewritten instead of being matched as one.
new_latex_content.gsub!(/(citep|autocite|cite)\[([0-9, ]+)\]\{([^}\s]+)\}/) do
  command, pages, cite_key = Regexp.last_match.captures
  # Splitting a single page number yields a one-element list, so no special case.
  locator = pages.split(/, */).map { |page| "p. #{page}" }.join(", ")
  "#{command}[#{locator}]{#{cite_key}}"
end
# Provide explicit file names for figures: when \includegraphics names a file
# without an extension, append the first of png/eps/pdf/jpg that exists on
# disk. The name is checked as written, so relative names resolve against the
# current directory. Only tab-indented \includegraphics lines are matched.
new_latex_content.gsub!(/(\t+)\\includegraphics(.*)\{([^}]+)\}/) do
  indent, graphic_options, file_name = Regexp.last_match.captures
  extension = %w[png eps pdf jpg].find { |candidate| File.exist?("#{file_name}.#{candidate}") }
  file_name = "#{file_name}.#{extension}" if extension
  "#{indent}\\includegraphics#{graphic_options}{#{file_name}}"
end
# Fix epigraphs: rewrite a tab-indented \epigraph{text}{source} as a centered,
# emphasised block of the form "text \linebreak[4] --- source".
new_latex_content.gsub!(/(\t+)\\epigraph\{(.+)\}\{(.+)\}/) do
  indent, quotation, source = Regexp.last_match.captures
  [
    "#{indent}\\begin{center}",
    "#{indent}\t\\emph{#{quotation} \\linebreak[4] --- #{source}}",
    "#{indent}\\end{center}"
  ].join("\n")
end
# Write the preprocessed LaTeX to an intermediate file, convert it to .docx
# with pandoc, then remove the intermediate file.
pandoc_input = "#{options["file_name_base"]}-pandoc.tex"
docx_output = "#{options["file_name_base"]}.docx"

File.open(pandoc_input, "w") do |pandoc|
  pandoc.puts new_latex_content
end

# system (not exec) so control returns here and the cleanup below actually runs.
# Arguments are passed as a list, bypassing the shell, so file names containing
# spaces or shell metacharacters are safe. At this point ARGV holds only the
# pass-through pandoc options, because "-i <file>" was removed earlier.
# NOTE(review): --smart was removed in pandoc 2.0; confirm the installed
# version, and drop the flag if pandoc rejects it.
converted = system("pandoc", "-s", "--smart", pandoc_input, "-o", docx_output, *ARGV)
#exec "pandoc -s -f latex -t markdown+citations #{options["file_name_base"]}-pandoc.tex -o #{options["file_name_base"]}.md #{pandoc_options}"
File.delete(pandoc_input) if File.exist?(pandoc_input)
abort "pandoc failed to convert #{pandoc_input}" unless converted
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment