Skip to content

Instantly share code, notes, and snippets.

@mgymrek
Last active July 6, 2016 03:13
Show Gist options
  • Save mgymrek/8212635 to your computer and use it in GitHub Desktop.
Save mgymrek/8212635 to your computer and use it in GitHub Desktop.
A handy tool for automatically compiling figures and tables for a manuscript directly from an IPython notebook.
def usage():
    """Print the command-line help text for CompileFiguresTables.py to stdout."""
    # The marker names quoted in this text must stay in sync with the patterns
    # ParseNB searches for: "# FIGURE:" / "# DISPLAY:" in code cells and
    # "### LEGEND: ... ###" (upper case) in markdown cells.
    print("""
Usage: python CompileFiguresTables.py --figlist <FILE> --nb <FILE>[,<FILE>,<file...] --out <STRING>
This script compiles figures and tables with legends for a paper from an ipython notebook.
Main text figures are compiled to A4 sized PDFs, with a specified layout,
giving "A", "B", "C", etc. Figure legends and tables written to a .docx file.
Supplemental figures and tables are compiled to a .docx file, with one
figure/legend per page. Use specified layout for multiple panels.
Arguments:
--figlist: file with list of figures. JSON format with. Best explained by
the example given in example_fig_list.json. Briefly, it has:
MainText
Figures
Tables
Supplemental
Figures
Tables
Figures and Tables are lists of figure and table objects.
Figure format:
{
"FigureName": "name",
"FigureTitle": "title",
"SubFigures": [
"fig1",
"fig2",
"fig3",
...
],
"Layout": "<layout>"
}
"SubFigureName" and "Table" is given in the Ipython notebook file:
Cells with code for figures/tables have a comment "# FIGURE: <$SubFigureName|$Table>"
(a comment "# DISPLAY: <$SubFigureName|$Table>" is accepted as well).
Figure cells should add to a pyplot axis called "ax".
Table cells should output a pandas dataframe.
To have an empty grid space, specify the empty string for the SubFigureName.
If a figure is huge when written to PDF, use $SubFigureName:png to make the
plot body displayed in png rather than pdf.
Cells with legends are in markdown format and have a title "### LEGEND: <$SubFigureName|$Table> ###".
If no legend is given, "No legend" is used for figures and the empty string for tables
Layout is a format string giving grid: Examples:
A single figure: (1)
2x2 grid: (1,2),(3,4)
2x2 grid, first figure takes up whole top row: (1,1),(2,3)
3x1 grid: (1),(2),(3)
--nb: ipython notebook file. Can give comma separated list of files to compile multiple notebooks.
If using multiple notebook files, make sure variables are unique between them since code will be
loaded for all of them at once.
--out: output prefix. Write:
<out>.<FigureName>.pdf for each main text figure
<out>.maintext_legends_and_tables.docx: for main text figure legends
<out>.supplemental_figures_and_tables.docx: for supplemental figures and legends
<out>_supp_pdfs: directory pdfs for each supp figure
-h, --help: print this message
-v, --verbose: print helpful status messages
NOTES:
1. This runs by running all cells without "FIGURE" in them first, then producing all the figures.
Code needs to be able to run accordingly.
2. Assume 1 plt.Axes per figure, named "ax".
3. Currently doesn't allow magic functions
e.g.
python CompileFiguresTables.py \
--nb small-test.ipynb \
--out test \
--figlist example_fig_list.json
Wishlist:
deal with magics
set font and table size/styles for docx outputs
""")
import matplotlib
matplotlib.use('Agg') # don't break if not in X forwarding
from docx import *
import getopt
import itertools
import json
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import os
import pandas as pd
import PyPDF2
import random
import re
import sys
import time
######## utils ###########
def MakeTwoDigits(num):
    """Return num as a string, prefixed with "0" when num is smaller than 10."""
    return "0%s" % num if num < 10 else str(num)
def GetTime():
    """Return the current local time as "month/day/year:HH:MM".

    Hour and minute go through MakeTwoDigits; month, day and year are
    written as plain integers.
    """
    now = time.localtime()
    fields = (
        now.tm_mon,
        now.tm_mday,
        now.tm_year,
        MakeTwoDigits(now.tm_hour),
        MakeTwoDigits(now.tm_min),
    )
    return "%s/%s/%s:%s:%s" % fields
def LOG(scriptName, message, fname=None):
    """Write a timestamped status line to stderr and, optionally, to a file.

    Inputs:
    scriptName (string): tag placed in square brackets at the start of the line
    message (string): text of the message
    fname (string): if given (and non-empty), the line is also appended to this file
    """
    msg = "[%s] %s %s\n" % (scriptName, GetTime(), message)
    if fname:
        # The context manager closes the handle even if the write fails.
        with open(fname, "a") as logfile:
            logfile.write(msg)
    sys.stderr.write(msg)
def CheckFileExists(fname):
    """Log an error and exit with status 1 unless the path fname exists."""
    if os.path.exists(fname):
        return
    LOG(sname, "File or directory %s does not exist" % fname)
    sys.exit(1)
########################
# Script name, used as the tag of every LOG line.
sname = "CompileFiguresTables.py"
# Panel labels indexed by 0-based panel number; only ten labels are available.
NumberToLetter = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
# Page size in inches (width, height). These are A4 dimensions despite the name.
LETTERSIZE = (8.27, 11.69)

try:
    opts, args = getopt.getopt(sys.argv[1:], "hv", ["help", "verbose", "figlist=", "nb=", "out="])
except getopt.GetoptError as err:
    print(str(err))
    usage()
    sys.exit(2)

# From here on "args" holds just the option names that were supplied.
args = [item[0] for item in opts]

# Serve an explicit help request before checking for required options, so
# that "-h" on its own prints the usage and exits successfully.
if "-h" in args or "--help" in args:
    usage()
    sys.exit(0)
if ("--figlist" not in args) or ("--nb" not in args) or ("--out" not in args):
    usage()
    sys.exit(2)

# initialize variables
VERBOSE = False
FIGLIST_FILE = ""
NB_FILES = ""
OUT_PREFIX = ""
params = []

# set variables
for o, a in opts:
    params.append("%s=%s" % (o.strip("-"), a))
    if o == "--figlist":
        FIGLIST_FILE = a
        CheckFileExists(FIGLIST_FILE)
    elif o == "--out":
        OUT_PREFIX = a
    elif o == "--nb":
        # comma separated list of notebook paths
        NB_FILES = a.split(",")
        for item in NB_FILES:
            CheckFileExists(item)
    elif o in ("-v", "--verbose"):
        # The previous test (o == "-v" or "--verbose") was always true, which
        # switched verbose mode on for every option.
        VERBOSE = True
########################################
# functions
def ParseNB(nbfile):
    """
    Split a notebook into figure/table code, legends and supporting code.

    A code cell containing a line "# FIGURE: <name>" or "# DISPLAY: <name>"
    is stored under <name>; every other code cell is supporting code. A
    markdown cell containing a line "### LEGEND: <name> ###" provides the
    legend for <name>: all of its lines that do not contain "LEGEND", joined.

    Inputs:
    nbfile (string): path to ipython notebook. Both the nbformat 3 layout
        (cells under "worksheets", code in "input") and the nbformat 4 layout
        (top-level "cells", code in "source") are accepted.
    Return:
    FigureToCode (dict:string->[string]): SubfigureName or Table ->code lines
    FigureToLegend (dict:string->string): SubfigureName or Table->legend
    SupportingCode [[string]]: list of code for each cell that is not a figure/table
    """
    def _as_lines(source):
        # A cell body may be stored as one multi-line string or as a list of
        # lines; normalise to a list of lines that keep their line endings.
        if isinstance(source, list):
            return source
        return source.splitlines(True)

    FigureToCode = {}
    FigureToLegend = {}
    SupportingCode = []
    with open(nbfile, "r") as handle:
        nb = json.load(handle)
    if "worksheets" in nb:
        cells = nb["worksheets"][0]["cells"]
    else:
        cells = nb["cells"]
    for cell in cells:
        cell_type = cell["cell_type"]
        if cell_type == "code":
            raw = cell["input"] if "input" in cell else cell["source"]
            textlines = _as_lines(raw)
            figname = None
            # If several markers are present the last one wins.
            for item in textlines:
                if re.match(r"#\s?FIGURE: .*", item):
                    figname = item.split("FIGURE:")[1].strip()
                elif re.match(r"#\s?DISPLAY: .*", item):
                    figname = item.split("DISPLAY:")[1].strip()
            if figname:
                FigureToCode[figname] = textlines
            else:
                SupportingCode.append(textlines)
        elif cell_type == "markdown":
            textlines = _as_lines(cell["source"])
            figname = None
            for item in textlines:
                # Trailing whitespace is optional so that a title on the last
                # line of a cell (no newline) is recognised too.
                if re.match(r"### LEGEND: .* ###\s*$", item):
                    figname = item.split("LEGEND:")[1].split("###")[0].strip()
            if figname:
                legend_lines = [line for line in textlines if "LEGEND" not in line]
                FigureToLegend[figname] = "".join(legend_lines)
    return FigureToCode, FigureToLegend, SupportingCode
def GetAllFigureNames(figlist):
    """
    Collect every sub-figure name referenced by the figure list.

    Input:
    figlist (pandas.DataFrame returned by pandas.read_json)
    Return:
    [string]: SubFigureNames of all main text figures followed by those of
    all supplemental figures, each cut at its first ":" (drops ":png")
    """
    names = []
    for section in (figlist.MainText, figlist.Supplemental):
        for figure in section["Figures"]:
            names.extend(figure["SubFigures"])
    return [name.split(":")[0] for name in names]
def GetAllTableNames(figlist):
    """
    Collect the "Table" key of every table entry in the figure list.

    Input:
    figlist (pandas.DataFrame returned by pandas.read_json)
    Return:
    [string]: main text tables first, then supplemental tables
    """
    names = []
    for section in (figlist.MainText, figlist.Supplemental):
        names.extend([entry["Table"] for entry in section["Tables"]])
    return names
def ScaleToAxis(tick_positions, old_axis, new_axis):
    """
    Map tick positions linearly from one axis range onto another
    (used when a png is displayed with imshow).

    Input:
    tick_positions (np.array or list) from old axis
    old_axis: (min,max) of old axis
    new_axis: (min,max) of new axis
    Return:
    new_ticks (list): new tick positions scaled to new axis
    """
    old_lo, old_hi = old_axis
    old_span = old_hi - old_lo
    new_lo, new_hi = new_axis
    new_span = new_hi - new_lo
    # fraction of the way along the old axis, re-applied to the new one
    return [new_lo + ((tick - old_lo) * 1.0 / old_span) * new_span
            for tick in tick_positions]
def GetFigureSpan(layout, fignum):
    """
    Find the rectangle of grid cells occupied by one figure number.

    Logs an error and exits with status 1 if the cells holding fignum do
    not form a contiguous rectangle.

    Input:
    layout ([[int]]) (list of list of ints): layout format array
    fignum (int): number of the figure we're processing
    Return:
    from_row, to_row, from_col, to_col (int,int,int,int), all inclusive
    """
    def _abort(message):
        LOG(sname, message)
        sys.exit(1)

    rows = [r for r, cells in enumerate(layout) if fignum in cells]
    from_row, to_row = min(rows), max(rows)
    if to_row - from_row + 1 != len(rows):
        _abort("ERROR: invalid layout grid. Noncontiguous figure (row)")

    # Column extent is read off the first occupied row ...
    first_row = layout[rows[0]]
    cols = [c for c, value in enumerate(first_row) if value == fignum]
    from_col, to_col = min(cols), max(cols)
    if to_col - from_col + 1 != len(cols):
        _abort("ERROR: invalid layout grid. Noncontiguous figure (col)")

    # ... and every occupied row must hold fignum in exactly those columns.
    for r in rows:
        for c, value in enumerate(layout[r]):
            inside = from_col <= c <= to_col
            if inside != (value == fignum):
                _abort("ERROR: invalid layout grid. Nongrid figure")
    return from_row, to_row, from_col, to_col
def MakeFigure(figcode, layout, figpath, size=None, gl={}, pngs=[]):
"""
Main function to process figures.
Lay the sub-figures out on a grid, execute each sub-figure's code against its
own axes, and save the assembled figure once for every path in figpath.
Inputs:
figcode ([[string]]): list of list of lines of code to execute for each figure
(an entry that yields no code produces a blank panel with its axis turned off)
layout (string): layout format string, e.g. "(1,2),(3,4)"
figpath ([string]): paths to save the figure to; iterated, one savefig per entry
size (int,int): width/height in inches. If None, the figure is sized from
LETTERSIZE and saved PDFs get their page box extended to that aspect ratio.
If given, only size[0] is used here: dpi is set to 400/size[0].
gl: dictionary of global variables (from globals()); used as the globals of
the executed figure code
pngs [int]: list of 0-based positions in figcode to make as pngs (because they're too big otherwise)
"""
# parse layout
# "(1,2),(3,4)" -> [[1,2],[3,4]]: drop "(", split on ")", then split each row on ","
layout = [map(int,item.strip(",").split(",")) for item in layout.replace("(","").split(")")[:-1]]
# check
# every row must have the same number of columns
numrows = len(layout)
numcols = [len(item) for item in layout]
if not numcols.count(numcols[0]) == len(numcols):
LOG(sname, "ERROR: invalid layout grid")
sys.exit(1)
numcols = numcols[0]
# every number 1..len(figcode) must appear somewhere in the layout
lf = set(itertools.chain.from_iterable(layout))
for i in range(1, len(figcode)+1):
if i not in lf:
LOG(sname, "ERROR: not enough positions specified in layout")
sys.exit(1)
# set up figure
plt.clf()
fig = plt.figure(1)
grid_width = 1.0/numcols
grid_height = grid_width # make them square
# fraction of a grid cell actually used by the plot axes; shrinks as the grid grows
col_scale = 1
row_scale = 1
if numrows == 2 or numcols == 2:
row_scale = 0.8
col_scale = 0.8
if numrows == 3: row_scale = 0.7
if numcols == 3: col_scale = 0.7
# fignum is the 1-based layout number looked up by GetFigureSpan and the index
# (minus one) of the panel letter
fignum = 1
for i in range(len(figcode)):
figletter = NumberToLetter[fignum-1]
# get span
from_row, to_row, from_col, to_col = GetFigureSpan(layout, fignum)
# get letter label
# drawn on a separate invisible axes, only for multi-panel figures and non-empty panels
if len(figcode) > 1 and len(figcode[i]) > 0:
ax = fig.add_axes([from_col*grid_width, 1-(from_row+1)*grid_height, grid_width, grid_height])
ax.set_axis_off()
ax.set_ylim(bottom=0, top=1)
ax.text(0,0.8,figletter, size=20, weight="bold")
colspan = (to_col-from_col+1)
rowspan = (to_row-from_row+1)
# full width/height of the spanned cells, with only the last cell scaled down
w = grid_width*(colspan-1)+grid_width*col_scale
h = grid_height*(rowspan-1)+grid_height*row_scale
# the axes named "ax" that the sub-figure code is expected to draw on
ax = fig.add_axes([from_col*grid_width+(1-col_scale)*0.7*grid_width, 1-(to_row+1)*grid_height, w, h])
# Drop lines of the cell that create their own figure/axes or resize the figure,
# so the cell's plotting calls act on the "ax" prepared above.
newcode = ""
for codeline in figcode[i]:
if "fig =" not in codeline and "fig=" not in codeline and \
"ax =" not in codeline and \
"set_size_inches" not in codeline:
newcode = newcode + codeline
if i in pngs:
# NOTE(review): fixed /tmp location with a random integer name; the file is
# not deleted in this function.
fname = "/tmp/%s.png"%(random.randint(0,1000000))
# Make a new figure, which we'll save to png (only the non-axis part)
# addcodelines run before the cell's code, aftercodelines after it; all of
# it is executed as one program in the exec call below.
addcodelines = []
aftercodelines = []
addcodelines.append("ax_old = ax") # keep track of old axes
addcodelines.append("fig2 = plt.figure(2)") # new figure
addcodelines.append("ax = fig2.add_axes([0,0,w,h])") # new axes
aftercodelines.append("xticklabels = [t.get_text() for t in ax.get_xticklabels()]")
aftercodelines.append("yticklabels = [t.get_text() for t in ax.get_yticklabels()]")
aftercodelines.append("if xticklabels[0] == \"\": xticklabels = ax.get_xticks()")
aftercodelines.append("if yticklabels[0] == \"\": yticklabels = ax.get_yticks()")
aftercodelines.append("ax.set_axis_off()")
aftercodelines.append("ax.get_xaxis().set_visible(False)")
aftercodelines.append("ax.get_yaxis().set_visible(False)")
aftercodelines.append("plt.savefig(\"%s\", bbox_inches=\"tight\", pad_inches=0, dpi=500)"%fname) # save as png
aftercodelines.append("plt.close(2)")
aftercodelines.append("plt.figure(1)") # get back to figure 1
aftercodelines.append("ax_png = ax")
aftercodelines.append("ax = ax_old") # get back to the axis we want to plot
aftercodelines.append("img = mpimg.imread(\"%s\")"%fname)
aftercodelines.append("ax.imshow(img, extent=[0,1.1,0,1.1], interpolation=\"nearest\", aspect=\"equal\")")
# set the axis to how it should be
aftercodelines.append("ax.set_xlabel(ax_png.get_xlabel())")
aftercodelines.append("ax.set_ylabel(ax_png.get_ylabel())")
aftercodelines.append("ax.set_xticks(ScaleToAxis(ax_png.get_xticks(), ax_png.get_xlim(), ax.get_xlim()))")
aftercodelines.append("ax.set_yticks(ScaleToAxis(ax_png.get_yticks(), ax_png.get_ylim(), ax.get_ylim()))")
aftercodelines.append("ax.set_xticklabels(xticklabels, size=12)");
aftercodelines.append("ax.set_yticklabels(yticklabels, size=12)");
newcode = "\n".join(addcodelines) + "\n" + newcode + "\n" + "\n".join(aftercodelines)
if len(newcode) > 0:
# The code runs with gl as globals and this function's locals() as locals;
# that is how it reaches ax, w and h. Renaming those locals would break it.
newcode_comp = compile(newcode, "<string>", "exec")
exec(newcode_comp, gl, locals())
# NOTE(review): the else below can only pair with "if len(newcode) > 0", so
# fignum advances only for panels that had code; a panel without code leaves
# the layout number (and letter) unchanged for the next panel - confirm intended.
fignum = fignum + 1
else: ax.set_axis_off()
# set size
if size is None:
size = LETTERSIZE
pad = 0.42
# width is the page width minus pad; height is derived from the grid shape
fig.set_size_inches((size[0]-pad, (size[0]-pad)*numcols*1.0/numrows))
dpi = 500
else:
xPix = 400
dpi = xPix/size[0]
for p in figpath:
plt.savefig(p, bbox_inches="tight", pad_inches=0, dpi=dpi)
# if pdf and size is letter, change the paper size
if ".pdf" in p and size == LETTERSIZE:
# NOTE(review): the handles opened here and for the .tmp output below are
# never closed explicitly.
pr = PyPDF2.PdfFileReader(open(p,"rb"))
page1 = pr.pages[0]
# extend the paper to letter size
# keep the page width, grow the height downwards to the LETTERSIZE aspect ratio
mbox = page1.mediaBox
newh = (float(mbox[2])*LETTERSIZE[1]/LETTERSIZE[0])
deltaH = newh - float(mbox[3])
page1.mediaBox = PyPDF2.generic.RectangleObject([0,-1*deltaH,mbox[2],mbox[3]])
# write it
# written to a temporary name first, then moved over the original
wr = PyPDF2.PdfFileWriter()
wr.addPage(page1)
wr.write(open(p+".tmp","wb"))
os.system("mv -f %s %s"%(p+".tmp",p))
def ProcessFigure(figdata, figpath, FigureToCode, FigureToLegend, size=None, gl={}):
    """
    Render one figure entry and build its legend.

    Input:
    figdata: item from "Figures" list in figlist; read for the keys
        "FigureTitle", "SubFigures" and "Layout"
    figpath: output path argument, forwarded unchanged to MakeFigure
    FigureToCode (dict:string->[string]): code for each subfigure
    FigureToLegend (dict:string->string): legend for each subfigure
    size: (int,int): width/height of the figure in inches. If None, use letter size
    gl: dictionary of global variables, from calling globals(); forwarded to
        MakeFigure (the shared default is kept for interface compatibility and
        is not modified in this function)
    Return:
    legend [(string, format)] formatted using docx style: the bold title,
    then for each sub-figure a bold letter (only when there are several
    sub-figures) followed by its legend text
    """
    LOG(sname, " %s" % figdata["FigureTitle"])
    subfigs = figdata["SubFigures"]
    layout = figdata["Layout"]
    figcode = []
    sublegends = []
    pngs = []
    # enumerate() gives the real position of each entry; list.index() would
    # return the first occurrence and mis-flag a repeated "name:png" entry.
    for position, figname in enumerate(subfigs):
        if ":png" in figname:
            pngs.append(position)
        figname = figname.split(":")[0]
        # missing code -> empty panel; missing legend -> "No legend"
        figcode.append(FigureToCode.get(figname, ""))
        sublegends.append(FigureToLegend.get(figname, "No legend"))
    legend_text = [(figdata["FigureTitle"] + ". ", 'b')]
    for position, item in enumerate(sublegends):
        if len(sublegends) > 1:
            legend_text.append((NumberToLetter[position] + ". ", 'b'))
        legend_text.append(item + " ")
    MakeFigure(figcode, layout, figpath, size=size, gl=gl, pngs=pngs)
    return legend_text
def ConvertToString(val):
    """
    Convert values to strings for table

    Values that float() accepts are formatted with two significant digits
    ("{:.2g}"); everything else is passed through str().

    Input:
    val (object)
    Return:
    string
    """
    try:
        return "{:.2g}".format(float(val))
    except (TypeError, ValueError, OverflowError):
        # Not numeric (or too large for a float). Only conversion failures are
        # caught, so KeyboardInterrupt/SystemExit still propagate.
        return str(val)
def MakeTable(tablecode, gl=None):
    """
    Main function to process tables

    Executes the cell's code, then evaluates its last line as an expression
    to obtain the resulting pandas DataFrame.

    Input:
    tablecode [string]: lines of code to create table, should return a pandas DataFrame
    gl: global variables from calling globals(); a fresh dict is used when omitted
    Return:
    [[string]]: list of rows for the table, will be processed by docx to make table
        (header row first, then one row per DataFrame row with every value
        passed through ConvertToString)
    """
    if gl is None:
        gl = {}
    # Variables assigned by the cell live in their own namespace instead of this
    # function's locals(), so a cell may use any name (including "df") without
    # colliding with the locals of MakeTable.
    scope = {}
    comp = compile("".join(tablecode), "<string>", "exec")
    exec(comp, gl, scope)
    # NOTE: the last line is evaluated a second time here, after the exec above.
    df = eval(tablecode[-1].strip(), gl, scope)
    df_list = [list(df.columns)]
    for i in range(df.shape[0]):
        df_list.append([ConvertToString(value) for value in df.iloc[i, :]])
    return df_list
def ProcessTable(tabledata, FigureToCode, FigureToLegend, gl={}):
    """
    Build one table and its legend.

    Input:
    tabledata: item from "Tables" list in figlist; read for the keys
        "TableTitle" and "Table"
    FigureToCode (dict:string->[string]): code for each table
    FigureToLegend (dict:string->string): legend for each table
    gl: dictionary of global variables, from calling globals()
    Return:
    table [[string]]: list of rows for the table, will be processed by docx to make table
    legend [(string, format)] formatted using docx style: the bold title
        followed by the legend text (empty string when there is none)
    """
    LOG(sname, " %s" % tabledata["TableTitle"])
    heading_run = (tabledata["TableTitle"] + ". ", 'b')
    legend_body = FigureToLegend.get(tabledata["Table"], "")
    legend = [heading_run, legend_body]
    # A table without code is an error here (KeyError), unlike figures.
    table = MakeTable(FigureToCode[tabledata["Table"]], gl=gl)
    return table, legend
########################################
# Set up MS word stuff
# Document metadata and package parts that are later handed to savedocx() for
# both output .docx files. The helper functions are not defined in this file;
# presumably they come from "from docx import *" - confirm against the
# installed docx module.
title = "Figures"
subject = "Figures"
creator = 'Melissa Gymrek'
keywords = []
coreprops = coreproperties(title=title, subject=subject, creator=creator,
keywords=keywords)
appprops = appproperties()
# NOTE: the next two assignments rebind the factory names to their results, so
# contenttypes() and websettings() cannot be called again after this point.
contenttypes = contenttypes()
websettings = websettings()
# Load figlist
if VERBOSE: LOG(sname, "Parsing figlist")
# The JSON is read through pandas; its top-level keys are accessed below as
# figlist.MainText and figlist.Supplemental.
figlist = pd.read_json(FIGLIST_FILE)
# Load code and legend for each figure from Ipython notebooks
if VERBOSE: LOG(sname, "Parsing ipython notebokos")
FigureToCode = {}
FigureToLegend = {}
SupportingCode = []
# Results of all notebooks are merged; for a name present in several notebooks
# the last notebook wins (dict.update).
for nbfile in NB_FILES:
a,b,c = ParseNB(nbfile)
FigureToCode.update(a)
FigureToLegend.update(b)
SupportingCode.extend(c)
# Check that we have everything we need (code and legends for all figures)
# Missing pieces are only reported as warnings; processing continues.
all_figure_names = GetAllFigureNames(figlist)
for fig in all_figure_names:
if fig not in FigureToCode:
LOG(sname, "WARNING: Figure %s has no code"%(fig))
if fig not in FigureToLegend:
LOG(sname, "WARNING: Figure %s has no legend"%(fig))
all_table_names = GetAllTableNames(figlist)
for tab in all_table_names:
if tab not in FigureToCode:
LOG(sname, "WARNING: Table %s has no code"%(tab))
if tab not in FigureToLegend:
LOG(sname, "WARNING: Table %s has no legend"%(tab))
# Run supporting code
# Every notebook cell that is not a figure/table cell is executed here, in this
# module's own namespace, so that the figure and table code (run later with
# gl=globals()) can see the variables these cells define.
if VERBOSE: LOG(sname, "Executing supporting code")
for cell in SupportingCode:
    # Skip IPython magics, i.e. lines whose first character is "%".
    # startswith() is also safe for an empty line, where line[0] would raise.
    newcell = [line for line in cell if not line.startswith("%")]
    code_comp = compile("".join(newcell), "<string>", "exec")
    exec(code_comp)
# Process Main figures
# Each main text figure is written to "<out>.<FigureName>.pdf"; only its heading
# and legend go into the main text .docx.
if VERBOSE: LOG(sname, "Process main figures")
main_figs = figlist.MainText["Figures"]
main_tables = figlist.MainText["Tables"]
relationships = relationshiplist()
document = newdocument()
body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
for mf in main_figs:
legend_text = ProcessFigure(mf, ["%s.%s.pdf"%(OUT_PREFIX, mf["FigureName"])], FigureToCode, FigureToLegend, gl=globals())
body.append(heading(mf["FigureName"],2))
body.append(paragraph(legend_text))
# Process Main Tables
if VERBOSE: LOG(sname, "Process main tables")
# tables start on a fresh page after the figure legends
if len(main_tables) > 0:
body.append(pagebreak(type="page", orient="portrait"))
tablenum = 1
for mt in main_tables:
tbl, legend = ProcessTable(mt, FigureToCode, FigureToLegend, gl=globals())
if tbl != []:
body.append(heading("Table %s"%tablenum, 1))
body.append(table(tbl))
body.append(paragraph(legend))
# page break between tables, decided by comparing the entry with the last one
if mt != main_tables[-1]:
body.append(pagebreak(type="page", orient="portrait"))
tablenum = tablenum + 1
wr = wordrelationships(relationships)
savedocx(document, coreprops, appprops, contenttypes, websettings,
wr, "%s.maintext_legends_and_tables.docx"%OUT_PREFIX)
# Process Supplemental figures
# Each supplemental figure is rendered to a temporary png in the working
# directory, embedded in the supplemental .docx, and the png is then deleted.
if VERBOSE: LOG(sname, "Process supplemental figures")
relationships = relationshiplist()
document = newdocument()
body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
try:
    os.mkdir("%s_supp_pdfs"%OUT_PREFIX)
except OSError: pass  # best effort: e.g. the directory is already there
supp_figs = figlist.Supplemental["Figures"]
supp_tables = figlist.Supplemental["Tables"]
fignum = 1
for sf in supp_figs:
    # NOTE(review): figpath_pdf is computed but never passed on, so nothing is
    # written into the <out>_supp_pdfs directory - confirm whether it should be
    # added to the list of paths given to ProcessFigure.
    figpath_pdf = "%s_supp_pdfs/%s.pdf"%(OUT_PREFIX, sf["FigureName"])
    figpath_png = "%s.png"%(sf["FigureName"])
    legend_text = ProcessFigure(sf, [figpath_png], FigureToCode, FigureToLegend, size=(8,4), gl=globals())
    relationships, picpara = picture(relationships, figpath_png, sf["FigureName"])
    body.append(heading("Supplemental Figure %s"%fignum, 1))
    body.append(picpara)
    body.append(paragraph(legend_text))
    # page break after every figure except the last one, unless tables follow
    if (sf != supp_figs[-1]) or (len(supp_tables) > 0):
        body.append(pagebreak(type='page', orient='portrait'))
    fignum = fignum + 1
    # Delete the temporary png directly rather than through a shell command,
    # so figure names with spaces or shell metacharacters are handled safely.
    try:
        os.remove(figpath_png)
    except OSError as err:
        LOG(sname, "WARNING: could not remove %s: %s"%(figpath_png, err))
# Process Supplemental tables
# Appended to the same document/body as the supplemental figures, then the
# whole supplemental .docx is saved.
if VERBOSE: LOG(sname, "Process supplemental tables")
tablenum = 1
for st in supp_tables:
tbl, legend = ProcessTable(st, FigureToCode, FigureToLegend, gl=globals())
body.append(heading("Supplemental Table %s"%tablenum, 1))
body.append(table(tbl))
body.append(paragraph(legend))
# page break between tables, decided by comparing the entry with the last one
if st != supp_tables[-1]:
body.append(pagebreak(type='page', orient='portrait'))
tablenum = tablenum + 1
wr = wordrelationships(relationships)
savedocx(document, coreprops, appprops, contenttypes, websettings,
wr, "%s.supplemental_figures_and_tables.docx"%OUT_PREFIX)
LOG(sname, "Done!")
{
"MainText": {
"Figures": [
{
"FigureName": "Figure1",
"FigureTitle": "TestFigure1",
"SubFigures": [
"fig1",
"fig2",
"fig1",
"fig2"
],
"Layout": "(1,2),(3,4)"
}
],
"Tables": []
},
"Supplemental": {
"Figures": [
{
"FigureName": "SuppFig1",
"FigureTitle": "SuppFig1Test",
"SubFigures": [
"fig2",
"fig1"
],
"Layout": "(1,2)"
},
{
"FigureName": "SuppFig2",
"FigureTitle": "SuppFig2Test",
"SubFigures": [
"fig1"
],
"Layout": "(1)"
}
],
"Tables": [
{
"TableName": "SuppTable1",
"TableTitle": "Testing tables",
"Table": "test-table"
}
]
}
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment