Skip to content

Instantly share code, notes, and snippets.

@parj
Created April 2, 2026 13:26
Show Gist options
  • Select an option

  • Save parj/5e6bfd88710e8aed171a6394fba01b91 to your computer and use it in GitHub Desktop.

Select an option

Save parj/5e6bfd88710e8aed171a6394fba01b91 to your computer and use it in GitHub Desktop.
reader.py
# “””
xlsb_formula_reader.py
Pure-Python reader that extracts cell formulas from Excel Binary Workbook
(.xlsb) files. No third-party dependencies — only the Python standard library.
Implemented from:
• [MS-XLSB] Excel (.xlsb) Binary File Format
https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xlsb/
• Ptg token table from MS-XLSB specification §2.5.98.16
## Dispatch rules for Ptg bytes
Tokens 0x01–0x1F : exact byte match (operators, literals, control tokens).
Tokens 0x20–0x7F : three class variants per token type:
REF class = base_value (e.g. PtgRef = 0x24)
VAL class = base_value + 0x20 (e.g. PtgRef = 0x44)
ARR class = base_value + 0x40 (e.g. PtgRef = 0x64)
Dispatch on ptg & 0x1F (strips the two class bits) — but only when
ptg >= 0x20, because the same low-5-bit values are reused by operators
in the 0x03–0x14 range.
## Usage
```
python xlsb_formula_reader.py workbook.xlsb [sheet_name]
```
Or programmatically:
```
from xlsb_formula_reader import XlsbWorkbook, col_to_letter
with XlsbWorkbook("workbook.xlsb") as wb:
for sheet_name, formulas in wb.iter_formulas():
for (row, col), formula in sorted(formulas.items()):
print(f"{sheet_name}!{col_to_letter(col)}{row+1}: {formula}")
```
“””
import io
import os
import re
import struct
import zipfile
from typing import Dict, Iterator, List, Optional, Tuple
# —————————————————————————
# Helpers
# —————————————————————————
def col_to_letter(col: int) -> str:
“”“Convert 0-based column index to Excel letter(s), e.g. 0→A, 26→AA.”””
result = “”
col += 1
while col:
col, rem = divmod(col - 1, 26)
result = chr(65 + rem) + result
return result
def cell_ref(row: int, col: int) -> str:
return f”{col_to_letter(col)}{row + 1}”
def _fmt_num(d: float) -> str:
“”“Format a double like Excel (no unnecessary decimals).”””
if d == int(d) and abs(d) < 1e15:
return str(int(d))
return repr(d)
# —————————————————————————
# BIFF12 record stream reader
# —————————————————————————
class RecordReader:
“””
Iterates over BIFF12 variable-length records.
```
Each record: [type: 1–2 bytes varint] [size: 1–4 bytes varint] [data: size bytes]
Yields (record_type: int, data: bytes).
"""
def __init__(self, data: bytes):
self._buf = data
self._pos = 0
def _varint(self) -> int:
result = 0
shift = 0
for _ in range(4):
if self._pos >= len(self._buf):
raise EOFError
b = self._buf[self._pos]; self._pos += 1
result |= (b & 0x7F) << shift
shift += 7
if not (b & 0x80):
break
return result
def __iter__(self):
return self
def __next__(self) -> Tuple[int, bytes]:
if self._pos >= len(self._buf):
raise StopIteration
# record type
b0 = self._buf[self._pos]; self._pos += 1
if b0 & 0x80:
if self._pos >= len(self._buf):
raise StopIteration
b1 = self._buf[self._pos]; self._pos += 1
rec_type = (b0 & 0x7F) | ((b1 & 0x7F) << 7)
else:
rec_type = b0
# record size
size = self._varint()
# data
data = self._buf[self._pos: self._pos + size]
self._pos += size
return rec_type, data
```
# —————————————————————————
# Record type IDs (subset)
# —————————————————————————
BRT_ROW_HDR = 0x0000
BRT_CELL_BLANK = 0x0001
BRT_CELL_RK = 0x0002
BRT_CELL_ERROR = 0x0003
BRT_CELL_BOOL = 0x0004
BRT_CELL_REAL = 0x0005
BRT_CELL_ST = 0x0006
BRT_CELL_ISST = 0x0007
BRT_FMLA_STRING = 0x0008
BRT_FMLA_NUM = 0x0009
BRT_FMLA_BOOL = 0x000A
BRT_FMLA_ERROR = 0x000B
BRT_SST_ITEM = 0x0013
BRT_BEGIN_SST = 0x009F
BRT_END_SST = 0x00A0
BRT_BEGIN_SHEET = 0x0081
BRT_END_SHEET = 0x0082
BRT_BUNDLE_SH = 0x009C
# —————————————————————————
# Ptg exact-byte token constants (ptg < 0x20)
# —————————————————————————
PTG_EXP = 0x01
PTG_TBL = 0x02
PTG_ADD = 0x03
PTG_SUB = 0x04
PTG_MUL = 0x05
PTG_DIV = 0x06
PTG_POWER = 0x07
PTG_CONCAT = 0x08
PTG_LT = 0x09
PTG_LE = 0x0A
PTG_EQ = 0x0B
PTG_GE = 0x0C
PTG_GT = 0x0D
PTG_NE = 0x0E
PTG_ISECT = 0x0F
PTG_LIST = 0x10 # union operator
PTG_RANGE = 0x11
PTG_UPLUS = 0x12
PTG_UMINUS = 0x13
PTG_PERCENT = 0x14
PTG_PAREN = 0x15
PTG_MISSARG = 0x16
PTG_STR = 0x17
PTG_ATTR = 0x19
PTG_ERR = 0x1C
PTG_BOOL = 0x1D
PTG_INT = 0x1E
PTG_NUM = 0x1F
# —————————————————————————
# Class-variant base values (ptg & 0x1F when ptg >= 0x20)
#
# Each token exists in three variants:
# REF = _B_xxx (e.g. PtgRef = 0x24, _B_REF = 0x04)
# VAL = _B_xxx + 0x20 (e.g. PtgRefV = 0x44)
# ARR = _B_xxx + 0x40 (e.g. PtgRefA = 0x64)
# —————————————————————————
_B_ARRAY = 0x00 # 0x20/0x40/0x60
_B_FUNC = 0x01 # 0x21/0x41/0x61
_B_FUNC_VAR = 0x02 # 0x22/0x42/0x62
_B_NAME = 0x03 # 0x23/0x43/0x63
_B_REF = 0x04 # 0x24/0x44/0x64
_B_AREA = 0x05 # 0x25/0x45/0x65
_B_MEM_AREA = 0x06 # 0x26/0x46/0x66
_B_MEM_ERR = 0x07 # 0x27/0x47/0x67
_B_MEM_NOMEM = 0x08 # 0x28/0x48/0x68
_B_MEM_FUNC = 0x09 # 0x29/0x49/0x69
_B_REF_ERR = 0x0A # 0x2A/0x4A/0x6A
_B_AREA_ERR = 0x0B # 0x2B/0x4B/0x6B
_B_REF_N = 0x0C # 0x2C/0x4C/0x6C
_B_AREA_N = 0x0D # 0x2D/0x4D/0x6D
_B_NAME_X = 0x19 # 0x39/0x59/0x79
_B_REF_3D = 0x1A # 0x3A/0x5A/0x7A
_B_AREA_3D = 0x1B # 0x3B/0x5B/0x7B
_B_REF_ERR3D = 0x1C # 0x3C/0x5C/0x7C
_B_AREA_ERR3D= 0x1D # 0x3D/0x5D/0x7D
# PtgAttr sub-type flags
_ATTR_SEMI = 0x01
_ATTR_IF = 0x02
_ATTR_CHOOSE = 0x04
_ATTR_GOTO = 0x08
_ATTR_SUM = 0x10
_ATTR_BAXCEL = 0x20
_ATTR_SPACE = 0x40
# Error code → string
ERR_CODES = {
0x00: “#NULL!”, 0x07: “#DIV/0!”, 0x0F: “#VALUE!”,
0x17: “#REF!”, 0x1D: “#NAME?”, 0x24: “#NUM!”,
0x2A: “#N/A”, 0x2B: “#GETTING_DATA”,
}
# —————————————————————————
# Built-in function table iftab → name
# (Covers all common functions; from MS-XLSB §2.5.97.4)
# —————————————————————————
_FUNC: Dict[int, str] = {
0x0000:“COUNT”, 0x0001:“IF”, 0x0002:“ISNA”,
0x0003:“ISERROR”, 0x0004:“SUM”, 0x0005:“AVERAGE”,
0x0006:“MIN”, 0x0007:“MAX”, 0x0008:“ROW”,
0x0009:“COLUMN”, 0x000A:“NA”, 0x000B:“NPV”,
0x000C:“STDEV”, 0x000D:“DOLLAR”, 0x000E:“FIXED”,
0x000F:“SIN”, 0x0010:“COS”, 0x0011:“TAN”,
0x0012:“ATAN”, 0x0013:“PI”, 0x0014:“SQRT”,
0x0015:“EXP”, 0x0016:“LN”, 0x0017:“LOG10”,
0x0018:“ABS”, 0x0019:“INT”, 0x001A:“SIGN”,
0x001B:“ROUND”, 0x001C:“LOOKUP”, 0x001D:“INDEX”,
0x001E:“REPT”, 0x001F:“MID”, 0x0020:“LEN”,
0x0021:“VALUE”, 0x0022:“TRUE”, 0x0023:“FALSE”,
0x0024:“AND”, 0x0025:“OR”, 0x0026:“NOT”,
0x0027:“MOD”, 0x0028:“DCOUNT”, 0x0029:“DSUM”,
0x002A:“DAVERAGE”, 0x002B:“DMIN”, 0x002C:“DMAX”,
0x002D:“DSTDEV”, 0x002E:“VAR”, 0x002F:“DVAR”,
0x0030:“TEXT”, 0x0031:“LINEST”, 0x0032:“TREND”,
0x0033:“LOGEST”, 0x0034:“GROWTH”, 0x0038:“PV”,
0x0039:“FV”, 0x003A:“NPER”, 0x003B:“PMT”,
0x003C:“RATE”, 0x003D:“MIRR”, 0x003E:“IRR”,
0x003F:“RAND”, 0x0040:“MATCH”, 0x0041:“DATE”,
0x0042:“TIME”, 0x0043:“DAY”, 0x0044:“MONTH”,
0x0045:“YEAR”, 0x0046:“WEEKDAY”, 0x0047:“HOUR”,
0x0048:“MINUTE”, 0x0049:“SECOND”, 0x004A:“NOW”,
0x004B:“AREAS”, 0x004C:“ROWS”, 0x004D:“COLUMNS”,
0x004E:“OFFSET”, 0x0052:“SEARCH”, 0x0053:“TRANSPOSE”,
0x0056:“TYPE”, 0x0061:“ATAN2”, 0x0062:“ASIN”,
0x0063:“ACOS”, 0x0064:“CHOOSE”, 0x0065:“HLOOKUP”,
0x0066:“VLOOKUP”, 0x0069:“ISREF”, 0x006D:“LOG”,
0x006F:“CHAR”, 0x0070:“LOWER”, 0x0071:“UPPER”,
0x0072:“PROPER”, 0x0073:“LEFT”, 0x0074:“RIGHT”,
0x0075:“EXACT”, 0x0076:“TRIM”, 0x0077:“REPLACE”,
0x0078:“SUBSTITUTE”,0x0079:“CODE”, 0x007C:“FIND”,
0x007D:“CELL”, 0x007E:“ISERR”, 0x007F:“ISTEXT”,
0x0080:“ISNUMBER”, 0x0081:“ISBLANK”, 0x0082:“T”,
0x0083:“N”, 0x008C:“DATEVALUE”, 0x008D:“TIMEVALUE”,
0x008E:“SLN”, 0x008F:“SYD”, 0x0090:“DDB”,
0x0094:“INDIRECT”, 0x00BF:“STDEVP”, 0x00C0:“VARP”,
0x00C1:“DSTDEVP”, 0x00C2:“DVARP”, 0x00C3:“TRUNC”,
0x00C4:“ISLOGICAL”,0x00C5:“DCOUNTA”, 0x00C6:“CLEAN”,
0x00C7:“MDETERM”, 0x00C8:“MINVERSE”, 0x00C9:“MMULT”,
0x00CB:“IPMT”, 0x00CC:“PPMT”, 0x00CD:“COUNTA”,
0x00D7:“PRODUCT”, 0x00D8:“FACT”, 0x00DB:“DPRODUCT”,
0x00DC:“ISNONTEXT”,0x00DD:“GETPIVOTDATA”,0x00DE:“MEDIAN”,
0x00DF:“SUMPRODUCT”,0x00E0:“SINH”, 0x00E1:“COSH”,
0x00E2:“TANH”, 0x00E3:“ASINH”, 0x00E4:“ACOSH”,
0x00E5:“ATANH”, 0x00E6:“DGET”, 0x00EF:“INFO”,
0x00F2:“DB”, 0x00F9:“FREQUENCY”, 0x0100:“DAYS360”,
0x0101:“TODAY”, 0x0102:“VDB”, 0x0109:“ERRORTYPE”,
0x010B:“WORKDAY”, 0x010C:“NETWORKDAYS”,0x010D:“WEEKNUM”,
0x010E:“FLOOR”, 0x010F:“CEILING”, 0x0110:“ISEVEN”,
0x0111:“ISODD”, 0x0112:“MROUND”, 0x0113:“QUOTIENT”,
0x0114:“GCD”, 0x0115:“LCM”, 0x0116:“MULTINOMIAL”,
0x0117:“COMBIN”, 0x0118:“PERMUT”, 0x011A:“CONFIDENCE”,
0x011C:“EVEN”, 0x011D:“EXPONDIST”, 0x011E:“FDIST”,
0x011F:“FINV”, 0x0120:“FISHER”, 0x0121:“FISHERINV”,
0x0123:“GAMMADIST”,0x0124:“GAMMAINV”, 0x0126:“HYPGEOMDIST”,
0x0127:“LOGINV”, 0x0128:“LOGNORMDIST”,0x0129:“NEGBINOMDIST”,
0x012A:“NORMDIST”, 0x012B:“NORMSDIST”, 0x012C:“NORMINV”,
0x012D:“NORMSINV”, 0x012E:“STANDARDIZE”,0x012F:“ODD”,
0x0131:“POISSON”, 0x0132:“TDIST”, 0x0133:“WEIBULL”,
0x0134:“SUMXMY2”, 0x0135:“SUMX2MY2”, 0x0136:“SUMX2PY2”,
0x0137:“CHITEST”, 0x0138:“CORREL”, 0x0139:“COVAR”,
0x013A:“FORECAST”, 0x013B:“FTEST”, 0x013C:“INTERCEPT”,
0x013D:“PEARSON”, 0x013E:“RSQ”, 0x013F:“STEYX”,
0x0140:“SLOPE”, 0x0141:“TTEST”, 0x0142:“PROB”,
0x0143:“DEVSQ”, 0x0144:“GEOMEAN”, 0x0145:“HARMEAN”,
0x0146:“SUMSQ”, 0x0147:“KURT”, 0x0148:“SKEW”,
0x0149:“ZTEST”, 0x014A:“LARGE”, 0x014B:“SMALL”,
0x014C:“QUARTILE”, 0x014D:“PERCENTILE”,0x014E:“PERCENTRANK”,
0x014F:“MODE”, 0x0150:“TRIMMEAN”, 0x0151:“TINV”,
0x0157:“CONCATENATE”,0x0158:“POWER”, 0x015A:“RADIANS”,
0x015B:“DEGREES”, 0x015C:“SUBTOTAL”, 0x015D:“SUMIF”,
0x015E:“COUNTIF”, 0x015F:“COUNTBLANK”,0x0162:“ROMAN”,
0x0163:“HYPERLINK”,0x0164:“AVERAGEA”, 0x0165:“MAXA”,
0x0166:“MINA”, 0x0167:“STDEVPA”, 0x0168:“VARPA”,
0x0169:“STDEVA”, 0x016A:“VARA”, 0x0176:“RTD”,
0x017B:“HEX2BIN”, 0x017C:“HEX2DEC”, 0x017D:“HEX2OCT”,
0x017E:“DEC2BIN”, 0x017F:“DEC2HEX”, 0x0180:“DEC2OCT”,
0x0181:“OCT2BIN”, 0x0182:“OCT2DEC”, 0x0183:“OCT2HEX”,
0x0184:“BIN2DEC”, 0x0185:“BIN2OCT”, 0x0186:“BIN2HEX”,
0x0187:“IMSUB”, 0x0188:“IMDIV”, 0x0189:“IMPOWER”,
0x018A:“IMABS”, 0x018B:“IMSQRT”, 0x018C:“IMLN”,
0x018D:“IMLOG2”, 0x018E:“IMLOG10”, 0x018F:“IMSIN”,
0x0190:“IMCOS”, 0x0191:“IMEXP”, 0x0192:“IMARGUMENT”,
0x0193:“IMCONJUGATE”,0x0194:“IMAGINARY”,0x0195:“IMREAL”,
0x0196:“COMPLEX”, 0x0197:“IMSUM”, 0x0198:“IMPRODUCT”,
0x0199:“SERIESSUM”,0x019A:“FACTDOUBLE”,0x019B:“SQRTPI”,
0x019D:“DELTA”, 0x019E:“GESTEP”, 0x01A2:“ERF”,
0x01A3:“ERFC”, 0x01A4:“BESSELJ”, 0x01A5:“BESSELK”,
0x01A6:“BESSELY”, 0x01A7:“BESSELI”, 0x01A8:“XIRR”,
0x01A9:“XNPV”, 0x01AA:“PRICEMAT”, 0x01AB:“YIELDMAT”,
0x01AC:“INTRATE”, 0x01AD:“RECEIVED”, 0x01AE:“DISC”,
0x01AF:“PRICEDISC”,0x01B0:“YIELDDISC”, 0x01B1:“TBILLEQ”,
0x01B2:“TBILLPRICE”,0x01B3:“TBILLYIELD”,0x01B4:“PRICE”,
0x01B5:“YIELD”, 0x01B6:“DOLLARDE”, 0x01B7:“DOLLARFR”,
0x01B8:“NOMINAL”, 0x01B9:“EFFECT”, 0x01BA:“CUMPRINC”,
0x01BB:“CUMIPMT”, 0x01BC:“EDATE”, 0x01BD:“EOMONTH”,
0x01BE:“YEARFRAC”, 0x01BF:“COUPDAYBS”, 0x01C0:“COUPDAYS”,
0x01C1:“COUPDAYSNC”,0x01C2:“COUPNCD”, 0x01C3:“COUPNUM”,
0x01C4:“COUPPCD”, 0x01C5:“DURATION”, 0x01C6:“MDURATION”,
0x01C7:“ODDLPRICE”,0x01C8:“ODDLYIELD”, 0x01C9:“ODDFPRICE”,
0x01CA:“ODDFYIELD”,0x01CB:“RANDBETWEEN”,0x01CC:“WEEKNUM”,
0x01CD:“AMORDEGRC”,0x01CE:“AMORLINC”, 0x01CF:“CONVERT”,
0x01D0:“ACCRINT”, 0x01D1:“ACCRINTM”, 0x01D2:“WORKDAY”,
0x01D3:“NETWORKDAYS”,0x01D4:“GCD”, 0x01D5:“MULTINOMIAL”,
0x01D6:“LCM”, 0x01D7:“FVSCHEDULE”,0x01D8:“CUBESETCOUNT”,
0x01D9:“CUBESET”, 0x01DA:“IFERROR”, 0x01DB:“COUNTIFS”,
0x01DC:“SUMIFS”, 0x01DD:“AVERAGEIF”, 0x01DE:“AVERAGEIFS”,
0x01DF:“AGGREGATE”,0x0213:“IFNA”, 0x0214:“IFS”,
0x0215:“SWITCH”, 0x0216:“MAXIFS”, 0x0217:“MINIFS”,
0x0241:“XLOOKUP”, 0x0244:“XMATCH”, 0x0246:“FILTER”,
0x0247:“SORT”, 0x0248:“SORTBY”, 0x0249:“UNIQUE”,
0x024A:“SEQUENCE”, 0x024B:“RANDARRAY”, 0x024C:“LET”,
}
def _func_name(iftab: int) -> str:
return _FUNC.get(iftab, f”_func{iftab:#06x}”)
# Fixed arity for PtgFunc tokens
_FIXED_ARITY: Dict[int, int] = {
0x0003:1, 0x000A:0, 0x000F:1, 0x0010:1, 0x0011:1, 0x0012:1,
0x0013:0, # PI
0x0014:1, 0x0015:1, 0x0016:1, 0x0017:1, 0x0018:1, 0x0019:1,
0x001A:1, 0x001B:2, 0x0022:0, 0x0023:0, 0x0026:1, 0x0027:2,
0x003F:0, 0x0041:3, 0x0042:3, 0x0043:1, 0x0044:1, 0x0045:1,
0x0046:1, 0x0047:1, 0x0048:1, 0x0049:1, 0x004A:0, 0x004B:1,
0x004C:1, 0x004D:1, 0x0061:2, 0x0062:1, 0x0063:1, 0x006D:2,
0x006F:1, 0x0070:1, 0x0071:1, 0x0072:1, 0x0075:2, 0x0076:1,
0x007E:1, 0x007F:1, 0x0080:1, 0x0081:1, 0x0082:1, 0x0083:1,
0x00C3:1, 0x00C4:1, 0x00C6:1, 0x00D8:1, 0x0100:2, 0x0101:0,
0x010E:2, 0x010F:2, 0x011C:1, 0x012F:1, 0x0157:2, 0x0158:2,
0x015A:1, 0x015B:1,
}
# —————————————————————————
# Helpers
# —————————————————————————
def _sheet_prefix(sheet_names: List[str], ixti: int) -> str:
if 0 <= ixti < len(sheet_names):
name = sheet_names[ixti]
if any(c in name for c in “ ![]’”):
name = f”’{name}’”
return f”{name}!”
return f”[sheet{ixti}]!”
def _read_rgce_loc(buf: io.BytesIO) -> Tuple[int, int, bool, bool]:
“”“RgceLoc: 4-byte signed row + 2-byte column-flags word.”””
row = struct.unpack(”<i”, buf.read(4))[0]
col_raw = struct.unpack(”<H”, buf.read(2))[0]
col = col_raw & 0x3FFF
col_rel = bool(col_raw & 0x4000)
row_rel = bool(col_raw & 0x8000)
return row, col, row_rel, col_rel
def _loc_str(row: int, col: int, row_rel: bool, col_rel: bool) -> str:
c = (”” if col_rel else “$”) + col_to_letter(col)
r = (”” if row_rel else “$”) + str(row + 1)
return c + r
# —————————————————————————
# Ptg decompiler
# —————————————————————————
class _Decompiler:
“””
Stack-based RPN → infix formula decompiler.
```
Two-phase dispatch:
ptg < 0x20 → exact-byte (operators, literals, control tokens)
ptg >= 0x20 → operand/function tokens, dispatched on ptg & 0x1F
"""
def __init__(self, rgce: bytes,
sheet_names: Optional[List[str]] = None,
defined_names: Optional[Dict[int, str]] = None):
self._buf = io.BytesIO(rgce)
self._end = len(rgce)
self._sheets = sheet_names or []
self._dnames = defined_names or {}
def _u8(self) -> int: return struct.unpack("<B", self._buf.read(1))[0]
def _u16(self) -> int: return struct.unpack("<H", self._buf.read(2))[0]
def _i32(self) -> int: return struct.unpack("<i", self._buf.read(4))[0]
def _u32(self) -> int: return struct.unpack("<I", self._buf.read(4))[0]
def _f64(self) -> float: return struct.unpack("<d", self._buf.read(8))[0]
def _wstr16(self) -> str:
cch = self._u16()
return self._buf.read(cch * 2).decode("utf-16-le", errors="replace")
@staticmethod
def _pop(stack: List[str]) -> str:
return stack.pop() if stack else "?"
@staticmethod
def _pop_n(stack: List[str], n: int) -> List[str]:
args: List[str] = []
for _ in range(n):
args.insert(0, stack.pop() if stack else "?")
return args
def decompile(self) -> str:
stack: List[str] = []
while self._buf.tell() < self._end:
ptg = self._u8()
# ================================================================
# Phase 1: exact-byte tokens (ptg < 0x20)
# ================================================================
if ptg < 0x20:
_BINOP = {
PTG_ADD:"+", PTG_SUB:"-", PTG_MUL:"*", PTG_DIV:"/",
PTG_POWER:"^", PTG_CONCAT:"&",
PTG_LT:"<", PTG_LE:"<=", PTG_EQ:"=",
PTG_GE:">=", PTG_GT:">", PTG_NE:"<>",
PTG_RANGE:":",
}
if ptg in _BINOP:
right = self._pop(stack); left = self._pop(stack)
stack.append(f"{left}{_BINOP[ptg]}{right}")
elif ptg == PTG_ISECT:
right = self._pop(stack); left = self._pop(stack)
stack.append(f"{left} {right}")
elif ptg == PTG_LIST:
right = self._pop(stack); left = self._pop(stack)
stack.append(f"({left},{right})")
elif ptg == PTG_UPLUS:
stack.append(f"+{self._pop(stack)}")
elif ptg == PTG_UMINUS:
stack.append(f"-{self._pop(stack)}")
elif ptg == PTG_PERCENT:
stack.append(f"{self._pop(stack)}%")
elif ptg == PTG_PAREN:
stack.append(f"({self._pop(stack)})")
elif ptg == PTG_MISSARG:
stack.append("")
elif ptg == PTG_STR:
stack.append(f'"{self._wstr16()}"')
elif ptg == PTG_ATTR:
atype = self._u8()
if atype & _ATTR_SUM:
self._u16() # skip size word
stack.append(f"SUM({self._pop(stack)})")
elif atype & _ATTR_CHOOSE:
ncases = self._u16()
for _ in range(ncases + 1):
self._u16() # offset array
elif atype & _ATTR_SPACE:
self._u8(); self._u8() # space type + count
else:
self._u16() # GOTO/IF/SEMI/BAXCEL offset
elif ptg == PTG_ERR:
stack.append(ERR_CODES.get(self._u8(), "#ERR"))
elif ptg == PTG_BOOL:
stack.append("TRUE" if self._u8() else "FALSE")
elif ptg == PTG_INT:
stack.append(str(self._u16()))
elif ptg == PTG_NUM:
stack.append(_fmt_num(self._f64()))
elif ptg == PTG_EXP:
row = self._i32(); col = self._u16()
stack.append(f"{{array@{cell_ref(row,col)}}}")
elif ptg == PTG_TBL:
row = self._i32(); col = self._u16()
stack.append(f"TABLE({cell_ref(row,col)})")
else:
stack.append(f"?ptg{ptg:#04x}")
# ================================================================
# Phase 2: class-variant tokens (ptg >= 0x20)
# Dispatch on ptg & 0x1F
# ================================================================
else:
base = ptg & 0x1F
if base == _B_ARRAY:
self._buf.read(7) # 7 placeholder bytes
stack.append("{array}")
elif base == _B_FUNC:
iftab = self._u16()
name = _func_name(iftab)
arity = _FIXED_ARITY.get(iftab)
args = (self._pop_n(stack, arity) if arity is not None
else [self._pop(stack)])
stack.append(f"{name}({','.join(args)})")
elif base == _B_FUNC_VAR:
cparams_raw = self._u8()
is_ce = bool(cparams_raw & 0x80)
cparams = cparams_raw & 0x7F
iftab = self._u16()
name = (_func_name(iftab) if not is_ce
else f"_xlfn.{_func_name(iftab)}")
args = self._pop_n(stack, cparams)
stack.append(f"{name}({','.join(args)})")
elif base == _B_NAME:
nameindex = self._u32()
stack.append(self._dnames.get(nameindex,
f"name{nameindex}"))
elif base == _B_REF:
row, col, rr, cr = _read_rgce_loc(self._buf)
stack.append(_loc_str(row, col, rr, cr))
elif base == _B_AREA:
r1,c1,r1r,c1r = _read_rgce_loc(self._buf)
r2,c2,r2r,c2r = _read_rgce_loc(self._buf)
stack.append(
f"{_loc_str(r1,c1,r1r,c1r)}:{_loc_str(r2,c2,r2r,c2r)}"
)
elif base in (_B_MEM_AREA, _B_MEM_ERR,
_B_MEM_NOMEM, _B_MEM_FUNC):
self._buf.read(4); self._u16() # 4 reserved + cce
elif base == _B_REF_ERR:
self._buf.read(6)
stack.append("#REF!")
elif base == _B_AREA_ERR:
self._buf.read(12)
stack.append("#REF!:#REF!")
elif base == _B_REF_N:
row, col, rr, cr = _read_rgce_loc(self._buf)
stack.append(_loc_str(row, col, rr, cr))
elif base == _B_AREA_N:
r1,c1,r1r,c1r = _read_rgce_loc(self._buf)
r2,c2,r2r,c2r = _read_rgce_loc(self._buf)
stack.append(
f"{_loc_str(r1,c1,r1r,c1r)}:{_loc_str(r2,c2,r2r,c2r)}"
)
elif base == _B_NAME_X:
self._u16() # ixti
nameindex = self._u32()
stack.append(self._dnames.get(nameindex,
f"name{nameindex}"))
elif base == _B_REF_3D:
ixti = self._u16()
row, col, rr, cr = _read_rgce_loc(self._buf)
pfx = _sheet_prefix(self._sheets, ixti)
stack.append(f"{pfx}{_loc_str(row, col, rr, cr)}")
elif base == _B_AREA_3D:
ixti = self._u16()
r1,c1,r1r,c1r = _read_rgce_loc(self._buf)
r2,c2,r2r,c2r = _read_rgce_loc(self._buf)
pfx = _sheet_prefix(self._sheets, ixti)
stack.append(
f"{pfx}"
f"{_loc_str(r1,c1,r1r,c1r)}:{_loc_str(r2,c2,r2r,c2r)}"
)
elif base == _B_REF_ERR3D:
self._buf.read(2 + 6)
stack.append("#REF!")
elif base == _B_AREA_ERR3D:
self._buf.read(2 + 12)
stack.append("#REF!")
else:
stack.append(f"?ptg{ptg:#04x}")
break
return stack[-1] if stack else ""
```
# —————————————————————————
# Shared-string table
# —————————————————————————
def _read_sst(data: bytes) -> List[str]:
strings: List[str] = []
for rec_type, payload in RecordReader(data):
if rec_type == BRT_SST_ITEM:
if len(payload) < 5:
strings.append(””); continue
buf = io.BytesIO(payload)
buf.read(1) # flags byte
cch = struct.unpack(”<I”, buf.read(4))[0]
strings.append(
buf.read(cch * 2).decode(“utf-16-le”, errors=“replace”)
)
return strings
# —————————————————————————
# Workbook part (xl/workbook.bin)
# —————————————————————————
def _read_workbook(data: bytes) -> List[Tuple[str, str]]:
“”“Return [(sheet_name, rel_id), …] in tab order.”””
sheets: List[Tuple[str, str]] = []
for rec_type, payload in RecordReader(data):
if rec_type == BRT_BUNDLE_SH and len(payload) >= 8:
buf = io.BytesIO(payload)
buf.read(8) # hsState + iTabID
cch_rel = struct.unpack(”<I”, buf.read(4))[0]
rel_id = buf.read(cch_rel * 2).decode(“utf-16-le”, errors=“replace”)
cch_nam = struct.unpack(”<I”, buf.read(4))[0]
name = buf.read(cch_nam * 2).decode(“utf-16-le”, errors=“replace”)
sheets.append((name, rel_id))
return sheets
# —————————————————————————
# Relationships
# —————————————————————————
def _read_rels(xml_data: bytes) -> Dict[str, str]:
text = xml_data.decode(“utf-8”, errors=“replace”)
return {
m.group(1): m.group(2)
for m in re.finditer(
r’<Relationship[^>]+Id=”([^”]+)”[^>]+Target=”([^”]+)”’, text
)
}
# —————————————————————————
# Worksheet parser
# —————————————————————————
_FMLA_RECS = {BRT_FMLA_STRING, BRT_FMLA_NUM, BRT_FMLA_BOOL, BRT_FMLA_ERROR}
def _parse_worksheet(
data: bytes,
sheet_names: List[str],
sst: List[str],
defined_names: Optional[Dict[int, str]] = None,
) -> Dict[Tuple[int, int], str]:
“””
Parse a worksheet .bin part.
Returns {(row, col): “=formula_string”}. Row and col are 0-based.
“””
formulas: Dict[Tuple[int, int], str] = {}
current_row = 0
dn = defined_names or {}
```
for rec_type, payload in RecordReader(data):
if rec_type == BRT_ROW_HDR:
if len(payload) >= 4:
current_row = struct.unpack_from("<I", payload, 0)[0]
elif rec_type in _FMLA_RECS:
if len(payload) < 10:
continue
buf = io.BytesIO(payload)
# col (4 bytes) + style ref (4 bytes)
col = struct.unpack("<I", buf.read(4))[0]
buf.read(4)
# Skip cached result
if rec_type == BRT_FMLA_STRING:
cch = struct.unpack("<I", buf.read(4))[0]
buf.read(cch * 2)
elif rec_type == BRT_FMLA_NUM:
buf.read(8)
elif rec_type in (BRT_FMLA_BOOL, BRT_FMLA_ERROR):
buf.read(1)
# 2-byte formula flags
buf.read(2)
# CellParsedFormula: 4-byte cce + cce bytes rgce
cce_raw = buf.read(4)
if len(cce_raw) < 4:
continue
cce = struct.unpack("<I", cce_raw)[0]
if cce == 0 or cce > 16384:
continue
rgce = buf.read(cce)
if len(rgce) < cce:
continue
try:
formula = "=" + _Decompiler(
rgce, sheet_names=sheet_names, defined_names=dn
).decompile()
except Exception as exc:
formula = f"=<parse_error:{exc}>"
formulas[(current_row, col)] = formula
return formulas
```
# —————————————————————————
# Public API
# —————————————————————————
class XlsbWorkbook:
“””
Open an .xlsb workbook and iterate its cell formulas.
```
Parameters
----------
path : str or path-like
Path to the .xlsb file.
Example
-------
>>> with XlsbWorkbook("data.xlsb") as wb:
... for sheet, formulas in wb.iter_formulas():
... for (row, col), f in sorted(formulas.items()):
... print(f"{sheet}!{col_to_letter(col)}{row+1}: {f}")
"""
def __init__(self, path: "os.PathLike"):
self._zf = zipfile.ZipFile(str(path), "r")
self._sst: List[str] = []
self._sheets: List[Tuple[str, str]] = []
self._paths: Dict[str, str] = {}
self._dnames: Dict[int, str] = {}
self._init_workbook()
self._init_sst()
def _read_part(self, path: str) -> bytes:
path = path.replace("\\", "/").lstrip("/")
try:
return self._zf.read(path)
except KeyError:
lo = path.lower()
for n in self._zf.namelist():
if n.lower() == lo:
return self._zf.read(n)
raise FileNotFoundError(f"Not found in XLSB zip: {path}")
def _init_workbook(self):
self._sheets = _read_workbook(self._read_part("xl/workbook.bin"))
try:
rels = _read_rels(self._read_part("xl/_rels/workbook.bin.rels"))
except FileNotFoundError:
rels = {}
for _name, rel_id in self._sheets:
target = rels.get(rel_id, "")
if target:
full = (f"xl/{target}" if not target.startswith("xl/")
else target)
self._paths[rel_id] = full.replace("//", "/")
def _init_sst(self):
try:
self._sst = _read_sst(self._read_part("xl/sharedStrings.bin"))
except FileNotFoundError:
self._sst = []
@property
def sheet_names(self) -> List[str]:
"""Ordered list of worksheet names."""
return [n for n, _ in self._sheets]
def iter_formulas(
self,
) -> Iterator[Tuple[str, Dict[Tuple[int, int], str]]]:
"""
Yield ``(sheet_name, formulas)`` for every sheet.
``formulas`` maps ``(row, col)`` → formula string (starts with ``=``).
Row and col are **0-based**.
"""
all_names = self.sheet_names
for sheet_name, rel_id in self._sheets:
zpath = self._paths.get(rel_id)
if not zpath:
yield sheet_name, {}; continue
try:
ws_data = self._read_part(zpath)
except FileNotFoundError:
yield sheet_name, {}; continue
yield sheet_name, _parse_worksheet(
ws_data, all_names, self._sst, self._dnames
)
def close(self):
self._zf.close()
def __enter__(self): return self
def __exit__(self, *_): self.close()
```
# —————————————————————————
# CLI
# —————————————————————————
def main():
import sys
if len(sys.argv) < 2:
print(“Usage: python xlsb_formula_reader.py <file.xlsb> [sheet_name]”)
sys.exit(1)
```
path = sys.argv[1]
filter_sheet = sys.argv[2] if len(sys.argv) > 2 else None
with XlsbWorkbook(path) as wb:
print(f"Sheets: {', '.join(wb.sheet_names)}\n")
found = False
for sheet_name, formulas in wb.iter_formulas():
if filter_sheet and sheet_name != filter_sheet:
continue
if not formulas:
continue
found = True
print(f"=== {sheet_name} ===")
for (row, col), formula in sorted(formulas.items()):
print(f" {col_to_letter(col)}{row + 1}: {formula}")
print()
if not found:
print("(no formulas found)")
```
if **name** == “**main**”:
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment