Skip to content

Instantly share code, notes, and snippets.

@thekid
Last active December 8, 2023 19:54
Show Gist options
  • Save thekid/dfb9955790d451111564c1f3c75f7d9c to your computer and use it in GitHub Desktop.
Save thekid/dfb9955790d451111564c1f3c75f7d9c to your computer and use it in GitHub Desktop.
Convert Office Documents to Markdown
<?php
/**
 * Container whose contents are rendered with bold emphasis.
 */
class Bold extends Container {

  /**
   * Dispatches to the emitter's `bold()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->bold($this, ...$args);
  }
}
<?php
/**
 * A table cell holding arbitrary content elements.
 */
class Cell extends Container {

  /**
   * Unwraps a cell that holds exactly one nested container.
   *
   * @return parent The single nested container, or this cell otherwise
   */
  public function fold(): parent {
    $only= 1 === count($this->contents) ? $this->contents[0] : null;
    return $only instanceof parent ? $only : $this;
  }

  /**
   * Dispatches to the emitter's `cell()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->cell($this, ...$args);
  }
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Collects custom number formats and per-style number format IDs from a
 * spreadsheet styles stream.
 *
 * @see https://support.microsoft.com/en-us/office/number-format-codes-5026bbd6-04bc-48cd-bf33-80f18b4eae68
 */
class CellStyles {
  public $cellXfs= [], $numFmts= [];

  /**
   * Reads the stream once, filling `$numFmts` (keyed by format ID) and
   * appending every `cellXfs/xf` number format ID to `$cellXfs` in
   * document order.
   *
   * @param  io.streams.InputStream $in
   * @return void
   */
  public function read(InputStream $in) {
    $stream= new XmlStreaming($in);
    for ($it= $stream->getIterator(true); $it->valid(); $it->next()) {
      $path= $it->key();

      if ('//numFmts/numFmt' === $path) {
        $parsed= $stream->value(new ValueOf([], [
          '@numFmtId'   => fn(&$self) => $self['id']= yield,
          '@formatCode' => fn(&$self) => $self['format']= yield,
        ]));
        $this->numFmts[$parsed['id']]= new NumberFormat($parsed['format']);
      } else if ('//cellXfs/xf/@numFmtId' === $path) {
        $this->cellXfs[]= $it->current();
      }

      // Everything else is ignored
    }
  }
}
<?php
/**
 * Base class for elements that hold a list of child elements.
 */
abstract class Container extends Element {
  public $contents;

  /**
   * Creates a container from a single element, a list of elements, or a
   * plain string (which is wrapped in a `Text` element).
   *
   * @param  string|array|parent $arg Initial contents, defaults to none
   */
  public function __construct(string|array|parent $arg= []) {
    if ($arg instanceof parent) {
      $this->contents= [$arg];
    } else if (is_array($arg)) {
      $this->contents= $arg;
    } else {
      $this->contents= [new Text($arg)];
    }
  }

  /**
   * Appends a child element.
   *
   * @param  parent $content
   * @return bool Always true, so the call can be chained with `&&`
   */
  public function add(parent $content) {
    $this->contents[]= $content;
    return true;
  }

  /** @return string */
  public function toString() {

    // Fully qualified: this file does not import util\Objects, so the bare
    // name `Objects` would not resolve.
    return nameof($this).'@'.\util\Objects::stringOf($this->contents);
  }
}
<?php
use io\streams\InputStream;
use util\Comparison;
use util\address\{XmlStreaming, ValueOf};
/**
 * Streams the body of a word processing document and turns it into
 * `Paragraph`, `Listing` and `Table` elements.
 */
class Document {
private $stream;
/** Wraps the given input in an `XmlStreaming` instance. */
public function __construct(InputStream $in) {
$this->stream= new XmlStreaming($in);
}
/**
 * Yields one element per top-level `<w:p>` and `<w:tbl>` inside `<w:body>`.
 * Consecutive paragraphs marked as list items are glued together and
 * yielded as a single `Listing`.
 *
 * @param  [:string] $styles Lookup indexed by the `w:pStyle` value; a missing entry yields no style
 * @param  [:string] $numbering Lookup indexed by the `w:numId` value; a missing entry falls back to 'bullet'
 * @param  [:string] $links Lookup indexed by the hyperlink's `r:id`
 * @return iterable
 */
public function sections($styles= [], $numbering= [], $links= []): iterable {
// Paragraph definition, shared by body paragraphs and paragraphs in table cells
$definition= [
'w:hyperlink' => fn($self) => $self->add(yield new ValueOf(new HyperLink(), [
'@w:anchor' => fn($self) => $self->anchor= yield,
'@r:id' => fn($self) => $self->base= $links[yield],
'w:r/w:t' => fn($self) => $self->text.= yield,
])),
'w:r' => fn($self) => $self->add(yield new ValueOf(new Run(), [
// NOTE(review): the mere presence of these elements triggers the enclosure;
// an explicit "off" value on the element would not be honoured - confirm.
'w:rPr/w:b' => fn($self) => $self->enclose(Bold::class) && yield,
'w:rPr/w:i' => fn($self) => $self->enclose(Italic::class) && yield,
'w:rPr/w:u' => fn($self) => $self->enclose(Underline::class) && yield,
'w:rPr/w:strike' => fn($self) => $self->enclose(StrikeThrough::class) && yield,
'w:t' => fn($self) => $self->text(yield),
'w:br' => fn($self) => $self->add(new LineBreak()) && yield,
'w:sym' => fn($self) => $self->add(yield new ValueOf(new Symbol(), [
'@w:char' => fn($self) => $self->char= yield,
// Lowercased so it can be used as a key into Symbol::LOOKUP
'@w:font' => fn($self) => $self->table= strtolower(yield),
])),
'mc:AlternateContent/mc:Fallback/w:t' => fn($self) => $self->text(yield),
])),
'w:pPr/w:pStyle/@w:val' => fn($self) => $self->style($styles[yield] ?? null),
'w:pPr/w:numPr/w:numId/@w:val' => fn($self) => $self->list($numbering[yield] ?? 'bullet'),
];
$it= $this->stream->getIterator(true);
while ($it->valid()) {
switch ($it->key()) {
case '//w:body/w:p':
$paragraph= $this->stream->value(new ValueOf(new Paragraph(), $definition));
// Glue together paragraphs marked as list into a single list
if ($paragraph->list) {
$list= new Listing($paragraph->list);
do {
$list->add($paragraph);
$it->next();
// NOTE(review): the next value is read as a paragraph without checking
// $it->key() or $it->valid() first; presumably a table or the end of the
// body directly after a list is consumed here - verify against XmlStreaming.
$paragraph= $this->stream->value(new ValueOf(new Paragraph(), $definition));
} while ($paragraph->list && $it->valid());
yield $list;
}
// Either a regular paragraph, or the first non-list paragraph following a list
yield $paragraph;
break;
case '//w:body/w:tbl':
// Tables consist of rows of cells, each cell holding paragraphs
yield $this->stream->value(new ValueOf(new Table(), [
'w:tr' => fn($self) => $self->add(yield new ValueOf(new Row(), [
'w:tc' => fn($self) => $self->add(yield new ValueOf(new Cell(), [
'w:p' => fn($self) => $self->add(yield new ValueOf(new Paragraph(), $definition))
])),
])),
]));
break;
default: // Ignore
}
$it->next();
}
}
}
<?php
use lang\Value;
use util\Comparison;
/**
 * Base class of everything that can be rendered by an emitter.
 */
abstract class Element implements Value {
  use Comparison;

  /**
   * Renders this element by calling the matching method on the emitter.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments for the emitter method
   * @return var
   */
  public abstract function emit($emit, ...$args);
}
<?php
use io\streams\OutputStream;
/**
 * Base class for output formats; writes to the given output stream.
 */
abstract class Emitter {

  /** @param io.streams.OutputStream $out Target all output is written to */
  public function __construct(protected OutputStream $out) { }

  /** Emits every given element, in order, without extra arguments */
  private function emitAll($elements) {
    foreach ($elements as $element) {
      $element->emit($this);
    }
  }

  /**
   * Emits all children of a container.
   *
   * @param  var $container Anything exposing a `contents` list
   * @return void
   */
  public function container($container) {
    $this->emitAll($container->contents);
  }

  /**
   * Emits everything a run produces via `all()`.
   *
   * @param  var $run
   * @return void
   */
  public function run($run) {
    $this->emitAll($run->all());
  }
}
<?php
use io\File;
use lang\IllegalArgumentException;
use util\cmd\Console;
use io\archive\zip\ZipFile from 'xp-framework/zip';
use util\address\XmlStreaming from 'xp-forge/address';
// Collectors for the workbook-level parts the sheets refer to
$rels= new Relationships();
$strings= new SharedStrings();
$styles= new CellStyles();

// Select emitter: second argument names the class, Markdown by default
$impl= $argv[2] ?? Markdown::class;
$emit= new $impl(Console::$out->stream());

$z= ZipFile::open(new File($argv[1]));
try {
  $workbook= null;
  $entries= [];

  // Read the well-known parts right away, remember streams for all others
  foreach ($z->entries() as $entry) {
    $name= $entry->getName();
    match ($name) {
      'xl/_rels/workbook.xml.rels' => $rels->read($entry->in()),
      'xl/styles.xml'              => $styles->read($entry->in()),
      'xl/sharedStrings.xml'       => $strings->read($entry->in()),
      'xl/workbook.xml'            => $workbook= new Workbook($entry->in()),
      default                      => $entries[$name]= $entry->in(),
    };
  }

  if (null === $workbook) {
    throw new IllegalArgumentException('No workbook contained in '.$argv[1]);
  }

  // Debug mode: Print relationships, styles and shared strings
  if ($argv[3] ?? null) {
    Console::writeLine($rels);
    Console::writeLine($styles);
    Console::writeLine($strings);
  }

  // One second-level header plus one table per sheet
  foreach ($workbook->sheets($rels->links) as $name => $path) {
    $title= new Header(2, $name);
    $title->emit($emit);

    $sheet= new Spreadsheet($entries["xl/{$path}"]);
    $table= new Table($sheet->records($styles, $strings));
    $table->emit($emit);
  }
} finally {
  $z->close();
}
<?php
/**
 * A heading with a numeric level.
 */
class Header extends Container {
  public $level;

  /**
   * @param  int $level Heading level, 1 being the topmost
   * @param  string|array|Element $arg Heading contents, see Container
   */
  public function __construct(int $level, string|array|Element $arg) {
    parent::__construct($arg);
    $this->level= $level;
  }

  /**
   * Dispatches to the emitter's `header()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->header($this, ...$args);
  }
}
<?php
/**
 * Emitter producing HTML markup.
 */
class Html extends Emitter {

  /** Writes the text with HTML special characters escaped */
  public function text($text) {
    $this->out->write(htmlspecialchars($text->value));
  }

  /**
   * Writes the resolved symbol. If it cannot be resolved, falls back to a
   * numeric character reference; the character code is hexadecimal (it is
   * looked up by keys such as `F021`), so the reference uses the `&#x` form.
   */
  public function symbol($symbol) {
    $this->out->write($symbol->resolve() ?? "&#x{$symbol->char};");
  }

  /** Writes an anchor with escaped target and text */
  public function link($link) {
    $this->out->write('<a href="'.htmlspecialchars($link->uri()).'">'.htmlspecialchars($link->text).'</a>');
  }

  /** Writes a line break */
  public function lineBreak($lineBreak) {
    $this->out->write("<br>\n");
  }

  /** Writes a heading; levels above 6 are capped since HTML ends at `<h6>` */
  public function header($header) {
    $l= min($header->level, 6);
    $this->out->write("<h{$l}>");
    $this->container($header);
    $this->out->write("</h{$l}>\n");
  }

  /** Wraps the contents in `<b>` */
  public function bold($bold) {
    $this->out->write('<b>');
    $this->container($bold);
    $this->out->write('</b>');
  }

  /** Wraps the contents in `<i>` */
  public function italic($italic) {
    $this->out->write('<i>');
    $this->container($italic);
    $this->out->write('</i>');
  }

  /** Wraps the contents in `<u>` */
  public function underline($underline) {
    $this->out->write('<u>');
    $this->container($underline);
    $this->out->write('</u>');
  }

  /** Wraps the contents in `<del>` */
  public function strikeThrough($strikeThrough) {
    $this->out->write('<del>');
    $this->container($strikeThrough);

    // Must be a closing tag, otherwise everything that follows is struck through
    $this->out->write('</del>');
  }

  /** Wraps the contents in `<p>` */
  public function paragraph($paragraph) {
    $this->out->write("<p>");
    $this->container($paragraph);
    $this->out->write("</p>\n");
  }

  /** Wraps the contents in `<blockquote>` */
  public function quote($quote) {
    $this->out->write("<blockquote>");
    $this->container($quote);
    $this->out->write("</blockquote>\n");
  }

  /**
   * Writes a list, using the listing's type as tag name. Paragraphs used as
   * list items are written without their surrounding `<p>`.
   */
  public function listing($listing) {
    $this->out->write("<{$listing->type}>");
    foreach ($listing->elements as $element) {
      $this->out->write('<li>');
      if ($element instanceof Paragraph) {
        $this->container($element);
      } else {
        $element->emit($this);
      }
      $this->out->write('</li>');
    }
    $this->out->write("</{$listing->type}>\n");
  }

  /**
   * Writes a table, treating the first row as header row. Nothing is
   * written for a table without rows.
   */
  public function table($table) {
    $it= $table->rows();
    if (!$it->valid()) return;

    $this->out->write("<table>\n");

    // Table header
    $it->current()->emit($this, 'th');
    $it->next();

    // Table body
    while ($it->valid()) {
      $it->current()->emit($this);
      $it->next();
    }
    $this->out->write("</table>\n");
  }

  /**
   * Writes a table row.
   *
   * @param  var $row
   * @param  string $cells Tag name used for the row's cells
   */
  public function row($row, $cells= 'td') {
    $this->out->write(" <tr>");
    foreach ($row->cells as $cell) {
      $cell->emit($this, $cells);
    }
    $this->out->write("</tr>\n");
  }

  /**
   * Writes a formatted spreadsheet value as a table cell.
   *
   * @param  var $value
   * @param  string $type Tag name to use
   */
  public function value($value, $type= 'td') {

    // format() may return null or a non-string scalar, cast before escaping
    $this->out->write("<{$type}>".htmlspecialchars((string)$value->format())."</{$type}>");
  }

  /**
   * Writes a table cell.
   *
   * @param  var $cell
   * @param  string $type Tag name to use
   */
  public function cell($cell, $type= 'td') {
    $this->out->write("<{$type}>");
    $this->container($cell->fold());
    $this->out->write("</{$type}>");
  }

  /** Wraps the contents in `<div>` */
  public function shape($shape) {
    $this->out->write("<div>");
    $this->container($shape);
    $this->out->write("</div>");
  }
}
<?php
/**
 * Container whose contents are rendered in italics.
 */
class Italic extends Container {

  /**
   * Dispatches to the emitter's `italic()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->italic($this, ...$args);
  }
}
<?php
/**
 * An explicit line break.
 */
class LineBreak extends Element {

  /**
   * Dispatches to the emitter's `lineBreak()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->lineBreak($this, ...$args);
  }

  /** @return string */
  public function toString() {
    return nameof($this);
  }
}
<?php
/**
 * A list of elements of a given type; emitters use the type as tag name
 * or to pick the list marker.
 */
class Listing extends Element {
  public $elements= [];

  /** @param string $type List type, e.g. `ul` or `ol` */
  public function __construct(public string $type) { }

  /**
   * Appends a list item.
   *
   * @param  parent $element
   * @return void
   */
  public function add(parent $element) {
    $this->elements[]= $element;
  }

  /**
   * Dispatches to the emitter's `listing()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->listing($this, ...$args);
  }

  /** @return string */
  public function toString() {
    $lines= '';
    foreach ($this->elements as $position => $item) {
      $lines.= ' '.$position.': '.$item->toString()."\n";
    }
    return nameof($this)."@[\n".$lines.']';
  }
}
<?php
/**
 * Emitter producing Markdown.
 */
class Markdown extends Emitter {

  /** Writes the text as-is */
  public function text($text) {
    $this->out->write($text->value);
  }

  /** Writes the resolved symbol, or nothing if it cannot be resolved */
  public function symbol($symbol) {
    $this->out->write($symbol->resolve() ?? '');
  }

  /** Writes an inline link */
  public function link($link) {
    $this->out->write("[{$link->text}]({$link->uri()})");
  }

  /** Writes a newline */
  public function lineBreak($lineBreak) {
    $this->out->write("\n");
  }

  /** Writes an ATX heading, using one `#` per level */
  public function header($header) {
    $this->out->write(str_repeat('#', $header->level).' ');
    $this->container($header);
    $this->out->write("\n\n");
  }

  /**
   * Wraps the contents in `**`. A single asterisk would render as
   * emphasis (italics), not as strong emphasis.
   */
  public function bold($bold) {
    $this->out->write('**');
    $this->container($bold);
    $this->out->write('**');
  }

  /** Wraps the contents in `_` */
  public function italic($italic) {
    $this->out->write('_');
    $this->container($italic);
    $this->out->write('_');
  }

  /** Writes the contents without any markup */
  public function underline($underline) {
    $this->container($underline);
  }

  /** Wraps the contents in `~~` */
  public function strikeThrough($strikeThrough) {
    $this->out->write('~~');
    $this->container($strikeThrough);
    $this->out->write('~~');
  }

  /** Writes the contents followed by a blank line */
  public function paragraph($paragraph) {
    $this->container($paragraph);
    $this->out->write("\n\n");
  }

  /**
   * Writes a block quote by temporarily routing all output through a
   * stream prefixing every line with `> `. The original stream is
   * restored even if emitting the contents fails.
   */
  public function quote($quote) {
    $out= $this->out;
    try {
      $this->out= new PrefixLines('> ', $this->out);
      foreach ($quote->contents as $content) {
        $content->emit($this);
      }
    } finally {
      $this->out= $out;
      $this->out->write("\n\n");
    }
  }

  /** Writes a list, using `1. ` as marker for `ol` lists and `* ` otherwise */
  public function listing($listing) {
    $li= 'ol' === $listing->type ? '1. ' : '* ';
    foreach ($listing->elements as $element) {
      $this->out->write($li);

      // If the list element is a paragraph, yield its contents. Otherwise
      // use the element as-is. Prevents superfluous double line breaks.
      if ($element instanceof Paragraph) {
        $this->container($element);
      } else {
        $element->emit($this);
      }
      $this->out->write("\n");
    }
    $this->out->write("\n\n");
  }

  /**
   * Writes a table, treating the first row as header row. Nothing is
   * written for a table without rows.
   */
  public function table($table) {
    $it= $table->rows();
    if (!$it->valid()) return;

    // Table header, followed by the delimiter row
    $header= $it->current();
    $this->row($header);
    $this->out->write('|'.str_repeat(' -- |', sizeof($header->cells))."\n");
    $it->next();

    // Table body
    while ($it->valid()) {
      $it->current()->emit($this);
      $it->next();
    }
    $this->out->write("\n");
  }

  /** Writes a table row, cells separated by pipes */
  public function row($row) {
    foreach ($row->cells as $cell) {
      $this->out->write('| ');
      $cell->emit($this);
      $this->out->write(' ');
    }
    $this->out->write("|\n");
  }

  /** Writes a formatted spreadsheet value */
  public function value($value) {
    $this->out->write($value->format());
  }

  /** Writes a table cell's contents */
  public function cell($cell) {
    $this->container($cell->fold());
  }

  /** Writes the contents followed by a blank line */
  public function shape($shape) {
    $this->container($shape);
    $this->out->write("\n\n");
  }
}
<?php
/**
 * Formats values according to an Excel number format code.
 *
 * The code is processed by a series of patterns, each applied to the
 * result of the previous one: fractions, numbers, escapes, quoted
 * literals, hour/minute/second combinations and finally date parts.
 *
 * NOTE(review): because of this chaining, letters produced by an earlier
 * step (e.g. the text of a quoted literal) are seen by the date patterns.
 * Format sections separated by `;` are not distinguished either.
 */
class NumberFormat {
  const MINUTES= [
    'h'  => 'G',
    'hh' => 'H',
    'm'  => 'i',
    'mm' => 'i',
    's'  => 's', // FIXME: "03" instead of "3"
    'ss' => 's',
  ];
  const DATES= [
    'yy'    => 'y',
    'yyyy'  => 'Y',
    'm'     => 'n',
    'mm'    => 'm',
    'mmm'   => 'M',
    'mmmm'  => 'F',
    'mmmmm' => 'M', // FIXME: "Jan" instead of "J"
    'd'     => 'j',
    'dd'    => 'd',
    'ddd'   => 'D',
    'dddd'  => 'l',
    'h'     => 'G',
    'hh'    => 'H',
    's'     => 's', // FIXME: "03" instead of "3"
    'ss'    => 's',
  ];

  /** @param string $code The number format code */
  public function __construct(private string $code) { }

  /**
   * Converts number to fraction, using a fixed denominator of one
   * million reduced by the greatest common divisor.
   */
  private function fraction(float $value): string {
    $n= (int)$value;
    $d= abs($value - $n);
    if ($d < 1e-6) return "{$value}/1";

    // Euclidean algorithm; `%` works on the integer parts
    $rest= $denominator= 1000000;
    $gcd= $numerator= round($d * $denominator);
    while ($rest !== 0) {
      $temp= $rest;
      $rest= $gcd % $rest;
      $gcd= $temp;
    }

    $frac= ($numerator / $gcd).'/'.($denominator / $gcd);
    if ($value <= -1 || $value >= 1) {
      return "{$n} {$frac}";
    } else if ($value < 0) {
      return "-{$frac}";
    } else {
      return $frac;
    }
  }

  /**
   * Formats an Excel date using a `gmdate()` format. Serial 25569 is
   * 1970-01-01; the timestamp is rounded to whole seconds so that no
   * fractional float is passed as integer timestamp.
   */
  private function date(string $format, float $value): string {
    return gmdate($format, (int)round(($value - 25569) * 86400));
  }

  /** Converts Excel duration: the total number of days, hours, minutes or seconds */
  private function duration(string $format, float $value): string {
    return match ($format) {
      'd'  => sprintf('%d', $value),
      'dd' => sprintf('%02d', $value),
      'h'  => sprintf('%d', $value * 24),
      'hh' => sprintf('%02d', $value * 24),
      'm'  => sprintf('%d', $value * 1440),
      'mm' => sprintf('%02d', $value * 1440),
      's'  => sprintf('%d', $value * 86400),
      'ss' => sprintf('%02d', $value * 86400),
    };
  }

  /**
   * Formats the given value.
   *
   * @param  var $value NULL yields an empty string, strings are returned as-is
   * @return string
   */
  public function format($value) {

    // TODO: String formatting. Checked first so that neither NULL nor text
    // ever reaches the date conversion below.
    if (null === $value) return '';
    if (is_string($value)) return $value;

    // Handle escape sequences, see https://bz.apache.org/ooo/show_bug.cgi?id=70003
    if ('' !== $this->code && '[' === $this->code[0]) {
      $o= strpos($this->code, ']');
      $escape= false === $o ? '' : substr($this->code, 1, $o - 1);
      switch ($escape) {
        case '$-F400': return $this->date('H:i:s', $value);
        case '$-F800': return $this->date('Y-m-d', $value);
        // Any other bracket expression is handled by the patterns below
      }
    }

    // If you use the "m" or "mm" code immediately after the "h" or "hh" code (for hours)
    // or immediately before the "ss" code (for seconds), Excel displays minutes instead
    // of the month. Handle this via the special named pattern `minutes`.
    return preg_replace_callback(
      [
        '/(?P<fraction>[\?#0]+ \?+\/\?+)/',
        '/(?P<number>[,\?#0]+(\.[\?#0]+)?)(%?)/',
        '/(?P<escape>\\\\.)/',
        '/(?P<quoted>"[^"]+")/',
        '/(?P<minutes>\[?(h{1,2})\]?([^m]+)(m{1,2})|(m{1,2})([^s]+)(s{1,2}))/', // see above
        '/(?P<date>\[?([ymdh]+)\]?|s{1,2}(\.0+)?)/'
      ],
      function($m) use($value) {
        if (isset($m['number'])) {

          // Group 2 is the empty string for patterns without decimal places
          $decimal= '' === ($m[2] ?? '') ? 0 : strlen($m[2]) - 1;

          // An explicit empty string is required to suppress the thousands
          // separator - NULL makes number_format() fall back to ","
          if ('%' === $m[3]) return number_format($value * 100, $decimal, '.', '').'%';
          return number_format($value, $decimal, '.', false === strpos($m[0], ',') ? '' : ',');
        } else if (isset($m['escape'])) {
          return $m[0][1];
        } else if (isset($m['quoted'])) {
          return substr($m[0], 1, -1);
        } else if (isset($m['minutes'])) {

          // Second alternative, minutes before seconds: groups 5 to 7
          if ('' !== ($m[5] ?? '')) {
            return $this->date(self::MINUTES[$m[5]].$m[6].self::MINUTES[$m[7]], $value);
          }

          // First alternative, hours before minutes: groups 2 to 4
          return '[' === $m[0][0]
            ? $this->duration($m[2], $value).$m[3].$this->date(self::MINUTES[$m[4]], $value)
            : $this->date(self::MINUTES[$m[2]].$m[3].self::MINUTES[$m[4]], $value)
          ;
        } else if (isset($m['date'])) {

          // Group 1 is the entire match including brackets, group 2 only the code
          return '[' === $m[0][0]
            ? $this->duration($m[2], $value)
            : $this->date(self::DATES[$m[1]], $value)
          ;
        } else if (isset($m['fraction'])) {
          return $this->fraction($value);
        } else {
          return $value;
        }
      },
      $this->code
    );
  }

  /** @return string */
  public function code() { return "`{$this->code}`"; }
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Maps numbering IDs to the number format of their first level.
 */
class Numbering {
  public $lookup= [];

  /**
   * Reads abstract and concrete numbering definitions from the stream
   * and fills `$lookup`.
   *
   * @param  io.streams.InputStream $in
   * @return void
   */
  public function read(InputStream $in) {
    $stream= new XmlStreaming($in);

    // Numbering formats are specified in abstract and concrete sections
    // Parse these into lookups maps first...
    $levels= $numbers= [];
    foreach ($stream->pointers() as $path => $node) {
      if ('//w:abstractNum' === $path) {
        $definition= $node->value(new ValueOf(['levels' => []], [
          '@w:abstractNumId' => fn(&$self) => $self['id']= yield,
          'w:lvl'            => fn(&$self) => $self['levels'][]= yield new ValueOf([], [
            '@w:ilvl'         => fn(&$self) => $self['level']= yield,
            'w:numFmt/@w:val' => fn(&$self) => $self['format']= yield,
          ])
        ]));
        $levels[$definition['id']]= $definition['levels'];
      } else if ('//w:num' === $path) {
        $concrete= $node->value(new ValueOf([], [
          '@w:numId'               => fn(&$self) => $self['id']= yield,
          'w:abstractNumId/@w:val' => fn(&$self) => $self['ref']= yield,
        ]));
        $numbers[$concrete['id']]= $concrete['ref'];
      }
    }

    // ...and merge them later, using only the level 0 formatting
    foreach ($numbers as $id => $ref) {
      foreach ($levels[$ref] as $candidate) {
        if ('0' !== $candidate['level']) continue;

        $this->lookup[$id]= $candidate['format'];
        break;
      }
    }
  }
}
<?php
/**
 * A paragraph, optionally marked as list item and / or carrying a style.
 */
class Paragraph extends Container {
  const LISTS= ['bullet' => 'ul', 'decimal' => 'ol'];

  public $list= null;
  public $style= null;

  /**
   * Appends an element, merging it into the preceding run if possible.
   *
   * @param  Element $content
   * @return bool
   */
  public function add(Element $content) {

    // Merge runs, optimizing [Run('Test'), Run('s')] => Run('Tests')
    if ($this->contents && $last= $this->contents[sizeof($this->contents) - 1]) {
      if ($content instanceof Run && $last instanceof Run && $last->merge($content)) return true;
    }
    return parent::add($content);
  }

  /**
   * Marks this paragraph as list item.
   *
   * @param  string $type Numbering format; anything not in LISTS becomes `ul`
   * @return void
   */
  public function list($type) {
    $this->list= self::LISTS[$type] ?? 'ul';
  }

  /**
   * Sets the style.
   *
   * @param  ?string $id
   * @return void
   */
  public function style($id) {
    $this->style= $id;
  }

  /**
   * Emits this paragraph, converting styled paragraphs to headers and quotes.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {

    // Convert titles to <h1>, all other headings to <h[n]> where n = level + 1
    // Convert paragraph with "Quote" style to <blockquote> elements.
    // See https://stackoverflow.com/questions/51670198/read-word-document-and-get-text-for-each-heading
    if ($this->style) {
      if (0 === strncasecmp($this->style, 'Title', 5)) {
        return (new Header(1, $this->contents))->emit($emit, ...$args);
      } else if (0 === strncasecmp($this->style, 'Quote', 5)) {
        return (new Quote($this->contents))->emit($emit, ...$args);
      } else if (1 === sscanf($this->style, '%*[hH]eading %d', $level)) {

        // Compared against exactly one assigned value: sscanf() signals
        // input ending before the level with a non-zero error value, which
        // a mere truthiness check would mistake for a match.
        return (new Header($level + 1, $this->contents))->emit($emit, ...$args);
      }
    }

    // All other paragraphs are just rendered as <p>...</p>
    return $emit->paragraph($this, ...$args);
  }

  /** @return string */
  public function toString() {
    $s= nameof($this);

    // Show optional attributes if set
    $a= '';
    null === $this->list || $a.= ", list: {$this->list}";
    null === $this->style || $a.= ", style: {$this->style}";
    if ($a) $s.= '('.substr($a, 2).')';

    // Show child nodes w/ special-case handling for an empty list
    if (empty($this->contents)) return $s.'@[]';

    $s.= '@[';
    foreach ($this->contents as $element) {
      $s.= "\n ".str_replace("\n", "\n ", $element->toString());
    }
    return $s."\n]";
  }
}
<?php
use io\File;
use lang\IllegalArgumentException;
use util\cmd\Console;
use io\archive\zip\ZipFile from 'xp-framework/zip';
use util\address\XmlStreaming from 'xp-forge/address';
// Collector for the relationships the slide list refers to
$rels= new Relationships();

// Select emitter: second argument names the class, Markdown by default
$impl= $argv[2] ?? Markdown::class;
$emit= new $impl(Console::$out->stream());

$z= ZipFile::open(new File($argv[1]));
try {
  $presentation= null;
  $entries= [];

  // Read the well-known parts right away, remember streams for all others
  foreach ($z->entries() as $entry) {
    $name= $entry->getName();
    match ($name) {
      'ppt/_rels/presentation.xml.rels' => $rels->read($entry->in()),
      'ppt/presentation.xml'            => $presentation= new Presentation($entry->in()),
      default                           => $entries[$name]= $entry->in(),
    };
  }

  if (null === $presentation) {
    throw new IllegalArgumentException('No presentation contained in '.$argv[1]);
  }

  // Debug mode: Print relationships
  if ($argv[3] ?? null) {
    Console::writeLine($rels);
  }

  // One numbered second-level header per slide, followed by its shapes
  $number= 1;
  foreach ($presentation->slides($rels->links) as $id => $path) {
    $title= new Header(2, 'Slide #'.$number);
    $title->emit($emit);
    $number++;

    $slide= new Slide($entries["ppt/{$path}"]);
    foreach ($slide->shapes() as $shape) {
      $shape->emit($emit);
    }
  }
} finally {
  $z->close();
}
<?php
use io\streams\OutputStream;
/**
 * Output stream decorator starting every line with a given text.
 */
class PrefixLines implements OutputStream {

  /**
   * Writes the prefix for the first line right away.
   *
   * @param  string $text The prefix
   * @param  io.streams.OutputStream $out The stream to write to
   */
  public function __construct(private string $text, private OutputStream $out) {
    $out->write($text);
  }

  /** Writes the bytes, adding the prefix after every newline */
  public function write($bytes) {
    $prefixed= str_replace("\n", "\n".$this->text, $bytes);
    $this->out->write($prefixed);
  }

  /** Flushes the underlying stream */
  public function flush() {
    $this->out->flush();
  }

  /** Closes the underlying stream */
  public function close() {
    $this->out->close();
  }
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Reads the slide list of a presentation.
 */
class Presentation {
  private $stream;

  /** @param io.streams.InputStream $in */
  public function __construct(InputStream $in) {
    $this->stream= new XmlStreaming($in);
  }

  /**
   * Yields slide IDs mapped to the target their relationship ID points to.
   *
   * @param  [:string] $rels Relationship targets indexed by relationship ID
   * @return iterable
   */
  public function slides($rels) {
    foreach ($this->stream->pointers('//p:sldIdLst/p:sldId') as $node) {
      $slide= $node->value(new ValueOf([], [
        '@id'   => fn(&$self) => $self['id']= yield,
        '@r:id' => fn(&$self) => $self['path']= $rels[yield],
      ]));

      yield $slide['id'] => $slide['path'];
    }
  }
}
<?php
/**
 * Container whose contents are rendered as a block quote.
 */
class Quote extends Container {

  /**
   * Dispatches to the emitter's `quote()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->quote($this, ...$args);
  }
}
<?php
/**
 * A spreadsheet row. Cells are stored by their one-based column number.
 */
class Record extends Element {
  public $cells= [];

  /**
   * Returns a single cell by ID (e.g. `B7`) or one-based column number,
   * or NULL if there is no such cell.
   */
  public function cell(int|string $arg): ?Value {
    return $this->cells[is_int($arg) ? $arg : $this->index($arg)] ?? null;
  }

  /**
   * Converts the column letters of a cell ID to a one-based column
   * number, e.g. A1 => 1, Z1 => 26, AA1 => 27. Column letters form a
   * base 26 number, so every further letter shifts the previous ones.
   */
  private function index($id) {
    $c= 0;

    // Digits sort before "A", which ends the loop at the row number
    for ($i= 0, $s= strlen($id); $i < $s && $id[$i] >= 'A'; $i++) {
      $c= $c * 26 + ord($id[$i]) - 64;
    }
    return $c;
  }

  /**
   * Adds a cell, filling in any gap before it with empty cells.
   *
   * @param  string $id Cell ID, e.g. `B7`
   * @param  Value $cell
   * @return void
   */
  public function add(string $id, Value $cell) {
    $index= $this->index($id);

    // Find where the gap directly before this cell starts...
    $start= $index;
    while ($start > 1 && !isset($this->cells[$start - 1])) {
      $start--;
    }

    // ...and fill it in ascending order, keeping the cells in column order
    for ($i= $start; $i < $index; $i++) {
      $this->cells[$i]= new Value(null);
    }
    $this->cells[$index]= $cell;
  }

  /**
   * Dispatches to the emitter's `row()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->row($this, ...$args);
  }

  /** @return string */
  public function toString() {
    $s= nameof($this)."@[\n";
    foreach ($this->cells as $i => $cell) {
      $s.= " $i: ".$cell->toString()."\n";
    }
    return $s."]";
  }
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Maps relationship IDs to their targets.
 */
class Relationships {
  public $links= [];

  /**
   * Reads all relationships from the stream into `$links`.
   *
   * @param  io.streams.InputStream $in
   * @return void
   */
  public function read(InputStream $in) {
    $stream= new XmlStreaming($in);

    // Collect every attribute of a node, indexed by attribute name
    $attributes= ['@*' => fn(&$self, $name) => $self[$name]= yield];

    // Stream nodes of the kind `<Relationship Id="rId4" Target="https://example.com/"/>`
    foreach ($stream->pointers('//Relationship') as $node) {
      $relationship= $node->value(new ValueOf([], $attributes));
      $this->links[$relationship['Id']]= $relationship['Target'];
    }
  }
}
<?php
/**
 * A table row consisting of cells.
 */
class Row extends Element {
  public $cells= [];

  /**
   * Appends a cell.
   *
   * @param  Cell $cell
   * @return void
   */
  public function add(Cell $cell) {
    $this->cells[]= $cell;
  }

  /**
   * Dispatches to the emitter's `row()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->row($this, ...$args);
  }

  /** @return string */
  public function toString() {
    $lines= '';
    foreach ($this->cells as $position => $cell) {
      $lines.= ' '.$position.': '.$cell->toString()."\n";
    }
    return nameof($this)."@[\n".$lines.']';
  }
}
<?php
/**
 * A run of content sharing the same formatting. The formatting is
 * expressed as the name of a container class the contents are wrapped in.
 */
class Run extends Container {
  private $enclose= null;

  /**
   * Sets the class this run's contents are wrapped in.
   *
   * @param  string $class
   * @return bool Always true, so the call can be chained with `&&`
   */
  public function enclose($class) {
    $this->enclose= $class;
    return true;
  }

  /**
   * Appends a text element.
   *
   * @param  string $value
   * @return bool Always true
   */
  public function text($value) {
    $this->contents[]= new Text($value);
    return true;
  }

  /**
   * Merges another run into this one.
   *
   * @param  self $run
   * @return bool False if the two runs are enclosed differently
   */
  public function merge(self $run) {
    if ($run->enclose !== $this->enclose) return false;

    // Nothing to append to, simply take over the other run's contents
    if (!$this->contents) {
      $this->contents= $run->contents;
      return true;
    }

    // Leading texts are concatenated onto our trailing text. As soon as
    // anything else has been appended, the rest is appended as well.
    $tail= $this->contents[count($this->contents) - 1];
    $joining= $tail instanceof Text;
    foreach ($run->contents as $element) {
      if ($joining && $element instanceof Text) {
        $tail->value.= $element->value;
        continue;
      }

      $this->contents[]= $element;
      $joining= false;
    }
    return true;
  }

  /**
   * Returns what to emit: the contents as-is, or a single instance of
   * the enclosing class wrapping them.
   *
   * @return Element[]
   */
  public function all() {
    if (null !== $this->enclose) {
      return [new ($this->enclose)($this->contents)];
    }
    return $this->contents;
  }

  /**
   * Dispatches to the emitter's `run()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->run($this, ...$args);
  }

  /** @return string */
  public function toString() {
    $name= nameof($this);
    if (null !== $this->enclose) {
      $name.= '(enclose: '.strtr($this->enclose, '\\', '.').')';
    }

    $parts= [];
    foreach ($this->contents as $content) {
      $parts[]= $content->toString();
    }
    return $name.'@['.implode(', ', $parts).']';
  }
}
<?php
/**
 * Container for the contents of a shape on a slide.
 */
class Shape extends Container {

  /**
   * Dispatches to the emitter's `shape()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->shape($this, ...$args);
  }
}
<?php
use io\streams\InputStream;
use lang\Value;
use util\address\{XmlStreaming, ValueOf};
use util\Comparison;
/**
 * The list of shared strings; cells refer to them by position.
 */
class SharedStrings implements Value {
  use Comparison;

  public $lookup= [];

  /**
   * Appends one string per `si` node to `$lookup`. A plain `t` child
   * sets the string, `r/t` children are concatenated.
   *
   * @param  io.streams.InputStream $in
   * @return void
   */
  public function read(InputStream $in) {
    $stream= new XmlStreaming($in);
    $definition= [
      't'   => fn(&$self) => $self= yield,
      'r/t' => fn(&$self) => $self.= yield,
    ];

    foreach ($stream->pointers('//si') as $node) {
      $this->lookup[]= $node->value(new ValueOf('', $definition));
    }
  }

  /** @return string */
  public function toString() {
    $lines= '';
    foreach ($this->lookup as $position => $text) {

      // Control characters are shown in escaped form
      $lines.= sprintf(" [%04d] `%s`\n", $position, addcslashes($text, "\0..\37"));
    }
    return nameof($this)."@[\n".$lines.']';
  }
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Reads the shapes of a single slide.
 */
class Slide {
  private $stream;

  /** @param io.streams.InputStream $in */
  public function __construct(InputStream $in) {
    $this->stream= new XmlStreaming($in);
  }

  /**
   * Yields one `Shape` per shape in the slide's shape tree, holding the
   * text runs of its paragraphs.
   *
   * NOTE(review): a run remembers a single enclosing class only, so for
   * runs carrying several of these attributes the last one read wins.
   *
   * @return iterable
   */
  public function shapes() {

    // The definitions are stateless and can be shared between all shapes
    $run= [
      'a:rPr/@b'      => fn($self) => '1' === yield && $self->enclose(Bold::class),
      'a:rPr/@i'      => fn($self) => '1' === yield && $self->enclose(Italic::class),
      'a:rPr/@u'      => fn($self) => 'none' !== yield && $self->enclose(Underline::class),
      'a:rPr/@strike' => fn($self) => 'noStrike' !== yield && $self->enclose(StrikeThrough::class),
      'a:t'           => fn($self) => $self->text(yield),
    ];
    $shape= [
      'p:txBody/a:p/a:r' => fn($self) => $self->add(yield new ValueOf(new Run(), $run)),
    ];

    foreach ($this->stream->pointers('//p:cSld/p:spTree/p:sp') as $node) {
      yield $node->value(new ValueOf(new Shape(), $shape));
    }
  }
}
<?php
use io\streams\InputStream;
use lang\MethodNotImplementedException;
use util\address\{XmlStreaming, ValueOf};
use util\Objects;
/**
 * Reads the rows of a single worksheet.
 */
class Spreadsheet {
  private static $builtin= [];
  private $stream;

  static function __static() {

    // See https://github.com/ClosedXML/ClosedXML/wiki/NumberFormatId-Lookup-Table
    self::$builtin= [
      0x01 => new NumberFormat('0'),
      0x02 => new NumberFormat('0.00'),
      0x03 => new NumberFormat('#,##0'),
      0x04 => new NumberFormat('#,##0.00'),
      0x09 => new NumberFormat('0%'),
      0x0a => new NumberFormat('0.00%'),
      0x0b => new NumberFormat('0.00E+00'),
      0x0c => new NumberFormat('# ?/?'),
      0x0d => new NumberFormat('# ??/??'),
      0x0e => new NumberFormat('d.m.yyyy'),
      0x0f => new NumberFormat('d-mmm-yy'),
      0x10 => new NumberFormat('d-mmm'),
      0x11 => new NumberFormat('mmm-yy'),
      0x12 => new NumberFormat('h:mm tt'),
      0x13 => new NumberFormat('h:mm:ss tt'),
      0x14 => new NumberFormat('H:mm'),
      0x15 => new NumberFormat('H:mm:ss'),
      0x16 => new NumberFormat('m/d/yyyy H:mm'),
      0x25 => new NumberFormat('#,##0 ;(#,##0)'),
      0x26 => new NumberFormat('#,##0 ;[Red](#,##0)'),
      0x27 => new NumberFormat('#,##0.00;(#,##0.00)'),
      0x28 => new NumberFormat('#,##0.00;[Red](#,##0.00)'),
      0x2d => new NumberFormat('mm:ss'),
      0x2e => new NumberFormat('[h]:mm:ss'),
      0x2f => new NumberFormat('mmss.0'),
      0x30 => new NumberFormat('##0.0E+0'),
    ];
  }

  /** @param io.streams.InputStream $in */
  public function __construct(InputStream $in) {
    $this->stream= new XmlStreaming($in);
  }

  /**
   * Yields one `Record` per row in the sheet data.
   *
   * @param  CellStyles $styles Used to look up each cell's number format
   * @param  SharedStrings $strings Used to resolve cells of type `s`
   * @return iterable
   * @throws lang.MethodNotImplementedException for cell types not handled
   */
  public function records(CellStyles $styles, SharedStrings $strings) {
    $it= $this->stream->getIterator(true);
    while ($it->valid()) {
      if ('//sheetData/row' === $it->key()) {
        yield $this->stream->value(new ValueOf(new Record(), [
          'c' => function($self) use($styles, $strings) {
            $cell= yield new ValueOf(['type' => null, 'style' => null], [
              '@t' => fn(&$self) => $self['type']= yield,
              '@s' => fn(&$self) => $self['style']= yield,
              '@r' => fn(&$self) => $self['id']= yield,
              'v'  => fn(&$self) => $self['value']= yield,
            ]);

            // Convert to given type. Numbers are the default, but may also be
            // declared explicitly using "n"; errors ("e") carry their display
            // text such as `#DIV/0!` as value.
            $value= match ($cell['type']) {
              null, 'n'  => (float)($cell['value'] ?? 0.0),
              's'        => $strings->lookup[$cell['value']],
              'b'        => '1' === $cell['value'],
              'e', 'str' => $cell['value'],
              default    => throw new MethodNotImplementedException('Unhandled type', Objects::stringOf($cell)),
            };

            // Format: the style indexes the cell formats, which hold the ID of
            // a builtin or custom number format. ID "0" is falsy and thus
            // yields no format.
            if ($xf= $styles->cellXfs[$cell['style']] ?? null) {
              $format= self::$builtin[$xf] ?? $styles->numFmts[$xf] ?? null;
            } else {
              $format= null;
            }

            $self->add($cell['id'], new Value($value, $format));
          }
        ]));
      }
      $it->next();
    }
  }
}
<?php
/**
 * Container whose contents are rendered struck through.
 */
class StrikeThrough extends Container {

  /**
   * Dispatches to the emitter's `strikeThrough()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->strikeThrough($this, ...$args);
  }
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Maps style IDs to style names.
 */
class Styles {
  public $lookup= [];

  /**
   * Reads all styles from the stream into `$lookup`.
   *
   * @param  io.streams.InputStream $in
   * @return void
   */
  public function read(InputStream $in) {
    $stream= new XmlStreaming($in);
    $definition= [
      '@w:styleId'    => fn(&$self) => $self['id']= yield,
      'w:name/@w:val' => fn(&$self) => $self['name']= yield,
    ];

    foreach ($stream->pointers('//w:style') as $node) {
      $parsed= $node->value(new ValueOf([], $definition));
      $this->lookup[$parsed['id']]= $parsed['name'];
    }
  }
}
<?php
// See https://www.alanwood.net/demos/wingdings.html
class Symbol extends Element {
const LOOKUP= [
'wingdings' => [
'F020' => '',
'F021' => '🖉',
'F022' => '✂',
'F023' => '✁',
'F024' => '👓',
'F025' => '🕭',
'F026' => '🕮',
'F027' => '🕯',
'F028' => '🕿',
'F029' => '✆',
'F02A' => '🖂',
'F02B' => '🖃',
'F02C' => '📪',
'F02D' => '📫',
'F02E' => '📬',
'F02F' => '📭',
'F030' => '📁',
'F031' => '📂',
'F032' => '📄',
'F033' => '🗏',
'F034' => '🗐',
'F035' => '🗄',
'F036' => '⌛',
'F037' => '🖮',
'F038' => '🖰',
'F039' => '🖲',
'F03A' => '🖳',
'F03B' => '🖴',
'F03C' => '🖫',
'F03D' => '🖬',
'F03E' => '✇',
'F03F' => '✍',
'F040' => '🖎',
'F041' => '✌',
'F042' => '👌',
'F043' => '👍',
'F044' => '👎',
'F045' => '☜',
'F046' => '☞',
'F047' => '☝',
'F048' => '☟',
'F049' => '🖐',
'F04A' => '☺',
'F04B' => '😐',
'F04C' => '☹',
'F04D' => '💣',
'F04E' => '☠',
'F04F' => '🏳',
'F050' => '🏱',
'F051' => '✈',
'F052' => '☼',
'F053' => '💧',
'F054' => '❄',
'F055' => '🕆',
'F056' => '✞',
'F057' => '🕈',
'F058' => '✠',
'F059' => '✡',
'F05A' => '☪',
'F05B' => '☯',
'F05C' => 'ॐ',
'F05D' => '☸',
'F05E' => '♈',
'F05F' => '♉',
'F060' => '♊',
'F061' => '♋',
'F062' => '♌',
'F063' => '♍',
'F064' => '♎',
'F065' => '♏',
'F066' => '♐',
'F067' => '♑',
'F068' => '♒',
'F069' => '♓',
'F06A' => '🙰',
'F06B' => '🙵',
'F06C' => '●',
'F06D' => '🔾',
'F06E' => '■',
'F06F' => '□',
'F070' => '🞐',
'F071' => '❑',
'F072' => '❒',
'F073' => '⬧',
'F074' => '⧫',
'F075' => '◆',
'F076' => '❖',
'F077' => '⬥',
'F078' => '⌧',
'F079' => '⮹',
'F07A' => '⌘',
'F07B' => '🏵',
'F07C' => '🏶',
'F07D' => '🙶',
'F07E' => '🙷',
'F080' => '⓪',
'F081' => '①',
'F082' => '②',
'F083' => '③',
'F084' => '④',
'F085' => '⑤',
'F086' => '⑥',
'F087' => '⑦',
'F088' => '⑧',
'F089' => '⑨',
'F08A' => '⑩',
'F08B' => '⓿',
'F08C' => '❶',
'F08D' => '❷',
'F08E' => '❸',
'F08F' => '❹',
'F090' => '❺',
'F091' => '❻',
'F092' => '❼',
'F093' => '❽',
'F094' => '❾',
'F095' => '❿',
'F096' => '🙢',
'F097' => '🙠',
'F098' => '🙡',
'F099' => '🙣',
'F09A' => '🙞',
'F09B' => '🙜',
'F09C' => '🙝',
'F09D' => '🙟',
'F09E' => '·',
'F09F' => '•',
'F0A0' => '▪',
'F0A1' => '⚪',
'F0A2' => '🞆',
'F0A3' => '🞈',
'F0A4' => '◉',
'F0A5' => '◎',
'F0A6' => '🔿',
'F0A7' => '▪',
'F0A8' => '◻',
'F0A9' => '🟂',
'F0AA' => '✦',
'F0AB' => '★',
'F0AC' => '✶',
'F0AD' => '✴',
'F0AE' => '✹',
'F0AF' => '✵',
'F0B0' => '⯐',
'F0B1' => '⌖',
'F0B2' => '⟡',
'F0B3' => '⌑',
'F0B4' => '⯑',
'F0B5' => '✪',
'F0B6' => '✰',
'F0B7' => '🕐',
'F0B8' => '🕑',
'F0B9' => '🕒',
'F0BA' => '🕓',
'F0BB' => '🕔',
'F0BC' => '🕕',
'F0BD' => '🕖',
'F0BE' => '🕗',
'F0BF' => '🕘',
'F0C0' => '🕙',
'F0C1' => '🕚',
'F0C2' => '🕛',
'F0C3' => '⮰',
'F0C4' => '⮱',
'F0C5' => '⮲',
'F0C6' => '⮳',
'F0C7' => '⮴',
'F0C8' => '⮵',
'F0C9' => '⮶',
'F0CA' => '⮷',
'F0CB' => '🙪',
'F0CC' => '🙫',
'F0CD' => '🙕',
'F0CE' => '🙔',
'F0CF' => '🙗',
'F0D0' => '🙖',
'F0D1' => '🙐',
'F0D2' => '🙑',
'F0D3' => '🙒',
'F0D4' => '🙓',
'F0D5' => '⌫',
'F0D6' => '⌦',
'F0D7' => '⮘',
'F0D8' => '⮚',
'F0D9' => '⮙',
'F0DA' => '⮛',
'F0DB' => '⮈',
'F0DC' => '⮊',
'F0DD' => '⮉',
'F0DE' => '⮋',
'F0DF' => '🡨',
'F0E0' => '🡪',
'F0E1' => '🡩',
'F0E2' => '🡫',
'F0E3' => '🡬',
'F0E4' => '🡭',
'F0E5' => '🡯',
'F0E6' => '🡮',
'F0E7' => '🡸',
'F0E8' => '🡺',
'F0E9' => '🡹',
'F0EA' => '🡻',
'F0EB' => '🡼',
'F0EC' => '🡽',
'F0ED' => '🡿',
'F0EE' => '🡾',
'F0EF' => '⇦',
'F0F0' => '⇨',
'F0F1' => '⇧',
'F0F2' => '⇩',
'F0F3' => '⬄',
'F0F4' => '⇳',
'F0F5' => '⬀',
'F0F6' => '⬁',
'F0F7' => '⬃',
'F0F8' => '⬂',
'F0F9' => '🢬',
'F0FA' => '🢭',
'F0FB' => '🗶',
'F0FC' => '✔',
'F0FD' => '🗷',
'F0FE' => '🗹',
]
];
public $char, $table;
/**
 * Dispatches to the emitter's `symbol()` handler.
 *
 * @param  var $emit The emitter to render with
 * @param  var... $args Extra arguments, passed through unchanged
 * @return var Whatever the emitter's handler returns
 */
public function emit($emit, ...$args) {
  return $emit->symbol($this, ...$args);
}
/**
 * Looks up the replacement for this symbol's character in its table.
 *
 * @return ?string NULL if either the table or the character is unknown
 */
public function resolve() {
  $glyphs= self::LOOKUP[$this->table] ?? [];
  return $glyphs[$this->char] ?? null;
}
/** @return string */
public function toString() {
  return nameof($this).'(char: '.$this->char.', table: '.$this->table.')';
}
}
<?php
/**
 * A table whose rows come from any number of iterables.
 */
class Table extends Element {
  private $iterable;

  /**
   * @param  ?iterable $iterable Initial source of rows, if any
   */
  public function __construct(?iterable $iterable= null) {
    $this->iterable= null === $iterable ? [] : [$iterable];
  }

  /**
   * Adds a further source of rows.
   *
   * @param  iterable $iterable
   * @return self This table
   */
  public function stream($iterable) {
    $this->iterable[]= $iterable;
    return $this;
  }

  /**
   * Adds a single row.
   *
   * @param  Row $row
   * @return void
   */
  public function add(Row $row) {
    $this->iterable[]= [$row];
  }

  /**
   * Yields the rows of all sources, in the order the sources were added.
   *
   * @return iterable
   */
  public function rows() {
    foreach ($this->iterable as $it) {
      yield from $it;
    }
  }

  /**
   * Dispatches to the emitter's `table()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->table($this, ...$args);
  }

  /**
   * NOTE(review): this iterates the row sources; presumably a source that
   * can only be traversed once (e.g. a generator) is used up afterwards.
   *
   * @return string
   */
  public function toString() {
    $s= nameof($this).'@[';

    // Rows are only available via rows(), there is no property of that name
    foreach ($this->rows() as $row) {
      $s.= "\n ".str_replace("\n", "\n ", $row->toString());
    }
    return $s."\n]";
  }
}
<?php
/**
 * A piece of plain text.
 */
class Text extends Element {

  /** @param var $value The text */
  public function __construct(public $value) { }

  /**
   * Dispatches to the emitter's `text()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->text($this, ...$args);
  }

  /** @return string */
  public function toString() {
    return nameof($this).'("'.$this->value.'")';
  }
}
<?php
/**
 * Container whose contents are rendered underlined.
 */
class Underline extends Container {

  /**
   * Dispatches to the emitter's `underline()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->underline($this, ...$args);
  }
}
<?php
use util\Objects;
/**
 * A spreadsheet cell value with an optional number format.
 */
class Value extends Element {
  private $value, $numbers;

  /**
   * @param  var $value The raw value
   * @param  NumberFormat $numbers The format to apply, if any
   */
  public function __construct($value, NumberFormat $numbers= null) {
    $this->numbers= $numbers;
    $this->value= $value;
  }

  /**
   * Returns the value with the number format applied, or the raw value
   * if there is no format.
   *
   * @return var
   */
  public function format() {
    if ($this->numbers) {
      return $this->numbers->format($this->value);
    }
    return $this->value;
  }

  /**
   * Dispatches to the emitter's `value()` handler.
   *
   * @param  var $emit The emitter to render with
   * @param  var... $args Extra arguments, passed through unchanged
   * @return var Whatever the emitter's handler returns
   */
  public function emit($emit, ...$args) {
    return $emit->value($this, ...$args);
  }

  /** @return string */
  public function toString() {
    $format= $this->numbers ? " -> {$this->numbers->code()}" : '';
    return sprintf('%s(%s%s)', nameof($this), Objects::stringOf($this->value), $format);
  }
}
<?php
use io\File;
use lang\IllegalArgumentException;
use util\cmd\Console;
use io\archive\zip\ZipFile from 'xp-framework/zip';
use util\address\XmlStreaming from 'xp-forge/address';
// Select emitter: second argument names the class, Markdown by default
$impl= $argv[2] ?? Markdown::class;
$emit= new $impl(Console::$out->stream());

// Collectors for the parts the document refers to
$styles= new Styles();
$numbering= new Numbering();
$rels= new Relationships();

$z= ZipFile::open(new File($argv[1]));
try {
  $doc= null;

  // Read the well-known parts, ignore everything else
  foreach ($z->entries() as $entry) {
    match ($entry->getName()) {
      'word/styles.xml'              => $styles->read($entry->in()),
      'word/numbering.xml'           => $numbering->read($entry->in()),
      'word/_rels/document.xml.rels' => $rels->read($entry->in()),
      'word/document.xml'            => $doc= new Document($entry->in()),
      default                        => null,
    };
  }

  if (null === $doc) {
    throw new IllegalArgumentException('No document contained in '.$argv[1]);
  }

  // Debug mode: Print relationships and styles
  if ($argv[3] ?? null) {
    Console::writeLine($rels);
    Console::writeLine($styles);
  }

  $sections= $doc->sections($styles->lookup, $numbering->lookup, $rels->links);
  foreach ($sections as $section) {
    $section->emit($emit);
  }
} finally {
  $z->close();
}
<?php
use io\streams\InputStream;
use util\address\{XmlStreaming, ValueOf};
/**
 * Reads the sheet list of a workbook.
 */
class Workbook {
  private $stream;

  /** @param io.streams.InputStream $in */
  public function __construct(InputStream $in) {
    $this->stream= new XmlStreaming($in);
  }

  /**
   * Yields sheet names mapped to the target their relationship ID points to.
   *
   * @param  [:string] $rels Relationship targets indexed by relationship ID
   * @return iterable
   */
  public function sheets($rels) {
    $definition= [
      '@name' => fn(&$self) => $self['name']= yield,
      '@r:id' => fn(&$self) => $self['path']= $rels[yield],
    ];

    foreach ($this->stream->pointers('//sheets/sheet') as $node) {
      $parsed= $node->value(new ValueOf([], $definition));
      yield $parsed['name'] => $parsed['path'];
    }
  }
}
@thekid
Copy link
Author

thekid commented Nov 5, 2023

Convert Word to HTML:

$ xp word.script.php Document.docx Html > document.html
# ...

Convert Excel to HTML:

$ xp excel.script.php Spreadsheet.xlsx Html > spreadsheet.html
# ...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment