bpj · June 9, 2017 12:55
diff --git a/pandoc-class2style.pl b/pandoc-class2style.pl
 #!/usr/bin/env perl

 # pandoc-class2style.pl - filter to translate single span/div classes into LaTeX commands and attribute lists

 # POD documentation below the code!

 use utf8;
 use autodie 2.29;
 use 5.010001;
 use strict;
 use warnings;
 use warnings qw(FATAL utf8);

 use Carp qw[ carp croak ];

 use Pandoc::Elements 0.33;
 use Pandoc::Walker 0.27 qw[ action transform  ];

 # Coerce a value into a hashref.
 #
 #   _hashify VALUE, DEFAULT, %opt
 #
 # An undefined VALUE becomes an empty hash.  A VALUE which is not a plain
 # hashref becomes the single-pair hash { DEFAULT => VALUE }, or
 # { VALUE => DEFAULT } when $opt{key} is true.  A shallow copy is returned,
 # unless $opt{clone} is passed with a false value, in which case the
 # hashref itself is returned.
 sub _hashify ($$;%) {
    my ( $value, $default, %opt ) = @_;
    $value = {} unless defined $value;
    if ( 'HASH' ne ref( $value ) ) {
        my ( $k, $v ) = $opt{key} ? ( $value, $default ) : ( $default, $value );
        $value = { $k => $v };
    }
    ## cloning is on by default; only an explicit false 'clone' turns it off
    my $clone = exists $opt{clone} ? $opt{clone} : 1;
    return $clone ? +{%$value} : $value;
 }

 # Coerce a value into an arrayref.
 #
 #   _listify VALUE, %opt
 #
 # An undefined VALUE becomes an empty list and anything which is not a
 # plain arrayref becomes a one-element list.  A shallow copy is returned,
 # unless $opt{clone} is passed with a false value, in which case the
 # arrayref itself is returned.
 sub _listify ($;%) {
    my ( $value, %opt ) = @_;
    my $list
      = !defined( $value )         ? []
      : 'ARRAY' eq ref( $value )   ? $value
      :                              [$value];
    ## cloning is on by default; only an explicit false 'clone' turns it off
    my $clone = exists $opt{clone} ? $opt{clone} : 1;
    return $clone ? [@$list] : $list;
 }

 # These formats require HTML-style output.  Add more as needed!
 my @html_formats = qw[ html html5 epub ];

 # The output format name is taken from the first command-line argument.
 # Default to the empty string so that the string comparisons further down
 # do not warn when the filter is run without a format argument.
 my $out_format = shift( @ARGV ) // '';
 # Slurp the whole input rather than only its first line, so that a JSON
 # AST which is spread over several lines is read in full.
 my $json = do { local $/; <> };
 my $doc  = pandoc_json( $json );

 my $meta = $doc->meta;  # document metadata
 ## $meta->value('foo') returns the metadata field 'foo' as a plain Perl data structure

 # HTML-style output is selected when the output format is one of
 # @html_formats, or when the metadata field 'class2style_html' is true
 my $to_html = ( grep { $out_format eq $_ } @html_formats )[0];
 $to_html = $meta->value( 'class2style_html' ) unless $to_html;

 # DOCX-style output is selected when the output format is docx,
 # or when the metadata field 'class2style_docx' is true
 my $to_docx = 'docx' eq $out_format;
 $to_docx ||= $meta->value( 'class2style_docx' );
 $to_docx //= 0;

 # LaTeX-style output is selected only by the latex output format
 my $to_latex = $out_format eq 'latex';

 # existing classes are kept if the metadata field 'class2style_keep' is true
 my $keep_classes = $meta->value( 'class2style_keep' );
 $keep_classes //= 0;

 # Fetch the 'style' definitions from the metadata field 'class2style',
 # which must be a mapping keyed by output format name
 my $style4class = $meta->value( 'class2style' ) // +{};
 'HASH' eq ref( $style4class )
   or croak "Metadata-->class2style must be mapping";
 {
    ## format keys to try, in order of preference
    my @format_keys
      = $to_html ? ( $out_format, 'html' )    # html-style: fall back to 'html'
      : $to_docx ? ( 'docx' )                 # docx-style: always 'docx'
      :            ( $out_format );           # otherwise the format's own name
    my $defs;
    for my $key ( @format_keys ) {
        $defs = $style4class->{$key};
        last if defined $defs;
    }
    ## no definitions for this format means an empty list
    $style4class = $defs // [];
 }

 # Normalize the definitions (a mapping, a string, or a list of strings
 # and/or mappings) to one flat hash.  A string item STRING stands for the
 # pair STRING => STRING; later items override earlier ones with the same key.
 $style4class = +{
    map { ; 'HASH' eq ref( $_ ) ? %$_ : ( $_ => $_ ) }
      @{ _listify( $style4class ) }
 };

 # For html-style output every value must be a hash of attribute--value
 # pairs; a plain string value becomes the value of the key 'class'
 if ( $to_html ) {
    $_ = _hashify( $_, 'class' ) for values %$style4class;
 }

 ## Alternative interface: pick up classes ending in a dot
 ## and use them as command/environment/style names.
 ## The pattern matches one or more letters followed by a dot and captures
 ## the letters; the lookarounds require whitespace or the string boundary
 ## on both sides, so only a whole class token such as 'fbox.' matches.
 my $class_re = qr/(?<!\S)(\pL+)\.(?!\S)/;


 # Perform different actions depending on output format/style.
 # %actions maps element-name patterns to handler subs; exactly one of the
 # four handler sets below is selected, tested in the order
 # $to_latex, $to_docx, $to_html, anything else.
 # Every handler takes ( $elem, $action ): the element to process and the
 # compiled action, which is used to process child elements recursively.
 # NOTE(review): $to_latex is tested first here, while the selection of
 # definitions for $style4class tests $to_html and $to_docx first; with latex
 # output and one of the class2style_html/class2style_docx overrides set the
 # two selections disagree -- confirm which precedence is intended.
 my %actions = $to_latex ? (
    # wrap inline elements in a command
    'Span|Code' => sub {    # { for poor editor
        ## NOTE(review): $end_cmd is not referenced anywhere else in this sub
        state $end_cmd = RawInline latex => '}';
        my ( $elem, $action ) = @_;
        my @commands = get_styles( $elem );
        ## nothing to do for elements without any 'style'
        return unless @commands;
        my $is_code = $elem->name =~ /Code/;
        ## recurse into child elements
        unless ( $is_code ) {
            transform( $elem->content, $action, $action );
        }
        ## replace a span with its content; a code element is kept as it is
        my @ret = $is_code ? $elem : @{ $elem->content };
        ## step thru commands in reverse order to keep left--right sequence
        for my $com ( reverse @commands ) {
            no warnings qw[ uninitialized numeric ]; # in case there is no AFTER
            ## COM becomes {before=>COM, after=>''} unless COM is a hash
            ## (_hashify returns a copy, so the substitutions below
            ## do not modify the definitions in $style4class)
            $com = _hashify $com, 'before';
            ## CONTENT becomes \BEFORE{CONTENT}AFTER
            ## wrap before in \ and { unless any
            $com->{before} =~ s/^(?!\\)/\\/;
            $com->{before} =~ s/\{?$/\{/;
            ## prepend } to after unless any
            $com->{after} =~ s/^(?!\})/\}/;
            unshift @ret, RawInline latex => $com->{before}; # prepend BEFORE
            push @ret, RawInline latex => $com->{after};    # append AFTER
        }
        ## a list of raw LaTeX and content elements
        return \@ret;
    },
    # Wrap block elements in an environment
    'Div|CodeBlock' => sub {
        my ( $elem, $action ) = @_;
        my @envs = get_styles( $elem );
        ## nothing to do for elements without any 'style'
        return unless @envs;
        my $is_code = $elem->name =~ /Code/;
        ## recurse into child elements
        unless ( $is_code ) {
            transform( $elem->content, $action, $action );
        }
        ## unlike a span the block element itself is kept inside the wrappers
        my @ret = $elem;
        ## step thru environments in reverse order so that the
        ## leftmost class becomes the outermost environment
        for my $env ( reverse @envs ) {
            no warnings qw[ uninitialized numeric ]; # in case there are no arguments
            ## $env becomes {name=>ENVNAME, args=>''} unless $env is a hash
            $env = _hashify $env, 'name';
            ## prepend \begin{NAME}ARGS to block
            unshift @ret, RawBlock latex => "\\begin\{$env->{name}\}$env->{args}";
            ## append \end{NAME}
            push @ret, RawBlock latex => "\\end\{$env->{name}\}";
        }
        return \@ret;
    },
  )
  : $to_docx ? (
    # add "custom-style" attributes to elements,
    # possibly after removing existing classes
    'Span|Div' => sub {
        my ( $elem, $action ) = @_;
        my @styles = get_styles( $elem );
        return unless @styles;
        ## recurse into child elements
        transform( $elem->content, $action, $action );
        ## delete existing classes?
        $elem->class("") unless $keep_classes;
        ## ['foo', 'bar', 'baz'] becomes 'FooBarBaz'
        ## since docx named styles aren't additive
        my $style = join "", map { ; ucfirst $_ } @styles;
        $elem->add_attribute( 'custom-style' => $style );
        return $elem;
    },
    # For DOCX code elements need to be wrapped in a container element
    'Code|CodeBlock' => sub {
        ## constructors indexed by block-ness: 0 = inline (Span), 1 = block (Div)
        state $wrap  = [ \&Span, \&Div ];
        ### state $strip = [ \&Code, \&CodeBlock ];
        my ( $elem, $action ) = @_;
        my @styles = get_styles( $elem );
        return unless @styles;
        ## delete existing classes?
        $elem->class("") unless $keep_classes;
        ## concatenate the style names as for spans and divs
        my $style = join "", map { ; ucfirst $_ } @styles;
        my $type = $elem->is_block || 0;
        ### my $code = $strip->[$type]->( attributes {}, $elem->content );
        ## NOTE(review): the wrapper is given the bare element rather than
        ## an array ref as its content -- confirm that the Span/Div
        ## constructors accept that
        return $wrap->[$type]->( attributes + { 'custom-style' => $style }, $elem );
    },
  )

  # HTML output format
  # Add the attributes of each 'style' to elements,
  # possibly after removing existing classes
  ### XXX: we used to wrap a new element for each style
  : $to_html ? (
    'Span|Div|Code|CodeBlock' => sub {
        ### state $wraps = [ \&Span, \&Div ];
        my ( $elem, $action ) = @_;
        my @styles = get_styles( $elem );
        return unless @styles;
        ## recurse into child elements
        unless ( $elem->name =~ /Code/ ) {
            transform( $elem->content, $action, $action );
        }
        ### my $wrap = $wraps->[ $elem->is_block || 0 ];
        ### my $kv = $elem->keyvals;
        ## non-reference styles are classes.
        ## treat them specially for efficiency.
        my @classes = grep { !ref $_ } @styles;
        ## We turn them into a single 'style'
        push @styles, +{ class => "@classes" } if @classes;
        ## delete existing classes?
        $elem->class("") unless $keep_classes;
        ### my $ret
        ###   = $elem->name =~ /Code/ ? [$elem]
        ###   : keys( %$classes ) ? [ $wrap->( attributes $classes, $elem->content ) ]
        ###   :                     $elem->content;
        ## loop through the styles
        ### for my $style ( reverse grep { ref $_ } @styles ) {
        for my $style ( grep { ref $_ } @styles ) { # no reverse when not wrapping
            ### $ret = [ $wrap->( attributes + {%$style}, $ret ) ];
            while ( my @args = each %$style ) { # each key--value pair
                $elem->add_attribute(@args);    # add them to attributes
            }
        }
        return $elem;
        ### return $ret;
    },
  )

  # some other $out_format
  : (
    'Span|Div|Code|CodeBlock' => sub {
        my ( $elem, $action ) = @_;
        my $classes = $elem->class;
        ## Just remove trailing dots from classes;
        ## elements without any dotted class are left alone
        return unless $classes =~ s/$class_re/$1/g;
        $elem->class( $classes );
        ## recurse into child elements
        unless ( $elem->name =~ /Code/ ) {
            transform( $elem->content, $action, $action );
        }
        return $elem;
    },
  );

 # Compile the handler table into a single action
 my $handler = action( \%actions );

 # The action is passed a second time as an extra argument; the handlers
 # read it as their second parameter and apply it to child elements
 $doc->transform( $handler, $handler );

 # Emit the modified document as JSON
 print $doc->to_json;

 # Return the list of 'styles' which apply to an element.
 #
 # The element's class attribute is split on whitespace.  A class with an
 # entry in %$style4class contributes its defined style (or styles, if the
 # definition is a list); a class without a defined style which matches
 # $class_re contributes its name minus the trailing dot.  All other
 # classes are ignored.  The order of the classes is preserved.
 sub get_styles {
    my ( $elem ) = @_;
    my @styles;
    for my $class ( $elem->class =~ /\S+/g ) {
        ## skip classes which are neither declared nor dotted
        next unless exists( $style4class->{$class} ) || $class =~ /$class_re/;
        ## a declared class without a defined value falls back
        ## to the (possibly empty) list of de-dotted names
        my $style = $style4class->{$class} // [ $class =~ /$class_re/g ];
        push @styles, @{ _listify( $style ) };
    }
    return @styles;
 }

 __END__


 =encoding UTF-8

 =head1 NAME

 pandoc-class2style.pl - filter to translate single pandoc classes into attribute lists or LaTeX commands

 =head1 VERSION

 1.000

 =head1 SYNOPSIS

 pandoc -F pandoc-class2style.pl ...

 =head1 DESCRIPTION

 B<< pandoc-class2style.pl >> is a L<< Pandoc|http://pandoc.org/ >> filter which lets you use spans (or divs) with a single class in your source document and have the necessary LaTeX markup, DOCX custom styles, or HTML attributes of your choice injected during conversion. You still have to wrap the 'special' text in a span or div but since you only need to mark each span with a class with as few letters as you want the source becomes much less cluttered. It also becomes I<< much >> easier to produce multiple formats from the same Markdown source.

 You declare a mapping from short classes to LaTeX commands or environments, DOCX custom styles or HTML attributes in your YAML metadata as follows:

    ---
    class2style:
      latex:
        u:    uline
        uu:   uuline
        grc:  textgreek[variant=ancient]
        he:   texthebrew
        la:   textlatin
        sc:   textsc 
        blue: textcolor{blue}
      docx:
        - u:   Underlined
          uu:  DoubleUnderlined
          grc: Greek
          he:  Hebrew
          la:  Latin
          sc:  SmallCaps
        - blue
      html:
        u:
          class: uline
        uu:
          class: uuline
        grc:
          lang: grc
        he:
          lang: he
          dir: rtl
        la:
          lang: la
        sc:
          class: small-caps
    lang: en
    otherlangs:
    - grc
    - he
    - la
    mainfont: FreeSerif # or any other font you prefer
    xcolor: hyperref, svgnames
    ...

    [Underlined]{.u} [Double underlined]{.uu}

    [Ἑλληνιστής]{.grc}

    [עִבְרִית‎]{.he}

    [Lingua Romanica]{.la .sc}

    [I'm *blue*!]{.blue}

 Running pandoc with this filter gives the following outputs for the above:

 C<< pandoc -F pandoc-class2style.pl c2stest.md -t latex >>:

    \uline{Underlined} \uuline{Double underlined}

    \textgreek[variant=ancient]{Ἑλληνιστής}

    \texthebrew{עִבְרִית‎}

    \textlatin{\textsc{Lingua Romanica}}

    \textcolor{blue}{I'm \emph{blue}!}

 C<< pandoc -F pandoc-class2style.pl c2stest.md -t html5 >>:

    <p><span class="uline">Underlined</span>
    <span class="uuline">Double underlined</span></p>
    <p><span lang="grc">Ἑλληνιστής</span></p>
    <p><span lang="he" dir="rtl">עִבְרִית‎</span></p>
    <p><span class="small-caps" lang="la">Lingua Romanica</span></p>
    <p><span class="blue">I'm <em>blue</em>!</span></p>

 Finally I can't show the DOCX output here, but it is as if the Markdown had been like this:

    [Underlined]{custom-style="Underlined"}
    [Double underlined]{custom-style="DoubleUnderlined"}

    [Ἑλληνιστής]{custom-style="Greek"}

    [עִבְרִית‎]{custom-style="Hebrew"}

    [Lingua Romanica]{custom-style="LatinSmallCaps"}

    [I'm *blue*!]{custom-style="Blue"}

 =head2 Note on the terms I<< 'style' >>, I<< CSS style >>, I<<< C<< custom-style >> >>> and I<< DOCX style >>

 I originally had three different filters for each of LaTeX, HTML and DOCX with essentially the same interface. When I combined them to make maintenance and configuration easier it was a bit of a problem what to call the combined filter. In the end I decided to use I<< style >> as the most general term, qualified as follows:

 The word I<< 'style' >> in scare quotes means any of the AST modifications performed by this filter in order to affect how elements with certain classes are rendered in any of the supported output formats. It does thus not necessarily refer to a DOCX style as applied through Pandoc's C<< custom-style >> attribute. In particular it does not refer to the HTML C<< style >> attribute. It is best practice to avoid that attribute and apply CSS styles through tag, class, id and attribute selectors in a separate style sheet. When talking about CSS the phrase I<< CSS style >> is used.

 Similarly the word I<<< C<< custom-style >> >>>, hyphenated but sometimes without code formatting is used when talking about the C<< custom-style >> attribute which tells Pandoc's docx writer to apply a particular named DOCX style to the contents of a span or div. Finally the phrase I<< DOCX style >> is used for the named styles which you can define, modify and apply to text elements in a word processor.

 =head2 Divs and spans

 In LaTeX mode 'styles' applied to spans become commands and 'styles' applied to divs become environments. This is not configurable. I have experimented with configuring this in the past and my experience wasn't good. If you really want to try to use a command as an environment you can try the L<<< I<< environ >> package|http://texdoc.net/pkg/environ >>>.

 Similarly DOCX C<< custom-style >>s become character styles for spans and paragraph styles for divs. This is part of L<<< Pandoc's built-in C<< custom-style >> feature|http://pandoc.org/MANUAL.html#custom-styles-in-docx-output >>>.

 Also note what is said about namespaces below!

 =head3 Multiple 'styles' per spanE<0x2f>div

 If you apply several classes with associated styles to the same span or div they are combined.

 In LaTeX mode the commands and environments are nested. The left-to-right order of the classes in the source is preserved, so that C<< [foo]{.bar .baz} >> becomes C<< \bar{\baz{foo}} >> but C<< [foo]{.baz .bar} >> becomes C<< \baz{\bar{foo}} >>. Similarly environments are nested with the one corresponding to the leftmost class becoming outermost and the one corresponding to the rightmost class becoming innermost.

 Because DOCX named styles aren't additive things become a little more complicated. Multiple class 'styles' become concatenated with the first letter of each component style capitalized, as seen in the C<< LatinSmallCaps >> example. You will need to define each such combined style in your reference-docx. At least you can let your C<< SmallCaps >> style inherit from the built-in C<< Small Caps >> style and your C<< LatinSmallCaps >> style inherit from your C<< SmallCaps >> style so that changes in the ancestor styles get reflected in the descendant styles.

 =head2 One 'style' per class

 Note that since there can only be one 'style' per class and output format you need to use a separate class for each LaTeX command or environment or for each DOCX character or paragraph style.

 =head3 Namespaces

 The one-style-per-class behavior is consistent with how things work in LaTeX where commands and environments share a namespace, and DOCX where character and paragraph styles also share the same namespace. If this bothers you when producing HTML remember that nothing stops you from defining HTML 'styles' with the same attributes, including classes, corresponding to different input classes. You can even use the YAML anchor--reference syntax to reduce typing, file size and errors:

    class2style:
      latex:
        he: texthebrew
        he-block: hebrew
      docx:
        he: Hebrew
        he-block: HebrewPara
      html:
        he: &hebrew
          lang: he
          dir:  rtl
        he-block: *hebrew

 Here C<< *hebrew >> is a reference which causes the value of the key C<< html--E<0x3e>he-block >> to be the same as the value of the key C<< html--E<0x3e>he >>, which is marked with the anchor C<< &hebrew >>.

 I don't know which of Pandoc andE<0x2f>or LibreOffice andE<0x2f>or Word imposes the limitation that DOCX paragraph and character styles can't have the same name, which is a little strange given the separation between those two kinds of styles.

 The LaTeX namespace limitation is due to the fact that the LaTeX implementation of an environment C<< foo >> involves defining the commands C<< \foo >> and C<< \endfoo >>. Why it was called C<< \foo >> and not C<< \beginfoo >> is anybody's guess...

 =head2 Applying 'styles' to Markdown output.

 By default 'styles' are only applied when the output format is one of C<< latex >>, C<< docx >>, C<< html >>, C<< html5 >> or C<< epub >>. You can override this by setting one of the metadata variables C<< class2style_html >> or C<< class2style_docx >> to a true value on the command line.

 In fact you can run with any output format and make this filter behave as if the output format had been C<< html >> or C<< docx >>. Just say:

    $ pandoc -F pandoc-class2style.pl -t markdown -M class2style_html ...

    $ pandoc -F pandoc-class2style.pl -t markdown -M class2style_docx ...

 There is no similar variable for LaTeX because Markdown markup inside the wrapped spans and divs will be broken if latex-mode output is converted to Markdown.

 =head3 Both HTML and DOCX attributes at the same time

 When applying 'styles' to markdown output you may wish to assign both HTML attributes and DOCX C<< custom-style >> attributes at the same time. There is an easy workaround for this: just include a "custom-style" attribute in your C<< class2style--E<0x3e>html--E<0x3e>CLASS >> metadata mapping and run with the C<< -M class2style_html >> switch on the command line.

    class2style:
      html:
        sc:
          class: 'small-caps'
          'custom-style': 'Small Caps'

 =head2 Keeping the original classes

 By default the existing classes of a span or div element which gets new arguments associated with it are deleted. This is so that you don't get any duplicated attributes if you first run the filter when producing Markdown output and then at a later time run the filter on the same document again, e.g. to also apply 'styles' to elements added later. This behavior can be overridden by passing the switch C<< -M class2style_keep >> on the command line.

 =head2 The (un)limits of LaTeX code injection

 Sometimes you need to pass extra arguments to a LaTeX command or environment. If those arguments come before the main argument (the one containing the span content) you can generally include them in your command string as in the C<< blue: textcolor{blue} >> example; anything you put as C<< COMMAND >> in your C<< CLASS: COMMAND >> metadata field will be put into the frame C<< \...{ >> and prepended to the span content as a raw LaTeX string. In the rare cases where you need to put arguments after the span content argument you can replace C<< COMMAND >> with a mapping with the two keys C<< before >> and C<< after >>:

    CLASS:
      before: BEFORE
      after: AFTER

 In this case C<< BEFORE >> will be put into the same C<< \...{ >> frame before the content and C<< AFTER >> will be put into a C<< }... >> frame after the content, giving you C<< \BEFORE{CONTENT}AFTER >>.

 With environments (i.e. divs) you always need a mapping with the two keys C<< name >> and C<< args >> to pass arguments, with the value of C<< name >> being the environment name and the value of C<< args >> being the argument string:

    ---
    class2style:
      latex:
        grc-block:
          name: greek
          args: '[variant=ancient]'
    ...

    <div class="grc-block">

    | Ἄφοβον ὁ θεός,
    | ἀνύποπτον ὁ θάνατος
    | καὶ τἀγαθὸν μὲν εὔκτητον,
    | τὸ δὲ δεινὸν εὐεκκαρτέρητον

    </div>

 which thus becomes

    \begin{greek}[variant=ancient]

    Ἄφοβον ὁ θεός,\\
    ἀνύποπτον ὁ θάνατος\\
    καὶ τἀγαθὸν μὲν εὔκτητον,\\
    τὸ δὲ δεινὸν εὐεκκαρτέρητον

    \end{greek}

 In all these cases you may need to quote your values so that they don't confuse the YAML parser or Pandoc's Markdown parser which both will have a go at the values before the filter sees them. You may even have to wrap values containing LaTeX code both in outer single quotes for YAML and in inner backticks for Pandoc to ensure that they come intact to the filter:

    class2style:
      latex:
        foo: '`framebox[1.1\width]`'

 In fact you can write e.g. C<< '`\uline{`' >>. No extra backslash or opening brace will be added if you do, but then the twofold quoting is absolutely necessary.

 =head3 No per-element arguments

 Note that you will have to declare a separate class for each combination of command or environment and extra arguments. I have experimented with specifying custom arguments as attributes to a span or div in the past and in general it leads to cluttered Markdown source and complicated filter code with concomitant risk for errors. Even though the one class--one combination of command and arguments approach might mean more declarations in your metadata it keeps the body of your document cleaner. If the volume of the metadata declarations bothers you, remember that you can put metadata blocks anywhere, and that they are less in the way at the end of the file.

 =head2 Code

 This filter also works on inline code and code blocks.

 =head2 The "list of strings and mappings"

 As you may have noticed the value of the C<< docx >> key in our initial example is a list of strings and mappings. This can be done with any output format. String list items will be expanded into a single-element mapping C<< STRING: STRING >>, and then the list of mappings will be flattened into a single mapping, with later elements overriding earlier elements with the same key.

 =head2 The "dotted class" shortcut

 Finally you can in some cases forgo the metadata declaration and instead append a period at the end of a class name. This will result in a command, environment, HTML class or DOCX style where the name is equal to the class name without the trailing period.

    [Framed]{.fbox.}

    \fbox{Framed}

    <p><span class="fbox">Framed</span></p>

    [Framed]{custom-style="Fbox"}

 =head1 PREREQUISITES

 In addition to L<< Pandoc|http://pandoc.org/ >> this filter requires the following perl modules:

 =over

 =item *

 Carp

 =item *

 Pandoc::Elements 0.33

 =item *

 Pandoc::Walker 0.27

 =item *

 autodie 2.29

 =item *

 perl 5.010001

 =item *

 strict

 =item *

 warnings

 =back

 =head2 New to Perl?

 This filter requires perl (minimum version as given above) and the Perl modules listed above to function. If you haven't used Perl before, information on how to getE<0x2f>install perl andE<0x2f>or Perl modules can be found at the URLs below, which lead to the official information on these topics.

 Don't worry! If your operating system is Linux or Mac you probably already have a new enough version of perl installed. If you don't or if your operating system is Windows it is easy to install a recent version, and once you have perl installed installing modules is very easy. Just follow the instructions linked to below.

 Getting perl
 L<< https:E<0x2f>E<0x2f>www.perl.orgE<0x2f>get.html|https://www.perl.org/get.html >>

 (For Windows I recommend Strawberry Perl as module installation is easier there.)

 Installing Perl modules
 L<< http:E<0x2f>E<0x2f>www.cpan.orgE<0x2f>modulesE<0x2f>INSTALL.html|http://www.cpan.org/modules/INSTALL.html >>

 =head1 AUTHOR

 Benct Philip Jonsson ([email protected], L<< https:E<0x2f>E<0x2f>github.comE<0x2f>bpj|https://github.com/bpj >>)

 =head1 COPYRIGHT

 Copyright 2017- Benct Philip Jonsson

 =head1 LICENSE

 This is free software; you can redistribute it andE<0x2f>or modify it under the same terms as the Perl 5 programming language system itself. See L<< http:E<0x2f>E<0x2f>dev.perl.orgE<0x2f>licensesE<0x2f>|http://dev.perl.org/licenses/ >>.

 =cut
diff --git a/pandoc-class2style.pod b/pandoc-class2style.pod