Skip to content

Instantly share code, notes, and snippets.

@talyian
Last active September 28, 2018 15:22
Show Gist options
  • Save talyian/8b9bb45ea7043eab038000de741a175f to your computer and use it in GitHub Desktop.
Save talyian/8b9bb45ea7043eab038000de741a175f to your computer and use it in GitHub Desktop.
POC for New Breve Idea
Breve was designed as a backwards-compatible Blitz replacement that does safe context-aware sanitization.
The current architecture uses Blitz's parser frontend + Latte as a context-aware backend to achieve this goal. This was because I figured writing a context-aware HTML+macro parser was harder than re-implementing Blitz's logic flows (blocks, variable lookups, partial templates, etc.) in Latte.
As it turns out, reimplementing logic is a difficult problem. An alternate solution would be to write our own Blitz-compatible frontend that just adds context-aware output and delegates to Blitz for the logic implementation. This is a Proof of concept that shows it isn't that bad in a language that makes writing parsers easy.
open System
open System.Text.RegularExpressions
open System.Collections.Generic
type HtmlToken = | Id | Whitespace | FullComment | Macro | Sym | CommentStart | Begin | Info
with override this.ToString() = sprintf "%A" this
type HtmlContext =
| Html | Comment
| Tag | TagBegin | TagCloseBegin
| TagEnding | TagAttr1 | TagAttrval
| TagStringD | TagStringS
with override this.ToString() = sprintf "%A" this
type JsContext =
| Regex | JsComment | SingleQuote | DoubleQuote | JsNone
with override this.ToString() = sprintf "%A" this
module HTMLTokens =
let tokenize str =
let tokens = Regex.Matches(str, @"(\w+)|(\s+)|(<!--.*?-->)|({{.*?}})|(.)", RegexOptions.Singleline)
let tok (token:Match) = seq {
let str = token.Groups.[0]
let tokentype =
if token.Groups.[1].Length > 0 then HtmlToken.Id
elif token.Groups.[2].Length > 0 then Whitespace
elif token.Groups.[3].Length > 0 then FullComment
elif token.Groups.[4].Length > 0 then Macro
else Sym
if tokentype = FullComment then yield ("", CommentStart, str.Index)
yield (str.Value, tokentype, str.Index)
}
[for m:Match in tokens do yield! tok m]
// Calculate HTML lexer context in a token stream -- this extracts lexical html state like tags, attributes, and text
// given token in a stream and a previous context, return the new context
let contextmap context (token, toktype, idx) =
match context, token, toktype with
| (Html, "<", _) -> TagBegin
| (Html, _, CommentStart) -> Comment
| (Comment, _, FullComment) -> Html
| (Html, _, _) -> Html
| (TagBegin, "/", _) -> TagCloseBegin
| (TagBegin, tag, HtmlToken.Id) -> Tag
| (TagBegin, macro, Macro) -> Tag
| (TagCloseBegin, tag, HtmlToken.Id) -> TagEnding
| (Tag, attrname, HtmlToken.Id) -> TagAttr1
| (Tag, macro, Macro) -> TagAttr1
| (Tag, ">", _) -> Html
| (Tag, "/", _) -> TagEnding
| (TagEnding, ">", _) -> Html
| (TagAttr1, "-", _) -> Tag
| (TagAttr1, "=", _) -> TagAttrval
| (TagAttr1, _, Whitespace) -> TagAttr1
| (TagAttr1, _, HtmlToken.Id) -> TagAttr1
| (TagAttr1, ">", _) -> Html
| (TagAttr1, "/", _) -> TagEnding
| (TagAttrval, "\"", _) -> TagStringD
| (TagAttrval, "'", _) -> TagStringS
| (TagAttrval, _, Whitespace) -> TagAttrval
| (TagAttrval, ">", _) -> Html
| (TagAttrval, value, _) -> TagAttrval
| (TagStringD, "\"", _) -> Tag
| (TagStringD, _, _) -> TagStringD
| (TagStringS, "'", _) -> Tag
| (TagStringS, _, _) -> TagStringS
| (n, tagInfo, Info) -> n
| (n, _, Macro) -> n
| (n, _, Whitespace) -> n
| (n, _, _) -> failwithf "invalid transition: %A" (n, idx)
let withTag cmap state y =
let (_, _oldcontext, _oldtoken) = state
let newcon = cmap _oldcontext y
// printfn "%A" (_oldcontext, newcon, y)
(_oldcontext, newcon, y)
let context_and_token tokens = List.scan (withTag contextmap) (Html, Html, ("", Begin, 0)) tokens
let elementStream token_stream =
let mutable context = "html"
let mutable curtag = ""
let pushElement tag = curtag <- tag; if tag = "script" then context <- "js-beginning"
let popElement token =
let tag = curtag
curtag <- ""
match context, tag, token with
| "html", "script", _ -> Some("js-beginning")
| "js-beginning", "script", ">" -> Some("js")
| "js", "/script", _ -> Some("html")
| _ -> None
let mutable curattr = ""
let pushAttr name = curattr <- name
let startAttr () =
let name = curattr.ToLower()
if context = "html" then
if name.Length > 1 && name.[0..1] = "on" then Some("js-attr")
elif name = "href" then Some("url-attr")
elif name = "src" then Some("url-attr")
else Some("attr")
else None
let clearAttr () =
match context with
| "attr"
| "js-attr"
| "url-attr" -> context <- "html"
| _ -> ()
seq {
for tok in token_stream do
let prev_context = context
match tok with
// managing tag stack
| TagBegin, Tag, (name, HtmlToken.Id, i) -> pushElement name
| TagCloseBegin, TagEnding, (name, HtmlToken.Id, i) -> pushElement ("/" + name)
| Tag, Html, _
| TagEnding, Html, _
| TagAttr1, Html, _ ->
match popElement(let (a,b,(c,d,i)) = tok in c) with | Some(c) -> context <- c | None -> ()
// attribute stack
| Tag, TagAttr1, (name, HtmlToken.Id, i) -> pushAttr name
| TagAttr1, TagAttrval, _ -> match startAttr() with | Some(c) -> context <- c | None -> ()
| _, Tag, _ -> clearAttr();
| _ -> ()
yield prev_context, context, tok }
// get js-quote context
let jsStream element_stream = seq {
let mutable context = JsNone
for element in element_stream do
let (c1, c2, (tok1, tok2, token)) = element
match c1, c2, (tok1 = tok2) with
| "js", "js", true
| "js-attr", "js-attr", true ->
let t, _, i = token
match context, t with
| JsNone, "'" -> context <- SingleQuote
| JsNone, "\"" -> context <- DoubleQuote
| SingleQuote, "'" -> context <- JsNone
| DoubleQuote, "\"" -> context <- JsNone
| _ -> ()
| _ -> ()
yield context, element
}
type BlitzTemplate (content) =
let htmltokens = HTMLTokens.tokenize content
let htmlcontexts = HTMLTokens.context_and_token htmltokens
let jscontexts = htmlcontexts |> HTMLTokens.elementStream |> HTMLTokens.jsStream |> Seq.toList
member this.Content = content
member val HtmlContexts = HTMLTokens.context_and_token htmltokens
member val JsContexts = htmlcontexts |> HTMLTokens.elementStream |> HTMLTokens.jsStream |> Seq.toList
let action = Environment.GetCommandLineArgs() |> Seq.rev |> Seq.item 1
let filename = Environment.GetCommandLineArgs() |> Seq.last
let content = filename |> IO.File.ReadAllText
let template = BlitzTemplate(content)
let print_tokens (template:BlitzTemplate) = Seq.iter (printfn "%O") template.JsContexts
let print_escaped (template:BlitzTemplate) =
for jstype, (html1, html2, (xml1, xml2, (t, ttype, pos))) in template.HtmlContexts |> HTMLTokens.elementStream |> HTMLTokens.jsStream do
let htmlContext = match xml2 with
| TagAttrval -> "attrval"
| TagAttr1 -> "attrname"
| Tag -> "tagname"
| Html -> "html"
| TagStringD -> "attrdstring"
| TagStringS -> "attrsstring"
| Comment -> "comment"
| _ -> "unknown"
let htmlContext = if html2 = "js" then "cdata" else htmlContext
let semanticContext = match jstype, html2 with
| _, "url-attr" -> "url"
| JsComment, _ -> "jscomment"
| SingleQuote, _ -> "jsstring"
| DoubleQuote, _ -> "jsstring"
| JsContext.Regex, _ -> "jsregex"
| JsNone, "js" -> "js"
| JsNone, "js-attr" -> "js"
| _ -> "none"
let m = Regex.Match(t, "{{(\$[\w.]+)}}")
if m.Success then
printf "\x1b[0;32m";
if semanticContext = "none" then
printf "{{%O(%O)}}" htmlContext (m.Groups.[1])
elif htmlContext = "cdata" then
printf "{{%O(%O)}}" semanticContext (m.Groups.[1])
else
printf "{{%O(%O(%O))}}" htmlContext semanticContext (m.Groups.[1])
printf "\x1b[0;m";
else
printf "%s" t
print_escaped template
docker-run:
docker run --rm -v ${PWD}:/work fsharp bash -c 'cd /work; fsharpc everb.fs --nowarn:58 --nologo && cli everb.exe example.tpl'
run:
fsharpc everb.fs --nowarn:58 && mono everb.exe example.tpl
jimmyt@:~/git/eat/everb (master) $ make
docker run --rm -v /Users/jimmyt/git/eat/everb:/work fsharp bash -c 'cd /work; fsharpc everb.fs --nowarn:58 --nologo && cli everb.exe example.tpl'
{{html($html)}}
<A {{attrname($tag)}}/>
<{{tagname($tag)}} />
<div {{attrname($tag)}}={{attrval($tagvalue)}} b = {{attrval($tagvalue)}}/>
<A a = {{attrval($val)}} b={{attrval($val)}} />
<B a = '{{attrsstring($val)}}' b ="{{attrdstring($val)}}" />
<I a='{{attrsstring($val)}}' b ="{{attrdstring($val)}}" />
<A href='{{attrsstring(url($url))}}' href={{attrval(url($url))}} />
<B onclick={{attrval(js($js))}} onclick="{{attrdstring(js($js))}}='{{attrdstring(jsstring($js2))}}'" />
<script>
var x = {{js($js)}};
var y = "{{jsstring($jss)}}";
// var z = {{js($comment)}};
var w = /foo["']{{jsstring($regex)}}/g;
</script>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment