Last active
July 25, 2021 15:57
-
-
Save ryanohs/caf769dbe554ded0fec151a53a17f8ee to your computer and use it in GitHub Desktop.
A bullet journal parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text; | |
using Xunit; | |
namespace Parser | |
{ | |
public class Parser | |
{ | |
public Node Parse(string input) | |
{ | |
using var stream = new MemoryStream(); | |
using var writer = new StreamWriter(stream); | |
writer.Write(input); | |
writer.Flush(); | |
stream.Position = 0; | |
return Parse(stream); | |
} | |
public Node Parse(Stream stream) | |
{ | |
using var streamReader = new StreamReader(stream); | |
var root = new Node(); | |
var parser = new JournalStateMachine(); | |
var state = new ParserState(root); | |
while(!streamReader.EndOfStream) | |
{ | |
var c = (char)streamReader.Read(); // todo encoding? | |
if(c == '\r') continue; // ignore \r, only going to care about \n | |
state = parser.Next(state, c); | |
} | |
// ensure final trailing \n | |
parser.Next(state, '\n'); | |
return root; | |
} | |
} | |
public class JournalStateMachine | |
{ | |
public enum State | |
{ | |
Start, | |
End, | |
Tab, | |
Spaces, | |
Bullet, | |
PreContent, | |
Content, | |
} | |
public enum Token | |
{ | |
Space, | |
Tab, | |
Bullet, | |
Newline, | |
Other, | |
} | |
public record Transition(State State, Token Token); | |
public Dictionary<Transition, State> Transitions = new() | |
{ | |
{ new(State.Start, Token.Tab), State.Tab }, | |
{ new(State.Start, Token.Space), State.Spaces }, | |
{ new(State.Start, Token.Bullet), State.Bullet }, | |
{ new(State.Start, Token.Other), State.Content }, | |
{ new(State.Start, Token.Newline), State.Start }, | |
{ new(State.Tab, Token.Tab), State.Tab }, | |
{ new(State.Tab, Token.Space), State.Spaces }, | |
{ new(State.Tab, Token.Bullet), State.Bullet }, | |
{ new(State.Tab, Token.Other), State.Content }, | |
{ new(State.Tab, Token.Newline), State.Start }, | |
{ new(State.Spaces, Token.Space), State.Spaces }, | |
{ new(State.Spaces, Token.Tab), State.Tab }, | |
{ new(State.Spaces, Token.Bullet), State.Bullet }, | |
{ new(State.Spaces, Token.Other), State.Content }, | |
{ new(State.Spaces, Token.Newline), State.Start }, | |
{ new(State.Bullet, Token.Space), State.PreContent }, | |
{ new(State.Bullet, Token.Tab), State.PreContent }, | |
{ new(State.Bullet, Token.Bullet), State.Bullet }, | |
{ new(State.Bullet, Token.Other), State.Content }, | |
{ new(State.Bullet, Token.Newline), State.End }, | |
// PreContent -- allow for any whitespace between bullet and content | |
{ new(State.PreContent, Token.Space), State.PreContent }, | |
{ new(State.PreContent, Token.Tab), State.PreContent }, | |
{ new(State.PreContent, Token.Bullet), State.Content }, | |
{ new(State.PreContent, Token.Other), State.Content }, | |
{ new(State.PreContent, Token.Newline), State.Start }, | |
{ new(State.Content, Token.Space), State.Content }, | |
{ new(State.Content, Token.Tab), State.Content }, | |
{ new(State.Content, Token.Bullet), State.Content }, | |
{ new(State.Content, Token.Other), State.Content }, | |
{ new(State.Content, Token.Newline), State.End }, | |
}; | |
public ParserState Next(ParserState state, char c) | |
{ | |
var token = GetToken(c); | |
if(!Transitions.TryGetValue(new Transition(state.CurrentState, token), out var newState)) | |
{ | |
throw new Exception($"Unhandled transition: ({state.CurrentState}, {token}) Previously parsed = '{state.RawLine}' Next = '{c}'"); | |
} | |
return newState switch | |
{ | |
State.Start => Start(state), | |
State.End => End(state), | |
State.Tab => Tab(state, c), | |
State.Spaces => Spaces(state, c), | |
State.Bullet => Bullet(state, c), | |
State.PreContent => PreContent(state, c), | |
State.Content => Content(state, c), | |
_ => throw new ArgumentOutOfRangeException($"Unhandled state {newState}") | |
}; | |
} | |
private ParserState Start(ParserState state) | |
{ | |
state.RawLine.Clear(); | |
state.Content.Clear(); | |
return state with | |
{ | |
CurrentIndentDepth = 0, | |
CurrentState = State.Start, | |
NodeType = NodeType.Title | |
}; | |
} | |
private ParserState Tab(ParserState state, char c) | |
{ | |
state.RawLine.Append(c); | |
return state with | |
{ | |
CurrentState = State.Tab, | |
CurrentIndentDepth = state.CurrentIndentDepth + 1, | |
CurrentSpacesCount = 0 | |
}; | |
} | |
private ParserState Spaces(ParserState state, char c) | |
{ | |
state.RawLine.Append(c); | |
return state with | |
{ | |
CurrentState = State.Spaces, | |
CurrentSpacesCount = (state.CurrentSpacesCount == 3) ? 0 : state.CurrentSpacesCount + 1, | |
CurrentIndentDepth = (state.CurrentSpacesCount == 3) ? state.CurrentIndentDepth + 1 : state.CurrentIndentDepth, | |
}; | |
} | |
private ParserState Bullet(ParserState state, char c) | |
{ | |
state.RawLine.Append(c); | |
return state with | |
{ | |
CurrentState = State.Bullet, | |
NodeType = GetNodeType(c) | |
}; | |
} | |
private ParserState PreContent(ParserState state, char c) | |
{ | |
state.RawLine.Append(c); | |
return state with | |
{ | |
CurrentState = State.PreContent | |
}; | |
} | |
private ParserState Content(ParserState state, char c) | |
{ | |
state.RawLine.Append(c); | |
state.Content.Append(c); | |
return state with | |
{ | |
CurrentState = State.Content | |
}; | |
} | |
private ParserState End(ParserState state) | |
{ | |
var (indentDepth, parent) = ComputeIndentDepthAndParent(state); | |
var node = new Node() | |
{ | |
Parent = parent, | |
IndentDepth = indentDepth, | |
Type = state.NodeType, | |
Content = state.Content.ToString(), | |
RawLine = state.RawLine.ToString() | |
}; | |
parent.Children.Add(node); | |
state.RawLine.Clear(); | |
state.Content.Clear(); | |
return Start(state with { | |
PreviousNode = node | |
}); | |
} | |
private (int indentDepth, Node parent) ComputeIndentDepthAndParent(ParserState state) | |
{ | |
var indentDepth = state.CurrentIndentDepth; | |
if(state.NodeType == NodeType.Title) | |
{ | |
indentDepth--; | |
} | |
if(state.CurrentSpacesCount > 0) | |
{ | |
indentDepth++; | |
} | |
var parent = state.PreviousNode; | |
while(indentDepth <= parent.IndentDepth) | |
{ | |
parent = parent.Parent; | |
} | |
return (indentDepth, parent); | |
} | |
private Token GetToken(char c) | |
{ | |
return c switch | |
{ | |
' ' => Token.Space, | |
'\t' => Token.Tab, | |
'-' => Token.Bullet, | |
'*' => Token.Bullet, | |
'o' => Token.Bullet, | |
'!' => Token.Bullet, | |
'\r' => Token.Newline, | |
'\n' => Token.Newline, | |
_ => Token.Other | |
}; | |
} | |
private NodeType GetNodeType(char c) | |
{ | |
return c switch | |
{ | |
'-' => NodeType.Note, | |
'*' => NodeType.Task, | |
'o' => NodeType.Event, | |
'!' => NodeType.Inspiration, | |
_ => throw new ArgumentOutOfRangeException(nameof(c), c, null) | |
}; | |
} | |
} | |
public record ParserState | |
{ | |
public ParserState(Node root) | |
{ | |
PreviousNode = root; | |
} | |
public JournalStateMachine.State CurrentState { get; init; } = JournalStateMachine.State.Start; | |
public int CurrentIndentDepth { get; init; } | |
public int CurrentSpacesCount { get; init; } | |
public Node PreviousNode { get; init; } | |
public NodeType NodeType { get; init; } | |
public StringBuilder RawLine { get; init; } = new(); | |
public StringBuilder Content { get; init; } = new(); | |
} | |
public record Node | |
{ | |
public string RawLine { get; init; } | |
public string Content { get; init; } | |
public int IndentDepth { get; init; } = -2; // root is -2. Top level Title nodes are -1. | |
public NodeType Type { get; init; } | |
public Node Parent { get; init; } | |
public List<Node> Children { get; init; } = new (); | |
} | |
public enum NodeType | |
{ | |
Title, | |
Note, | |
Task, | |
Event, | |
Inspiration | |
} | |
public class ParserTests | |
{ | |
[Fact] | |
public void ParseLine() | |
{ | |
var line = "- First"; | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(1, root.Children.Count); | |
Assert.Equal("First", root.Children.Single().Content); | |
} | |
[Fact] | |
public void ParseMultipleWords() | |
{ | |
var line = "- Multiple Words"; | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal("Multiple Words", root.Children.Single().Content); | |
} | |
[Fact] | |
public void SpecialTokensInsideContent() | |
{ | |
var line = "- !@#$%^&*()\t-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal("!@#$%^&*()\t-_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", root.Children.Single().Content); | |
} | |
[Theory] | |
[InlineData("- Parent\n\t- Child")] | |
[InlineData("- Parent\n\t\t- Child")] // excess tab depth | |
public void Children(string line) | |
{ | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(1, root.Children.Count); | |
var parent = root.Children.Single(); | |
Assert.Equal("Parent", parent.Content); | |
Assert.Equal(1, parent.Children.Count); | |
Assert.Equal("Child", parent.Children.Single().Content); | |
} | |
[Theory] | |
[InlineData("- Parent\n - Child")] | |
[InlineData("- Parent\n - Child")] // excess spaces | |
public void Spaces(string line) | |
{ | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(1, root.Children.Count); | |
var parent = root.Children.Single(); | |
Assert.Equal("Parent", parent.Content); | |
Assert.Equal(1, parent.Children.Count); | |
Assert.Equal("Child", parent.Children.Single().Content); | |
} | |
[Theory] | |
[InlineData("- Parent\n\t- Child\n- Sibling\n")] | |
[InlineData("- Parent\n\t\t- Child\n- Sibling\n")] // excess tab depth | |
public void Promotion(string line) | |
{ | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(2, root.Children.Count); | |
var parent = root.Children[0]; | |
Assert.Equal("Parent", parent.Content); | |
Assert.Equal(1, parent.Children.Count); | |
Assert.Equal("Child", parent.Children.Single().Content); | |
var sibling = root.Children[1]; | |
Assert.Equal("Sibling", sibling.Content); | |
} | |
[Theory] | |
[InlineData("Title Line", NodeType.Title)] | |
[InlineData("- Note", NodeType.Note)] | |
[InlineData("* Task", NodeType.Task)] | |
[InlineData("o Event", NodeType.Event)] | |
[InlineData("! Inspiration", NodeType.Inspiration)] | |
public void NodeTypes(string line, NodeType type) | |
{ | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(type, root.Children.Single().Type); | |
} | |
[Fact] | |
public void MultilineSample() | |
{ | |
var input = | |
@"Node Types | |
- Note | |
* Task | |
o Event | |
! Inspiration | |
Children | |
- A | |
- B | |
- C | |
- D | |
- E | |
Crazy Tabs | |
- H | |
-I | |
-J | |
- K | |
-L | |
- M"; | |
var parser = new Parser(); | |
var root = parser.Parse(input); | |
var nodeTypes = root.Children[0]; | |
Assert.Equal("Node Types", nodeTypes.Content); | |
Assert.Equal(NodeType.Note, nodeTypes.Children[0].Type); | |
Assert.Equal(NodeType.Task, nodeTypes.Children[1].Type); | |
Assert.Equal(NodeType.Event, nodeTypes.Children[2].Type); | |
Assert.Equal(NodeType.Inspiration, nodeTypes.Children[3].Type); | |
var childrenNode = root.Children[1]; | |
Assert.Equal("A", childrenNode.Children[0].Content); | |
Assert.Equal("B", childrenNode.Children[0].Children[0].Content); | |
Assert.Equal("C", childrenNode.Children[0].Children[1].Content); | |
Assert.Equal("D", childrenNode.Children[1].Content); | |
Assert.Equal("E", childrenNode.Children[1].Children[0].Content); | |
var crazyTabs = root.Children[2]; | |
Assert.Equal("H", crazyTabs.Children[0].Content); | |
Assert.Equal("I", crazyTabs.Children[0].Children[0].Content); | |
Assert.Equal("J", crazyTabs.Children[0].Children[1].Content); | |
Assert.Equal("K", crazyTabs.Children[1].Content); | |
Assert.Equal("L", crazyTabs.Children[1].Children[0].Content); | |
Assert.Equal("M", crazyTabs.Children[2].Content); | |
} | |
[Fact] | |
public void SpacesAndTabs() | |
{ | |
var input = | |
@" | |
Mixed Whitespace | |
- H | |
-I | |
-J | |
- K | |
-L | |
- M"; | |
var parser = new Parser(); | |
var root = parser.Parse(input); | |
var mixed = root.Children[0]; | |
Assert.Equal("H", mixed.Children[0].Content); | |
Assert.Equal("I", mixed.Children[0].Children[0].Content); | |
Assert.Equal("J", mixed.Children[0].Children[1].Content); | |
Assert.Equal("K", mixed.Children[1].Content); | |
Assert.Equal("L", mixed.Children[1].Children[0].Content); | |
Assert.Equal("M", mixed.Children[2].Content); | |
} | |
[Fact] | |
public void MultipleBullet() | |
{ | |
var line = "--! Test"; | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(NodeType.Inspiration, root.Children.Single().Type); | |
} | |
[Fact] | |
public void IgnoreTrailingNewLines() | |
{ | |
var line = "- Test\n\n\n"; | |
var parser = new Parser(); | |
var root = parser.Parse(line); | |
Assert.Equal(1, root.Children.Count); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment