Created
October 29, 2021 06:29
-
-
Save hochgi/a47042323decca6ac2d744c2fdc3c3c4 to your computer and use it in GitHub Desktop.
Convert a tree structure (e.g. one derived from XML) to a CSV in which same-labeled nodes form a full outer join of their inner subtrees (empty cells are rendered as `null`).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scala> import com.hochgi.util._
import com.hochgi.util._
scala> val t = Tree("root", List(Content("root text"), Attribute("att","val")), List(Tree("c", List(Content("interesting content")), Nil), Tree("c", List(Attribute("innerAtt","innerVal"), Content("boring content")), Nil)))
val t: com.hochgi.util.Tree = Tree(root,List(Content(root text), Attribute(att,val)),List(Tree(c,List(Content(interesting content)),List()), Tree(c,List(Attribute(innerAtt,innerVal), Content(boring content)),List())))
scala> csv(t)
val res0: List[Map[String,String]] = List(Map(root.@txt -> root text, root.#att -> val, root.c.@txt -> interesting content), Map(root.@txt -> root text, root.#att -> val, root.c.#innerAtt -> innerVal, root.c.@txt -> boring content))
scala> render(res0)
val res1: String =
root.@txt,root.#att,root.c.@txt,root.c.#innerAtt
root text,val,interesting content,null
root text,val,boring content,innerVal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.hochgi | |
package object util { | |
/** A single piece of data attached directly to a [[Tree]] node. */
sealed trait Value

/**
 * A key/value attribute of a node.
 *
 * @param k attribute name
 * @param v attribute value
 */
final case class Attribute(k: String, v: String) extends Value

/**
 * Text content of a node.
 *
 * @param s the text
 */
final case class Content(s: String) extends Value

/**
 * A labeled tree node.
 *
 * @param label    name of this node
 * @param values   attributes and text content attached to this node itself
 * @param children nested sub-nodes; several children may share the same label
 */
final case class Tree(label: String, values: List[Value], children: List[Tree])
/**
 * Renders rows of column-name -> value pairs as CSV text.
 *
 * The header line lists the union of all keys found in `matrix`; each following
 * line is one row. A row that lacks a column gets the literal text `null` in
 * that cell. Lines are separated by `\n` and there is no trailing newline.
 * Headers and cells that contain a comma, a double quote or a line break are
 * wrapped in double quotes with embedded quotes doubled (RFC 4180), so such
 * values can no longer shift or split columns.
 *
 * @param matrix rows to render, each mapping column name to cell value
 * @return the CSV text, or the empty string when there are no columns at all
 */
def render(matrix: List[Map[String, String]]): String = {
  // Quote a field only when it would otherwise be ambiguous in CSV.
  def quote(field: String): String = {
    val needsQuoting = field.exists(ch => ch == ',' || ch == '"' || ch == '\n' || ch == '\r')
    if (needsQuoting) "\"" + field.replace("\"", "\"\"") + "\"" else field
  }

  // Column order is the iteration order of the accumulated key set.
  val headers = matrix.foldLeft(Set.empty[String])(_ union _.keySet).toList

  if (headers.isEmpty) {
    // No columns means nothing to print, not even line breaks for the rows.
    ""
  } else {
    val dataLines = matrix.map { row =>
      headers.map(col => quote(row.getOrElse(col, "null"))).mkString(",")
    }
    (headers.map(quote).mkString(",") :: dataLines).mkString("\n")
  }
}
/**
 * Builds the row fragment contributed by a node's own values.
 *
 * An [[Attribute]] named `k` becomes the column `#k`; a [[Content]] becomes
 * the column `@txt`. Values that map to the same column name collapse into one
 * entry when the pairs are turned into a map.
 *
 * @param values the values attached to a single node
 * @return column name -> cell value for that node
 */
def mkRow(values: List[Value]): Map[String, String] = {
  // Column name and cell text for a single value.
  def toCell(value: Value): (String, String) = value match {
    case Content(text)            => ("@txt", text)
    case Attribute(name, content) => (s"#$name", content)
  }

  values.map(toCell).toMap
}
/**
 * Qualifies every column name of `row` with `label`, producing `label.column`.
 *
 * @param label the node label to prepend
 * @param row   column name -> cell value
 * @return the same cells under the prefixed column names
 */
def nextRowUnderLabel(label: String)(row: Map[String, String]): Map[String, String] =
  for ((column, cell) <- row) yield (s"$label.$column", cell)
/**
 * Flattens a tree into rows of column name -> cell value.
 *
 * The node's own values form one row (see `mkRow`). Children are grouped by
 * label and each child is flattened recursively; `join` then combines the
 * node's row with those per-label results. Finally every column name is
 * prefixed with this node's label.
 *
 * @param root the (sub)tree to flatten
 * @return the rows describing `root`, with columns qualified by the label path
 */
def csv(root: Tree): List[Map[String, String]] = {
  val ownRow = mkRow(root.values)

  // One entry per distinct child label, holding the flattened rows of every child with that label.
  // A leaf needs no special case: with no children this list is empty and `join`
  // hands back the single own-row matrix unchanged.
  val matricesPerLabel: List[List[List[Map[String, String]]]] =
    root.children.groupBy(_.label).values.map(sameLabel => sameLabel.map(csv)).toList

  join(matricesPerLabel, List(ownRow)).map(nextRowUnderLabel(root.label))
}
/**
 * Combines `thisMatrix` with the flattened children of a node, one label group at a time.
 *
 * For the first label group, every child matrix is paired row-by-row with
 * `thisMatrix` (each pair merged with `++`, so the child's entry wins on a key
 * clash) and the result is joined recursively with the remaining label groups.
 * The outcomes for the individual children of a group are concatenated.
 *
 * @param labelsChildrenRowsCols per label, per child with that label, the child's rows
 * @param thisMatrix             the rows accumulated so far
 * @return the combined rows; `thisMatrix` itself when there are no label groups
 */
def join(labelsChildrenRowsCols: List[List[List[Map[String, String]]]],
         thisMatrix: List[Map[String, String]]): List[Map[String, String]] =
  labelsChildrenRowsCols match {
    case Nil =>
      thisMatrix
    case currentLabelMatrices :: remainingLabels =>
      currentLabelMatrices.flatMap { childMatrix =>
        // Every accumulated row paired with every row of this child.
        val product = thisMatrix.flatMap(ours => childMatrix.map(theirs => ours ++ theirs))
        join(remainingLabels, product)
      }
  }
/**
 * Splits a list into its first element and the remaining elements.
 *
 * @param list the list to split
 * @tparam T element type
 * @return `Some((head, tail))` for a non-empty list, `None` for an empty one
 */
def headAndTailOption[T](list: List[T]): Option[(T, List[T])] =
  list.headOption.map(first => (first, list.tail))
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment