Created
February 6, 2025 17:39
-
-
Save kmicinski/9986f2e78b5740eb2bf399f6bc40a090 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
;; Algebraic Data II | |
;; | |
;; CIS352 2/6/25 | |
;; Note: for this lecture I am transliterating some very nice | |
;; notes from https://abhiroop.github.io/Haskell-Red-Black-Tree/ | |
;; (written by Abhiroop Sarkar) | |
;; Last class we talked about algebraic data, the kind of | |
;; data we can build in Racket by trees of tagged S-expressions | |
;; | |
;; We mentioned that these data types were, among other things | |
;; (like builtin types for numbers, etc...), built using sum | |
;; and product types, allowing us to write types such as | |
;; A + B * C (either A or a pair of B and C). | |
;; In this class, our goal is to practice writing functions | |
;; over algebraic datatypes. | |
;; | |
;; Our goal is to implement Red/Black Trees (RB-Trees) | |
;; | |
;; RB-Trees are balanced binary search trees (BSTs). | |
;; | |
;; Balanced BSTs support O(log(n)) insertion and lookup, | |
;; and can form the basis for other data structures | |
;; such as sets / maps (e.g., make the nodes be key/ | |
;; value pairs). Here, n is the number of nodes in the | |
;; tree, i.e., the amount of data being stored by the tree. | |
;; | |
;; The issue with naive BSTs is that chaotic insertion | |
;; orders can cause the tree's performance to devolve to | |
;; O(n) performance. For example, inserting '(1 2 3 4 ...) | |
;; into a naive BST would result in (essentially) a linked | |
;; list: a tree with no left nodes, only a long right | |
;; spine of linked nodes. This is essentially a linked list, | |
;; except for the fact that there is a wasted space for the | |
;; left child (storing the empty tree in each instance). | |
;; | |
;; Let's code this up to understand. | |
;; Warmup: we sketch a naive BST, which stores sets of | |
;; values (we assume they are ordered via >, < and equal? | |
(define (naive-tree? t) | |
(match t | |
['empty #t] | |
[`(node ,v ,(? naive-tree? t0) ,(? naive-tree? t1)) #t] | |
[_ #f])) | |
;; naive BST insertion walks down to the appropriate | |
;; place and adds the node. | |
(define (naive-tree-insert t i) | |
(match t | |
['empty `(node ,i empty empty)] | |
[`(node ,v ,t0 ,t1) | |
(cond [(equal? v i) t] ;; value already in the tree | |
[(> i v) `(node ,v ,t0 ,(naive-tree-insert t1 i))] | |
[(< i v) `(node ,v ,(naive-tree-insert t0 i) ,t1)])])) | |
;; predicate: is i a member of t? | |
(define (naive-tree-member? t i) | |
(match t | |
['empty #f] ;; hit the end of the search | |
[`(node ,v ,_ ,_) #:when (equal? v i) #t] | |
[`(node ,v ,t0 ,_) #:when (< v i) | |
(naive-tree-member? t0 i)] | |
[`(node ,v ,_ ,t1) #:when (> v i) | |
(naive-tree-member? t1 i)])) | |
;; insert multiple elements (ls) into the naive | |
;; BST bst | |
(define (insert-multiple ls bst) | |
(match ls | |
['() bst] ;; just return the starting bst | |
[`(,fst . ,tl) ;; insert fst followed by tl (the rest) | |
(let ([after-inserting-fst (naive-tree-insert bst fst)]) | |
;; call insert-multiple to do the rest of the work | |
(insert-multiple tl after-inserting-fst))])) | |
;; DEMO: inserting 10 elements into the naive BST | |
#;(insert-multiple (range 10) 'empty) | |
;; DEMO: build a tree of size 10k | |
(displayln "beginning timer") | |
(let ([t (time | |
(displayln "starting to build tree of 10k elements") | |
(let ([result (insert-multiple (range 1 10000) 'empty)]) | |
(displayln "done") | |
(displayln "now querying to find all 10k elements (hope to see #t):") | |
(andmap (λ (x) (naive-tree-member? result x)) (range 1 10000))))]) | |
(displayln "total printed above")) | |
;; Red-Black Trees | |
;; | |
;; Naive BSTs suffer some serious impediments | |
;; in practice--especially when insertion order may follow | |
;; chaotic patterns that degrade performance. | |
;; | |
;; A solution is balanced BSTs. The general idea in a balanced | |
;; BST is to occasionally "rebalance" the BST when it gets too | |
;; far out of balance. There are myriad implementations of | |
;; balanced BSTs, all of which balance aspects such as amortization | |
;; (the degree to which you do work now to save time later) | |
;; mutability/persistence, and etc. | |
;; | |
;; AVL trees, for example, maintain a very strict balancing | |
;; profile--doing lots of work to ensure that the tree is | |
;; strictly balanced at every point in time. | |
;; | |
;; RB trees, by contrast, allow the tree to go a bit out of | |
;; balance--but not too far. We will bound the imbalance to | |
;; a factor of 2x by construction of the properties of the | |
;; RB tree. This enables us to do a bit less work at insertion | |
;; time, and ends up being what we want for many common | |
;; workloads. | |
;; The RB tree will look quite a bit like the naive tree, | |
;; except nodes will be either red or black: | |
(define (rbtree? t) | |
(match t | |
['(black empty) #t] | |
[`(red ,v ,(? rbtree? t0) ,(? rbtree? t1)) #t] | |
[`(black ,v ,(? rbtree? t0) ,(? rbtree? t1)) #t] | |
[_ #f])) | |
;; Notice that all nodes begin with a tag, 'red or 'black | |
;; Also, empty nodes must be colored black--no exceptions. | |
;; The RB tree invariants are: | |
;; 1. If a node is red, all its children are black. | |
;; 2. Every path from the root to every empty tree | |
;; contains the same number of black nodes. | |
;; We say the "black height" along all paths | |
;; is equivalent. | |
;; 3. The root and leaves are both black. | |
;; | |
;; Warmup: measure the height and the black-height of | |
;; RB-trees | |
(define (rbtree-height t) | |
'todo) | |
(define (rbtree-black-height t) | |
'todo) | |
;; is e a member of t? | |
;; this is very similar to the BST search | |
(define (rbtree-member? t e) | |
(match t | |
['empty #f] | |
[`(,color ,v ,t0 ,t1) | |
(cond [(equal? v e) #t] | |
[(< e v) (rbtree-member? t0 e)] | |
[(> e v) (rbtree-member? t1 e)])])) | |
;; The RB invariants ensure the tree doesn't get too | |
;; far out of balance. In fact, the imbalance is | |
;; bounded by a factor of the tree height: if the | |
;; tree's minimum height (number of levels between | |
;; the root and an empty node) is n, the *maximum* | |
;; height is bounded to 2n. | |
;; | |
;; Proof (2x bound, informal proof): | |
;; | |
;; The reason is that because of property (2): | |
;; Consider every possible path p_i ∈ paths from the | |
;; root to any empty subtree. By construction, the | |
;; length of each p_i is equivalent. Let the black | |
;; height of all p_i be h_b (a natural number). The | |
;; path is comprised of only black nodes and red | |
;; nodes. By property (1), no red node may have a | |
;; red child. The total path length is the sum | |
;; of its black height plus its red height. | |
;; The red height may not be larger then h_b, because | |
;; a red node may not have a red child. Let this | |
;; red height be h_r <= h_b. Then, the maximum path | |
;; length is h_r + h_b <= 2 * h_b. QED. | |
;; RB Tree insertion | |
(define (rbtree-insert t i) | |
;; Insertion into the tree will follow similar | |
;; logic to the naive BST--we go either left | |
;; or right depending on if the value is less | |
;; than or greater than the value we wish to | |
;; insert. However, we also have to account | |
;; for the invariants. We will take the following | |
;; strategy: temporarily break the invariants | |
;; at the place where the node is inserted, | |
;; and then gradually restore the invariants on | |
;; the way back up to ensure that in the end | |
;; we maintain the RB Tree invariants. | |
;; We break this down into three phases: | |
;; - First, we do the insertion, traversing | |
;; into the right path through the tree. | |
;; -> Along the way, we build up calls to | |
;; `balance`, which ensures that--after | |
;; insertion occurs--we fix up the black | |
;; height to be homogenous across all paths. | |
;; - Next, once the insertion finally occurs: | |
;; (either the node already exists or a new node | |
;; is created with two empty children nodes) | |
;; we traverse back up the tree, fixing any | |
;; issues where we have two red nodes in a row. | |
;; We juggle the tree around by carefully | |
;; considering the different possible cases | |
;; in which invariant (1) may have been violated. | |
;; - Last, we may end up with a red node at the root | |
;; as a product of the last phase: we can safely | |
;; color it black to restore invariant (1) | |
;; the result of our balancing function might produce | |
;; a red node at the top, which can be safely recolored | |
;; black. | |
(define (recolor-root-black root) | |
(match root | |
[`(,color ,v ,t0 ,t1) `(black ,v ,t0 ,t1)])) | |
;; Once insertion occurs, we need to restore potentially | |
;; broken invariants where we have two red nodes in a row. | |
;; The key insight here is that this will happen in | |
;; four specific cases: | |
;; - black on top, red/red pair along left/left path | |
;; - black on top, red/red pair along left/right path | |
;; - black on top, red/red pair along the right/left path | |
;; - black on top, red/red pair along the right/right path | |
;; | |
;; no other instances matter! (We will recolor the root | |
;; black to avoid a final red/red occurence, say, at the | |
;; root.) Thus, we pass them all through. | |
(define (balance color v t0 t1) | |
(match `(,color ,t0 ,v ,t1) | |
;; balance B (T R (T R a x b) y c) z d = T R (T B a x b) y (T B c z d) | |
[`(black (red ,v0 (red ,v1 ,t0 ,t1) ,t2) ,v2 ,t3) | |
`(red ,v1 (black ,v0 ,t0 ,t1) (black ,v2 ,t2 ,t3))] | |
;; balance B (T R a x (T R b y c)) z d = T R (T B a x b) y (T B c z d) | |
[`(black (red ,v0 ,t0 (red ,v1 ,t1 ,t2)) ,v2 ,t3) | |
`(red ,v1 (black ,v0 ,t0 ,t1) (black ,v2 ,t2 ,t3))] | |
;; balance B a x (T R (T R b y c) z d) = T R (T B a x b) y (T B c z d) | |
[`(black ,t0 ,v0 (red ,v1 (red ,v2 ,t1 ,t2) ,t3)) | |
`(red ,v1 (black ,v0 ,t0 ,t1) (black ,v2 ,t2 ,t3))] | |
;; balance B a x (T R b y (T R c z d)) = T R (T B a x b) y (T B c z d) | |
[`(black ,t0 ,v0 (red ,v1 ,t1 (red ,v2 ,t2 ,t3))) | |
`(red ,v1 (black ,v0 ,t0 ,t1) (black ,v2 ,t2 ,t3))] | |
[_ `(,color ,v ,t0 ,t1)])) | |
;; recursive helper function, does the insertion | |
;; into the list like a binary search tree | |
;; | |
;; Intuitively: this function walks down into the | |
;; tree until we either find the element or the empty | |
;; tree. We leave a bunch of calls to `balance` on the | |
;; stack to ensure that we rebalance up the path | |
;; through which the insertion occurs. | |
(define (ins t) | |
(match t | |
['empty `(red ,i empty empty)] | |
[`(,color ,v ,t0 ,t1) | |
(cond [(equal? v i) t] ;; already present, just return t back | |
;; insert into the left subtree and | |
;; possibly rebalance | |
[(< i v) (balance color v (ins t0) t1)] | |
;; insert into the right subtree and | |
;; possibly rebalance | |
[(> i v) (balance color v t0 (ins t1))])])) | |
;; finally: where we actually make things happen | |
;; insert the node, and then fix any red/red | |
;; conflicts along the insertion path | |
(let ([inserted-mostly-fixed-tree (ins t)]) | |
;; now color the top node black (to avoid | |
;; a red/red conflict at the top)--this is | |
;; safe because the left/right subtree have | |
;; the same black height, which we extend | |
;; by 1. | |
(recolor-root-black inserted-mostly-fixed-tree))) | |
(define (rb-insert-multiple ls rbt) | |
(match ls | |
['() rbt] ;; just return the starting bst | |
[`(,fst . ,tl) ;; insert fst followed by tl (the rest) | |
(let ([after-inserting-fst (rbtree-insert rbt fst)]) | |
;; call insert-multiple to do the rest of the work | |
(rb-insert-multiple tl after-inserting-fst))])) | |
(displayln "starting to insert 10k") | |
(displayln "starting timer") | |
;; DEMO: RB Tree insertion / lookup | |
(time (let ([after-inserting-10k (rb-insert-multiple (range 1 10000) 'empty)]) | |
(displayln "inserted 10k") | |
(displayln "we hope we can find everything (#t if so):") | |
(andmap (λ (x) (rbtree-member? after-inserting-10k x)) (range 1 10000)))) | |
(rb-insert-multiple (range 1 20) 'empty) | |
;; Exercise (hard). Read these notes and then implement | |
;; red/black tree deletion: | |
;; - https://matt.might.net/articles/red-black-delete/ | |
;; - https://abhiroop.github.io/Haskell-Red-Black-Tree/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment