patrick-nicodemus · March 16, 2026 16:11
diff --git a/stock_file_merge.ml b/stock_file_merge.ml
 module LeftistHeap(O : Map.OrderedType) : sig
  type t (* The type of heaps. *)
  type elt = O.t

  val empty : t
  val push : elt -> t -> t

  (** Raises "Not_found" when empty. *)
  val pop : t -> elt * t

  (** Raises "Not_found" when empty. *)
  val peek : t -> elt
 end = struct
  [@@@warning "-32"]
  
  (* Some definitions. *)
  type t =
    | Node of t * int * O.t * t (* Left child, rank, value, right child *)
    | Empty

  type elt = O.t
  (*
     A binary tree is said to be min-heap-ordered if, for each node,
     the value at that node is less than or equal to the value at both of its children.
     Thus, following a path from root to leaf, values in the tree are monotonically increasing.
   *)

  let is_min_heap : t -> bool =
    let rec helper (parent_val : elt) : t -> bool =
      function
      | Node(left, _, value, right) ->
        O.compare parent_val value <= 0 &&
        helper value left &&
        helper value right
      | Empty -> true
    in 
    function 
    | Node(left, _, value, right) ->
      helper value left && helper value right
    | Empty -> true

  (*
     Define the *right spine* of a binary tree to be the path from the root to the rightmost child.
     Define the *rank* of a node in a binary tree to be the length of the right spine.
  *)

  let rank =
    let rec rank_helper (accumulator : int) : t -> int = 
      function
      | Node ( _, _, _, right) -> rank_helper (accumulator + 1) right
      | Empty -> 0
    in
    rank_helper 0

  (* A binary tree is *leftist*
     if, for each node, the rank of its left child is greater than or equal to the rank of its right child.
  *)
  type is_leftist_helper =
        | NotLeftist
        | Leftist of int (* rank *)

  let is_leftist : t -> bool =
    let rec rank_helper : t -> is_leftist_helper =
      function
      | Node(left, _, _, right) ->
        begin
          match rank_helper right with
          | NotLeftist -> NotLeftist
          | Leftist rank ->
            begin
              match rank_helper left with
              | NotLeftist -> NotLeftist
              | Leftist _ -> Leftist(rank + 1)
            end
        end
      | Empty -> Leftist 0
    in
    fun heap -> match rank_helper heap with
      | NotLeftist -> false
      | Leftist _ -> true


  (*
     Lemma:
     In a leftist binary tree, the length of the right spine is less than or equal to the length of any other path from the root to a leaf.

     Proof:
     Let us induct on the tree.
     - If the tree is empty, it is trivial.
     - If the tree is Node(left, value, right), and the claim is assumed to be true for the left and right subtrees,
       then consider an arbitrary path L :: R :: L :: R :: R ... from root to leaf.
       - If the path is of the form (L :: p), then length(p) >= rank(left) by induction, so length(L :: p) = 1 + length(p) >= 1 + rank(left) >= 1 + rank(right) = rank(root).
       - If the path is of the form (R :: p) then length(R :: p) = 1 + length(p) >= 1 + rank(right) = rank(root).
       
     Corollary:
     A leftist tree with a right spine of length n has at least 2^n - 1 nodes. Conversely, if a leftist tree has N nodes, the length of the right spine is at most log_2(N).

     A leftist tree may be completely unbalanced on the left, for example, it may degenerate to a linked list on the left.
     This is fine because the operations we will consider are all contingent upon the length of the right spine.

     Note that leftist trees do not necessarily have monotonically increasing path lengths from left to right. For example, the following tree is a leftist heap.

       0
      / \
     1   2
        /
       3
  *)

  (* We implement "merge" first, because both push and pop can be easily implemented in terms of merge.

     - Pushing an element [e] into a heap [h] is equivalent to merging the heap [h] with a singleton heap [Node(Empty,1,e, Empty)].
     - Deleting an element from the heap is straightforwardly equivalent to the problem of merging the left and right subtrees into a single heap.

     The idea behind the algorithm is as follows.

     A binary tree with structure

             v0
           /   \
        (l0)    v1
               /  \
            (l1)   v2
                  /
                (l2)

     where v0, v1, v2 are values and l1, l2, l3 are subtrees,
     can be thought of as a list of ordered pairs
     [(v0, l0), (v1,l1), (v2, l2)].

     If two heaps are merged in this way, *as lists*,
     using the ordinary list mergesort, compared by
     the value v,
     the heap ordering property is preserved.

     For example, if we merge
     [(v0, l0), (v1, l1)] with [(v0', l0'), (v1', l1')], then
     - the heap property of each of l0, l1, l0', l1' is
       preserved because none of these are modified
     - the fact that v_i <= root(l_i) and v_i' <= root(l_i')
       is preserved, because the pairs aren't modified
     - the fact that the list is sorted by the keys v_i, v_i'
       guarantees that every element is greater than its right child
       along the right spine.

     The other thing that needs to be addressed is maintaining
     the leftist property, which preserves the balancing.

     The basic observation here is that if [t1] is a leftist tree of rank [r1],
     and [t2] is a leftist tree of rank [r2], then for some value [v],

     [if r1 <= r2 then Node(t1, v, t2) else Node(t2, v, t1)]

     is obviously a leftist tree of rank min(r1,r2) + 1.

     Thus, we can ensure that the tree is leftist on the way "back up"
     after merging, by swapping the child trees to ensure the leftist property.
     This is the reason it is necessary here to store the rank of each node.
     
  *)

  (** Assumption:
      [subtree1], [subtree2] are leftist heaps,
      and [value] is less than or equal to the
      root values of [subtree1] and [subtree2] when these exist.

      Returns a leftist heap with root [value] and [subtree1], [subtree2] as subtrees.
  *)
  let create (value : elt) (subtree1 : t) (subtree2 : t) =
    match subtree1 with
    | Empty -> Node(subtree2, 1, value, Empty)
    | Node(_, rank1, _, _) as h1 ->
      begin
        match subtree2 with
        | Empty -> Node(h1, 1, value, Empty)
        | Node(_, rank2, _, _) as h2 ->
          if rank1 <= rank2 then
            Node(h2, rank1 + 1, value, h1)
          else
            Node(h1, rank1 + 1, value, h2)
      end

  (* Inductive hypothesis: h1, h2 are leftist heaps. *)
  let rec merge (h1 : t) (h2 : t) : t =
    match h1, h2 with
      | Empty, _ -> h2
      | _, Empty -> h1
      | Node(l1, _, val1, r1) as h1, (Node(l2, _, val2, r2) as h2) ->
        begin
          (* Sort the list by the *values* of the roots; val1 is the "head" of list1, and val2 is the "head" of list2 *)
          if O.compare val1 val2 <= 0 then
            let merged_subtree = merge r1 h2 in
            (* Node(l1, val1, merged_subtree) is heap-ordered but may not be leftist, so call [create]*)
            create val1 l1 merged_subtree
          else
            let merged_subtree = merge h1 r2 in
            (* Node(l2, val2, merged_subtree) is heap ordered but may not be leftist. *)
            create val2 l2 merged_subtree
        end

  let singleton x = Node(Empty, 1, x, Empty)

  let pop (heap : t) : elt * t =
    match heap with
    | Empty -> raise Not_found
    | Node(left, _, value, right) -> (value, merge left right)

  let rec push (x : elt) (heap : t) : t =
    match heap with
    | Empty -> singleton x
    | Node(left, _, value, right) ->
      if O.compare x value <= 0 then Node(heap, 1, x, Empty)
      else
        create value left (push x right)

  let push (x : elt) (heap : t) : t =
    merge (singleton x) heap

  let empty = Empty
  let peek = function
    | Node(_, _, value, _) -> value
    | Empty -> raise Not_found
 end


 (**
    Cool application:
    Using a heap, we can merge an array of sequences
    into a single sequence.
 *)
 module MergeSequences(O : Map.OrderedType) = struct
  module Heap = LeftistHeap(struct
      type t = O.t * O.t Seq.t
      let compare : t -> t -> int = fun (i, _) (j, _) -> O.compare i j
    end)

  (* Merge an array of sorted sequences/ lazy streams into a single sequence/lazy stream. *)
  let merge : O.t Seq.t array -> O.t Seq.t =
    fun arr ->
    let initial_heap = Array.fold_left (fun current_heap input_sequence ->
        match input_sequence () with
        | Seq.Cons(head, tail) -> Heap.push (head, tail) current_heap
        | Seq.Nil -> current_heap
      ) Heap.empty arr
    in
    let rec seq current_heap () =
      match Heap.pop current_heap with
      | exception Not_found -> Seq.Nil
      | ((head, tail), rest) ->
        Seq.Cons(head, seq @@ match tail () with
          | Seq.Nil -> rest
          | Seq.Cons(hd2, tl2) -> Heap.push (hd2, tl2) rest
          )
    in
    seq initial_heap
 end

 (* We test the merge function
   with a textbook example:
   you are given a directory containing 500 text files,
   each of which contains the trading history for a certain stock in the S&P 500,
   all on the same date.

   Each line of the file represents one trade, and contains
   the time of the trade, the price of the trade, and the volume of the trade,
   separated by whitespace. The lines of each file are already ordered
   chronologically, in format:

   Time Price Volume

   e.g.,
   9:00:05.06 86.51 16.324748  
   
   The exercise is to merge all the files together into a single file
   containing all trades from all 500 files in chronological order, with lines in format
   
   Time Ticker Price Volume

   You must assume that each file is too large to load into memory,
   and thus you are forced to use a lazy stream to read the results and
   append to the end of the current file.
 *)


 (* Let's say the file names are just "MSFT, AAPL", and so on. *)
 let filenames stock_directory = Sys.readdir stock_directory |> Array.map (Filename.concat stock_directory)

 type trade = {
  time : float; (* Time in seconds since 12:00am *)
  ticker : string;
  price : float;
  volume : float
 }

 let compare trade1 trade2 = Float.compare trade1.time trade2.time

 let read_file (filename : string) : trade Seq.t = 
  let stem = Filename.basename filename in (* stem is just "MSFT", "AAPL", etc. *)
  let input = Scanf.Scanning.open_in filename in
  let rec seq = fun () ->
    try
      Seq.Cons(Scanf.bscanf input "%d:%d:%f %f %f\n" (fun hh mm ss price volume -> { time = Float.of_int (3600 * hh + 60 * mm) +. ss; ticker = stem; price; volume }), seq)
    with
    | End_of_file -> Seq.Nil
  in
  seq
  
 let get_all_trades stock_directory = Array.map read_file (filenames stock_directory)

 module MergeTrades = MergeSequences(struct type t = trade let compare = compare end)

 let time_to_triple : float -> int * int * float =
  fun time ->
  let hh = Float.to_int (time /. (3600.)) in
  let mm = Float.to_int ((time -. (Float.of_int (3600*hh))) /. (60.)) in
  let ss = (time -. (Float.of_int (3600*hh + 60*mm))) in
  (hh,mm,ss)

 let run_on_directory stock_directory output_file =
  let output_stream = MergeTrades.merge (get_all_trades stock_directory) in
  let output_file = Out_channel.open_text output_file in 
  Seq.iter (fun trade ->
      let (hh, mm, ss) = time_to_triple trade.time in 
      Printf.fprintf output_file "%2d:%02d:%05.2f %s %f %f\n" hh mm ss trade.ticker trade.price trade.volume
    ) output_stream;
  Out_channel.close output_file

 (*

 9:00:05.06 BRK.B 86.511012 16.324748
 9:00:05.38 JPM 129.787305 12.971176
 9:00:06.35 TSLA 85.950765 83.917248
 9:00:08.24 META 128.722486 84.949220
 9:00:12.92 TSLA 87.567947 12.716145
 9:00:13.06 AMZN 105.727577 96.270682
 9:00:13.27 META 134.866624 67.115740
 9:00:14.56 GOOGL 129.590207 20.087959
 9:00:16.39 META 136.334271 73.743344
 9:00:18.61 MSFT 93.187456 73.665966
 9:00:19.93 BRK.B 83.910280 66.070099
 9:00:20.77 JPM 131.393396 94.620016
 9:00:21.62 AMZN 109.068694 5.724483
 9:00:23.22 GOOGL 126.529225 89.509342
 9:00:23.75 AVGO 107.377276 60.652863
 9:00:24.39 AVGO 107.988566 15.997441
 9:00:26.30 MSFT 92.013919 88.819089
 9:00:27.01 NVDA 80.910522 63.766271

 *)
	module LeftistHeap(O : Map.OrderedType) : sig
	type t (* The type of heaps. *)
	type elt = O.t

	val empty : t
	val push : elt -> t -> t

	(** Raises "Not_found" when empty. *)
	val pop : t -> elt * t

	(** Raises "Not_found" when empty. *)
	val peek : t -> elt
	end = struct
	[@@@warning "-32"]

	(* Some definitions. *)
	type t =
	\| Node of t * int * O.t * t (* Left child, rank, value, right child *)
	\| Empty

	type elt = O.t
	(*
	A binary tree is said to be min-heap-ordered if, for each node,
	the value at that node is less than or equal to the value at both of its children.
	Thus, following a path from root to leaf, values in the tree are monotonically increasing.
	*)

	let is_min_heap : t -> bool =
	let rec helper (parent_val : elt) : t -> bool =
	function
	\| Node(left, _, value, right) ->
	O.compare parent_val value <= 0 &&
	helper value left &&
	helper value right
	\| Empty -> true
	in
	function
	\| Node(left, _, value, right) ->
	helper value left && helper value right
	\| Empty -> true

	(*
	Define the right spine of a binary tree to be the path from the root to the rightmost child.
	Define the rank of a node in a binary tree to be the length of the right spine.
	*)

	let rank =
	let rec rank_helper (accumulator : int) : t -> int =
	function
	\| Node ( _, _, _, right) -> rank_helper (accumulator + 1) right
	\| Empty -> 0
	in
	rank_helper 0

	(* A binary tree is leftist
	if, for each node, the rank of its left child is greater than or equal to the rank of its right child.
	*)
	type is_leftist_helper =
	\| NotLeftist
	\| Leftist of int (* rank *)

	let is_leftist : t -> bool =
	let rec rank_helper : t -> is_leftist_helper =
	function
	\| Node(left, _, _, right) ->
	begin
	match rank_helper right with
	\| NotLeftist -> NotLeftist
	\| Leftist rank ->
	begin
	match rank_helper left with
	\| NotLeftist -> NotLeftist
	\| Leftist _ -> Leftist(rank + 1)
	end
	end
	\| Empty -> Leftist 0
	in
	fun heap -> match rank_helper heap with
	\| NotLeftist -> false
	\| Leftist _ -> true


	(*
	Lemma:
	In a leftist binary tree, the length of the right spine is less than or equal to the length of any other path from the root to a leaf.

	Proof:
	Let us induct on the tree.
	- If the tree is empty, it is trivial.
	- If the tree is Node(left, value, right), and the claim is assumed to be true for the left and right subtrees,
	then consider an arbitrary path L :: R :: L :: R :: R ... from root to leaf.
	- If the path is of the form (L :: p), then length(p) >= rank(left) by induction, so length(L :: p) = 1 + length(p) >= 1 + rank(left) >= 1 + rank(right) = rank(root).
	- If the path is of the form (R :: p) then length(R :: p) = 1 + length(p) >= 1 + rank(right) = rank(root).

	Corollary:
	A leftist tree with a right spine of length n has at least 2^n - 1 nodes. Conversely, if a leftist tree has N nodes, the length of the right spine is at most log_2(N).

	A leftist tree may be completely unbalanced on the left, for example, it may degenerate to a linked list on the left.
	This is fine because the operations we will consider are all contingent upon the length of the right spine.

	Note that leftist trees do not necessarily have monotonically increasing path lengths from left to right. For example, the following tree is a leftist heap.

	0
	/ \
	1 2
	/
	3
	*)

	(* We implement "merge" first, because both push and pop can be easily implemented in terms of merge.

	- Pushing an element [e] into a heap [h] is equivalent to merging the heap [h] with a singleton heap [Node(Empty,1,e, Empty)].
	- Deleting an element from the heap is straightforwardly equivalent to the problem of merging the left and right subtrees into a single heap.

	The idea behind the algorithm is as follows.

	A binary tree with structure

	v0
	/ \
	(l0) v1
	/ \
	(l1) v2
	/
	(l2)

	where v0, v1, v2 are values and l1, l2, l3 are subtrees,
	can be thought of as a list of ordered pairs
	[(v0, l0), (v1,l1), (v2, l2)].

	If two heaps are merged in this way, as lists,
	using the ordinary list mergesort, compared by
	the value v,
	the heap ordering property is preserved.

	For example, if we merge
	[(v0, l0), (v1, l1)] with [(v0', l0'), (v1', l1')], then
	- the heap property of each of l0, l1, l0', l1' is
	preserved because none of these are modified
	- the fact that v_i <= root(l_i) and v_i' <= root(l_i')
	is preserved, because the pairs aren't modified
	- the fact that the list is sorted by the keys v_i, v_i'
	guarantees that every element is greater than its right child
	along the right spine.

	The other thing that needs to be addressed is maintaining
	the leftist property, which preserves the balancing.

	The basic observation here is that if [t1] is a leftist tree of rank [r1],
	and [t2] is a leftist tree of rank [r2], then for some value [v],

	[if r1 <= r2 then Node(t1, v, t2) else Node(t2, v, t1)]

	is obviously a leftist tree of rank min(r1,r2) + 1.

	Thus, we can ensure that the tree is leftist on the way "back up"
	after merging, by swapping the child trees to ensure the leftist property.
	This is the reason it is necessary here to store the rank of each node.

	*)

	(** Assumption:
	[subtree1], [subtree2] are leftist heaps,
	and [value] is less than or equal to the
	root values of [subtree1] and [subtree2] when these exist.

	Returns a leftist heap with root [value] and [subtree1], [subtree2] as subtrees.
	*)
	let create (value : elt) (subtree1 : t) (subtree2 : t) =
	match subtree1 with
	\| Empty -> Node(subtree2, 1, value, Empty)
	\| Node(_, rank1, _, _) as h1 ->
	begin
	match subtree2 with
	\| Empty -> Node(h1, 1, value, Empty)
	\| Node(_, rank2, _, _) as h2 ->
	if rank1 <= rank2 then
	Node(h2, rank1 + 1, value, h1)
	else
	Node(h1, rank1 + 1, value, h2)
	end

	(* Inductive hypothesis: h1, h2 are leftist heaps. *)
	let rec merge (h1 : t) (h2 : t) : t =
	match h1, h2 with
	\| Empty, _ -> h2
	\| _, Empty -> h1
	\| Node(l1, _, val1, r1) as h1, (Node(l2, _, val2, r2) as h2) ->
	begin
	(* Sort the list by the values of the roots; val1 is the "head" of list1, and val2 is the "head" of list2 *)
	if O.compare val1 val2 <= 0 then
	let merged_subtree = merge r1 h2 in
	(* Node(l1, val1, merged_subtree) is heap-ordered but may not be leftist, so call [create]*)
	create val1 l1 merged_subtree
	else
	let merged_subtree = merge h1 r2 in
	(* Node(l2, val2, merged_subtree) is heap ordered but may not be leftist. *)
	create val2 l2 merged_subtree
	end

	let singleton x = Node(Empty, 1, x, Empty)

	let pop (heap : t) : elt * t =
	match heap with
	\| Empty -> raise Not_found
	\| Node(left, _, value, right) -> (value, merge left right)

	let rec push (x : elt) (heap : t) : t =
	match heap with
	\| Empty -> singleton x
	\| Node(left, _, value, right) ->
	if O.compare x value <= 0 then Node(heap, 1, x, Empty)
	else
	create value left (push x right)

	let push (x : elt) (heap : t) : t =
	merge (singleton x) heap

	let empty = Empty
	let peek = function
	\| Node(_, _, value, _) -> value
	\| Empty -> raise Not_found
	end


	(**
	Cool application:
	Using a heap, we can merge an array of sequences
	into a single sequence.
	*)
	module MergeSequences(O : Map.OrderedType) = struct
	module Heap = LeftistHeap(struct
	type t = O.t * O.t Seq.t
	let compare : t -> t -> int = fun (i, _) (j, _) -> O.compare i j
	end)

	(* Merge an array of sorted sequences/ lazy streams into a single sequence/lazy stream. *)
	let merge : O.t Seq.t array -> O.t Seq.t =
	fun arr ->
	let initial_heap = Array.fold_left (fun current_heap input_sequence ->
	match input_sequence () with
	\| Seq.Cons(head, tail) -> Heap.push (head, tail) current_heap
	\| Seq.Nil -> current_heap
	) Heap.empty arr
	in
	let rec seq current_heap () =
	match Heap.pop current_heap with
	\| exception Not_found -> Seq.Nil
	\| ((head, tail), rest) ->
	Seq.Cons(head, seq @@ match tail () with
	\| Seq.Nil -> rest
	\| Seq.Cons(hd2, tl2) -> Heap.push (hd2, tl2) rest
	)
	in
	seq initial_heap
	end

	(* We test the merge function
	with a textbook example:
	you are given a directory containing 500 text files,
	each of which contains the trading history for a certain stock in the S&P 500,
	all on the same date.

	Each line of the file represents one trade, and contains
	the time of the trade, the price of the trade, and the volume of the trade,
	separated by whitespace. The lines of each file are already ordered
	chronologically, in format:

	Time Price Volume

	e.g.,
	9:00:05.06 86.51 16.324748

	The exercise is to merge all the files together into a single file
	containing all trades from all 500 files in chronological order, with lines in format

	Time Ticker Price Volume

	You must assume that each file is too large to load into memory,
	and thus you are forced to use a lazy stream to read the results and
	append to the end of the current file.
	*)


	(* Let's say the file names are just "MSFT, AAPL", and so on. *)
	let filenames stock_directory = Sys.readdir stock_directory \|> Array.map (Filename.concat stock_directory)

	type trade = {
	time : float; (* Time in seconds since 12:00am *)
	ticker : string;
	price : float;
	volume : float
	}

	let compare trade1 trade2 = Float.compare trade1.time trade2.time

	let read_file (filename : string) : trade Seq.t =
	let stem = Filename.basename filename in (* stem is just "MSFT", "AAPL", etc. *)
	let input = Scanf.Scanning.open_in filename in
	let rec seq = fun () ->
	try
	Seq.Cons(Scanf.bscanf input "%d:%d:%f %f %f\n" (fun hh mm ss price volume -> { time = Float.of_int (3600 * hh + 60 * mm) +. ss; ticker = stem; price; volume }), seq)
	with
	\| End_of_file -> Seq.Nil
	in
	seq

	let get_all_trades stock_directory = Array.map read_file (filenames stock_directory)

	module MergeTrades = MergeSequences(struct type t = trade let compare = compare end)

	let time_to_triple : float -> int * int * float =
	fun time ->
	let hh = Float.to_int (time /. (3600.)) in
	let mm = Float.to_int ((time -. (Float.of_int (3600*hh))) /. (60.)) in
	let ss = (time -. (Float.of_int (3600hh + 60mm))) in
	(hh,mm,ss)

	let run_on_directory stock_directory output_file =
	let output_stream = MergeTrades.merge (get_all_trades stock_directory) in
	let output_file = Out_channel.open_text output_file in
	Seq.iter (fun trade ->
	let (hh, mm, ss) = time_to_triple trade.time in
	Printf.fprintf output_file "%2d:%02d:%05.2f %s %f %f\n" hh mm ss trade.ticker trade.price trade.volume
	) output_stream;
	Out_channel.close output_file

	(*

	9:00:05.06 BRK.B 86.511012 16.324748
	9:00:05.38 JPM 129.787305 12.971176
	9:00:06.35 TSLA 85.950765 83.917248
	9:00:08.24 META 128.722486 84.949220
	9:00:12.92 TSLA 87.567947 12.716145
	9:00:13.06 AMZN 105.727577 96.270682
	9:00:13.27 META 134.866624 67.115740
	9:00:14.56 GOOGL 129.590207 20.087959
	9:00:16.39 META 136.334271 73.743344
	9:00:18.61 MSFT 93.187456 73.665966
	9:00:19.93 BRK.B 83.910280 66.070099
	9:00:20.77 JPM 131.393396 94.620016
	9:00:21.62 AMZN 109.068694 5.724483
	9:00:23.22 GOOGL 126.529225 89.509342
	9:00:23.75 AVGO 107.377276 60.652863
	9:00:24.39 AVGO 107.988566 15.997441
	9:00:26.30 MSFT 92.013919 88.819089
	9:00:27.01 NVDA 80.910522 63.766271

	*)
No results found