Created
July 16, 2023 06:06
-
-
Save usametov/3c46a8012d5913e53f2810b28d0c5af6 to your computer and use it in GitHub Desktop.
Clojure implementation of LangChain's recursive splitter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns astanova.recursive-splitter
  "implementation of LangChain's recursive code splitter"
  (:require [clojure.string :as s]))
;; Regex separators for Java source. `recursive-split` applies them in vector
;; order (first element first). Keyword patterns are anchored with \b so they
;; only match whole words: an unanchored #"if" or #"for" would also cut
;; identifiers such as "notify" or "format" in half.
(defonce java-splitters [#"\bclass " #"\bpublic " #"\bprotected "
                         #"\bprivate " #"\bstatic " #"\bif\b"
                         #"\bfor\b" #"\bwhile\b" #"\bswitch\b"
                         #"\bcase\b" #"\r\n" #"\t\t"])
;; Regex separators for JavaScript source. `recursive-split` applies them in
;; vector order (first element first). Keyword patterns are anchored with \b so
;; they only match whole words (e.g. "outlet " or "format" are left intact).
(defonce js-splitters [#"\bfunction " #"\bconst " #"\blet "
                       #"\bvar " #"\bclass " #"\bif\b" #"\bfor\b"
                       #"\bwhile\b" #"\bswitch\b" #"\bcase\b" #"\bdefault "])
;; Regex separators for Python source. `recursive-split` applies them in vector
;; order (first element first). The keyword patterns are anchored with \b so
;; words that merely end in the keyword (e.g. "subclass ") are not split.
(defonce py-splitters [#"\bclass " #"\bdef " #"\n\tdef " #"\n\n"])
;; Regex separators for C++ source. `recursive-split` applies them in vector
;; order (first element first). Keyword patterns are anchored with \b so they
;; only match whole words: an unanchored #"int " would also split "print " or
;; "point ", and an unanchored #"for" would split "format".
(defonce cpp-splitters [#"\bclass " #"\bvoid " #"\bint " #"\bfloat " #"\bdouble " #"\bif\b"
                        #"\bfor\b" #"\bwhile\b" #"\bswitch\b" #"\bcase\b" #"\n\n"])
;; Default maximum chunk length, in characters. Text no longer than this is
;; never split further.
(defonce chunk-size 400)

(defn build-splitter
  "Returns a function of one string that splits it on `regex`, but only when
  the string is longer than `max-size` characters; a string at or under the
  limit is returned unchanged as a one-element vector.

  `max-size` is optional and defaults to `chunk-size`.

  The text matched by `regex` is not part of the resulting pieces, because
  clojure.string/split discards the delimiters."
  ([regex]
   (build-splitter regex chunk-size))
  ([regex max-size]
   (fn [txt]
     (if (< max-size (count txt))
       (s/split txt regex)
       [txt]))))
(defn split-step
  "Performs one splitting pass.

  Takes a state map with
    :txt-seq   - a sequence of strings (the current chunks)
    :splitters - a sequence of one-argument functions, each returning a
                 sequence of strings for a given string

  Applies the first splitter to every chunk, drops blank results, trims the
  rest, and returns a new state map holding the new chunks and the remaining
  splitters. Returns nil when there are no splitters left, which callers use
  as the stop signal."
  [{:keys [txt-seq splitters]}]
  (when (seq splitters)
    {:txt-seq   (->> txt-seq
                     (mapcat (first splitters))
                     (remove s/blank?)
                     (map s/trim))
     :splitters (rest splitters)}))
(defn recursive-split
  "Splits `code-txt` into chunks by applying `regex-separators` one after
  another, in order.

  Each regex is turned into a size-gated splitter with `build-splitter`, and
  `split-step` is run repeatedly until it reports that no splitters remain.
  Returns the final sequence of chunks (trimmed, with blank chunks removed).

  If `regex-separators` is empty, the result is a one-element vector holding
  `code-txt` unchanged. A chunk may still be longer than the size limit when
  none of the separators match inside it."
  [code-txt regex-separators]
  (loop [state {:txt-seq   [code-txt]
                :splitters (map build-splitter regex-separators)}]
    ;; split-step yields nil once the splitters are exhausted; the state seen
    ;; just before that holds the final chunks.
    (if-let [next-state (split-step state)]
      (recur next-state)
      (:txt-seq state))))
(comment
  ;; REPL scratchpad; nothing in this form is evaluated when the file loads.
  ;; Note: `slurp` on a URL performs a network fetch.
  (def java-code
    (slurp "https://raw.githubusercontent.com/Convex-Dev/convex/develop/convex-cli/src/main/java/convex/cli/AccountBalance.java"))

  (recursive-split java-code java-splitters)
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment