Created
August 22, 2021 18:20
-
-
Save breandan/384bbb6a12d922f686a925eb25941525 to your computer and use it in GitHub Desktop.
Method-level chunks using Dyck-slicing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Reports the number of entries in [embedding]. */
override fun dimensions(): Int {
  return embedding.size
}
============================== | |
/** Renders this item as the value of `loc.getContext(0)`. */
override fun toString(): String {
  return loc.getContext(0)
}
} | |
============================== | |
/**
 * Returns the vector index backed by [indexFile].
 *
 * When [indexFile] exists it is announced on stdout and deserialized;
 * otherwise the index is rebuilt from [rootDir] via [rebuildVecIndex].
 */
fun buildOrLoadVecIndex(
  indexFile: File = File(DEFAULT_KNNINDEX_FILENAME),
  rootDir: URI = DATA_DIR
): VecIndex {
  if (indexFile.exists()) {
    println("Loading index from ${indexFile.absolutePath}")
    return indexFile.deserializeFrom()
  }
  return rebuildVecIndex(indexFile, rootDir)
}
============================== | |
/**
 * Looks up the [nearestNeighbors] closest entries to [vq] through the
 * view returned by `asExactIndex()` rather than through this index directly.
 */
fun VecIndex.exactKNNSearch(vq: DoubleArray, nearestNeighbors: Int) =
  asExactIndex().let { exactView -> exactView.findNearest(vq, nearestNeighbors) }
============================== | |
/**
 * Entry point: builds or loads the keyword index stored at
 * [DEFAULT_KWINDEX_FILENAME], using the local `data` directory as the root.
 */
fun main() {
  val kwIndexFile = File(DEFAULT_KWINDEX_FILENAME)
  val dataRoot = File("data").toURI()
  buildOrLoadKWIndex(indexFile = kwIndexFile, rootDir = dataRoot)
}
============================== | |
/** Exposes [embedding] itself as this item's vector; no copy is made. */
override fun vector(): DoubleArray {
  return embedding
}
============================== | |
/**
 * Collects every [Concordance] stored under a key that contains [query],
 * flattening the per-key collections into a single list.
 */
fun KWIndex.search(query: String): List<Concordance> {
  val matchesPerKey = getValuesForKeysContaining(query)
  return matchesPerKey.flatten()
}
============================== | |
tailrec fun VecIndex.edges( | |
seed: String? = null, | |
queries: List<String> = if (seed == null) emptyList() else listOf(seed), | |
depth: Int = 10, | |
width: Int = 5, | |
edges: List<Pair<String, String>> = emptyList(), | |
): List<Pair<String, String>> = | |
if (queries.isEmpty() || depth == 0) edges | |
else { | |
val query = seed ?: queries.first() | |
val nearestResults = findNearest(vectorize(query), 100) | |
.map { it.item().loc.getContext(0) } | |
.filter { it.isNotEmpty() && it != query } | |
.take(width) | |
============================== | |
/**
 * Registers [location] under every non-blank token of [line].
 *
 * [line] is split on [DELIMITER]. A token that is not yet in the index is
 * inserted with a new queue containing [location]; a token that is already
 * present has [location] appended to its existing queue.
 *
 * Every inserted token gets its own queue instance. Reusing one queue for all
 * tokens first seen on the same line would alias them, so a later append made
 * for one token would also appear under the others.
 */
fun KWIndex.indexLine(line: String, location: Concordance) {
  line.split(DELIMITER)
    .filter { it.isNotBlank() }
    .forEach { token ->
      // putIfAbsent returns the existing queue (if any); only then append.
      putIfAbsent(token, ConcurrentLinkedQueue(listOf(location)))?.offer(location)
    }
}
============================== | |
/** Identifies this item by its [loc]. */
override fun id(): Concordance {
  return loc
}
============================== | |
/**
 * Returns the keyword index backed by [indexFile].
 *
 * When [indexFile] exists it is deserialized; otherwise the index is rebuilt
 * from [rootDir] via [rebuildKWIndex] and then serialized to [indexFile].
 *
 * The default file is [DEFAULT_KWINDEX_FILENAME]. Defaulting to the vector
 * index's file name would make the two index kinds read and overwrite each
 * other's data.
 */
fun buildOrLoadKWIndex(
  indexFile: File = File(DEFAULT_KWINDEX_FILENAME),
  rootDir: URI = TEST_DIR
): KWIndex =
  if (indexFile.exists()) indexFile.deserializeFrom()
  else rebuildKWIndex(rootDir).apply { serializeTo(indexFile) }
============================== | |
/**
 * Entry point: builds or loads the vector index with all default arguments.
 * The returned index is discarded.
 */
fun main() { buildOrLoadVecIndex() }
============================== | |
/**
 * Projects the receiver's vectors down to [outputDims] dimensions with [tSne].
 *
 * The configuration is built from the receiver, [outputDims], `size - 1`,
 * [perplexity] and the constant `99999`.
 * NOTE(review): the third and fifth arguments are presumably the initial
 * dimension count and the iteration cap; confirm against
 * `TSneUtils.buildConfig`. `size` here is the number of vectors, not the
 * length of each vector.
 */
fun Array<DoubleArray>.reduceDim(
  outputDims: Int = 2,
  perplexity: Double = 10.0,
  tSne: TSne = ParallelBHTsne()
): Array<out DoubleArray> {
  val config = TSneUtils.buildConfig(this, outputDims, size - 1, perplexity, 99999)
  return tSne.tsne(config)
}
============================== | |
/**
 * Entry point: reduces the sample vectors with [reduceDim], prints one
 * `labelLength,x,y` line per label, and writes the plot produced by
 * [plotTsneEmbeddingsWithLabels] to a temporary HTML file that is then shown.
 */
fun main() {
  val (labels, vectors) = fetchOrLoadSampleData()
  val projected = vectors.reduceDim()

  labels.forEachIndexed { row, label ->
    val point = projected[row]
    println("${label.length},${point[0]},${point[1]}")
  }

  // Points are labelled by the length of their label, not by its text.
  val lengthLabels = labels.map { it.length.toString() }
  val plot = plotTsneEmbeddingsWithLabels(projected, lengthLabels)

  val page = File.createTempFile("clusters", ".html")
  page.writeText("<html>$plot</html>")
  page.show()
}
============================== | |
/**
 * Looks up [hashCode] in `hashtable`, falling back to the placeholder
 * `URI("MISSING")` when the lookup yields `null`.
 */
operator fun get(hashCode: Int) =
  when (val hit = hashtable[hashCode]) {
    null -> URI("MISSING")
    else -> hit
  }
============================== | |
/**
 * Finds the matches for all of [keywords] that share a file, grouped by URI.
 *
 * Each keyword is searched individually. Results are then folded pairwise:
 * only matches whose URI occurs in both the accumulated matches and the next
 * keyword's matches are kept, so the final map contains only URIs that have
 * at least one match for every keyword.
 *
 * Returns an empty map when no keywords are given.
 */
fun KWIndex.search(vararg keywords: String) =
  keywords.map { search(it) }
    .reduceOrNull { acc, results ->
      // Intersect the running result with the new one (not a list with itself).
      val common = acc.map { it.uri }.intersect(results.map { it.uri })
      (acc + results).filter { it.uri in common }
    }
    .orEmpty()
    .groupBy { it.uri }
============================== | |
/**
 * Builds a new keyword index from every file found recursively under [rootDir].
 *
 * Files are processed through a parallel stream; each line reported by
 * [indexURI] is added with [indexLine]. Progress and the total build time
 * are printed to stdout.
 */
fun rebuildKWIndex(rootDir: URI): KWIndex {
  val timed = measureTimedValue {
    println("Rebuilding keyword index...")
    val index = KWIndex(DefaultCharArrayNodeFactory())
    val sources = rootDir.allFilesRecursively().toList()
    sources.parallelStream().forEach { src ->
      indexURI(src) { line, location -> index.indexLine(line, location) }
      println("Finished indexing $src")
    }
    index
  }
  println("Built keyword index in ${timed.duration}")
  return timed.value
}
============================== | |
/**
 * Builds a new vector index from every file found recursively under [origin]
 * and serializes it to [indexFile].
 *
 * Files are processed through a parallel stream; each line reported by
 * [indexURI] is vectorized and added as a [CodeEmbedding]. Indexing is
 * best-effort: a line whose embedding or insertion throws is skipped, but the
 * failure is reported on stderr instead of being dropped silently.
 */
fun rebuildVecIndex(indexFile: File, origin: URI): VecIndex =
  HnswIndex.newBuilder(BERT_EMBEDDING_SIZE, EMD, 1000000)
    .withM(100).withEf(500).withEfConstruction(500)
    .build<Concordance, CodeEmbedding>().also { idx ->
      println("Rebuilding vector index...")
      measureTimedValue {
        // TODO: Does parallelization really help on single-GPU machine?
        origin.allFilesRecursively().toList().parallelStream().forEach { src ->
          // var last = ""
          indexURI(src) { line, loc ->
            try {
              // if (loc.uri.suffix() != last)
              //   last = loc.uri.suffix().also { println(loc.fileSummary()) }
              idx.add(CodeEmbedding(loc, vectorize(line)))
            } catch (exception: Exception) {
              // Keep going, but leave a trace of what was skipped and why.
              System.err.println("Skipped $loc while rebuilding vector index: $exception")
            }
          }
        }
      }.let { println("Rebuilt vector index in ${it.duration.inWholeMinutes} minutes") }
    }.also { it.serializeTo(indexFile) }
============================== | |
/**
 * Returns up to [i] neighbours of [v].
 *
 * `i + 10` candidates are fetched, through [exactKNNSearch] when [exact] is
 * true and through `findNearest` otherwise. Candidates whose embedding has the
 * same contents as [v] are dropped, the rest are de-duplicated by their item's
 * string form, and the first [i] are kept.
 *
 * The `if` is parenthesised so that this post-processing applies to both
 * branches; without the parentheses it binds only to the `else` branch.
 */
fun VecIndex.knn(v: DoubleArray, i: Int, exact: Boolean = false) =
  (if (exact) exactKNNSearch(v, i + 10) else findNearest(v, i + 10))
    .filter { !it.item().embedding.contentEquals(v) }
    .distinctBy { it.item().toString() }
    .take(i)
============================== | |
/**
 * Vectorizes [query] with [vectorize] and delegates to the vector-based
 * `knn` overload, asking for [k] neighbours.
 */
fun VecIndex.knn(query: String, k: Int = 10) =
  knn(v = vectorize(query), i = k)
============================== |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment