Skip to content

Instantly share code, notes, and snippets.

@breandan
Created August 22, 2021 18:20
Show Gist options
  • Save breandan/384bbb6a12d922f686a925eb25941525 to your computer and use it in GitHub Desktop.
Save breandan/384bbb6a12d922f686a925eb25941525 to your computer and use it in GitHub Desktop.
Method-level chunks using Dyck-slicing.
/** Reports the dimensionality of this item, taken as the size of [embedding]. */
override fun dimensions(): Int {
    return embedding.size
}
==============================
/** Renders this item as whatever `loc.getContext(0)` returns. */
override fun toString(): String {
    return loc.getContext(0)
}
}
==============================
/**
 * Obtains a [VecIndex], preferring a previously saved copy over a rebuild.
 *
 * When [indexFile] exists, its absolute path is announced on stdout and the index is
 * deserialized from it. Otherwise the work is delegated to [rebuildVecIndex], which
 * receives both [indexFile] and [rootDir].
 *
 * @param indexFile file that is checked for an existing index.
 * @param rootDir root handed to [rebuildVecIndex] when no index file is found.
 * @return the loaded or rebuilt index.
 */
fun buildOrLoadVecIndex(
    indexFile: File = File(DEFAULT_KNNINDEX_FILENAME),
    rootDir: URI = DATA_DIR
): VecIndex {
    if (indexFile.exists()) {
        println("Loading index from ${indexFile.absolutePath}")
        return indexFile.deserializeFrom()
    }
    return rebuildVecIndex(indexFile, rootDir)
}
==============================
/**
 * Runs a nearest-neighbour query against the exact view of this index.
 *
 * @param vq query vector.
 * @param nearestNeighbors number of neighbours requested from `findNearest`.
 * @return whatever `findNearest` yields on the index obtained from `asExactIndex()`.
 */
fun VecIndex.exactKNNSearch(vq: DoubleArray, nearestNeighbors: Int) =
    asExactIndex().let { exactIndex -> exactIndex.findNearest(vq, nearestNeighbors) }
==============================
/**
 * Entry point: builds or loads the keyword index stored at [DEFAULT_KWINDEX_FILENAME],
 * using the local `data` directory as the root. The returned index is not used here.
 */
fun main() {
    val keywordIndexFile = File(DEFAULT_KWINDEX_FILENAME)
    val dataRoot = File("data").toURI()
    buildOrLoadKWIndex(indexFile = keywordIndexFile, rootDir = dataRoot)
}
==============================
/** Exposes [embedding] as this item's vector; the value is returned as-is, without copying. */
override fun vector(): DoubleArray {
    return embedding
}
==============================
/**
 * Looks up every indexed key that contains [query] and merges their values.
 *
 * @param query substring to look for among the indexed keys.
 * @return all [Concordance] entries from the matching keys, flattened into one list.
 */
fun KWIndex.search(query: String): List<Concordance> {
    val matchingBuckets = getValuesForKeysContaining(query)
    return matchingBuckets.flatten()
}
==============================
tailrec fun VecIndex.edges(
seed: String? = null,
queries: List<String> = if (seed == null) emptyList() else listOf(seed),
depth: Int = 10,
width: Int = 5,
edges: List<Pair<String, String>> = emptyList(),
): List<Pair<String, String>> =
if (queries.isEmpty() || depth == 0) edges
else {
val query = seed ?: queries.first()
val nearestResults = findNearest(vectorize(query), 100)
.map { it.item().loc.getContext(0) }
.filter { it.isNotEmpty() && it != query }
.take(width)
==============================
/**
 * Registers [location] under every non-blank token obtained by splitting [line] on [DELIMITER].
 *
 * A token that is not yet in the index gets its own fresh queue seeded with [location];
 * a token that is already present has [location] appended to its existing queue.
 *
 * Each new token must receive a distinct queue. Sharing one queue instance between several
 * tokens of the same line would alias their postings, so a later `offer` for one token
 * would also show up in the results of the others.
 *
 * @param line text whose tokens are indexed.
 * @param location occurrence recorded for each token of [line].
 */
fun KWIndex.indexLine(line: String, location: Concordance) {
    line.split(DELIMITER)
        .filter { it.isNotBlank() }
        .forEach { token ->
            // A non-null result is the queue already stored for this token (ours was not
            // inserted), so the location is appended to that existing queue instead.
            putIfAbsent(token, ConcurrentLinkedQueue(listOf(location)))?.offer(location)
        }
}
==============================
/** Identifies this item by its [loc]. */
override fun id(): Concordance {
    return loc
}
==============================
/**
 * Obtains a [KWIndex], preferring a previously saved copy over a rebuild.
 *
 * When [indexFile] exists the index is deserialized from it. Otherwise a new index is
 * built from [rootDir] via [rebuildKWIndex] and serialized to [indexFile] before being returned.
 *
 * NOTE(review): the default for [indexFile] is `DEFAULT_KNNINDEX_FILENAME`, while the caller
 * visible in this file passes `DEFAULT_KWINDEX_FILENAME`; this looks like a copy-paste slip,
 * so confirm which file the keyword index should default to.
 *
 * @param indexFile file that is checked for, and written with, the keyword index.
 * @param rootDir root passed to [rebuildKWIndex] when no index file is found.
 * @return the loaded or rebuilt index.
 */
fun buildOrLoadKWIndex(
    indexFile: File = File(DEFAULT_KNNINDEX_FILENAME),
    rootDir: URI = TEST_DIR
): KWIndex {
    if (indexFile.exists()) return indexFile.deserializeFrom()

    val freshIndex = rebuildKWIndex(rootDir)
    freshIndex.serializeTo(indexFile)
    return freshIndex
}
==============================
/**
 * Entry point: builds or loads the vector index using the default arguments of
 * [buildOrLoadVecIndex]. The returned index is not used here.
 */
fun main() {
buildOrLoadVecIndex()
}
==============================
/**
 * Projects the receiver's rows to [outputDims] dimensions with the supplied t-SNE implementation.
 *
 * The configuration is built from the receiver, [outputDims], `size - 1`, [perplexity] and the
 * constant 99999; presumably the last two positional values are the initial dimension count
 * and the iteration cap (confirm against `TSneUtils.buildConfig`).
 *
 * @param outputDims number of dimensions requested from t-SNE.
 * @param perplexity perplexity value forwarded to the configuration.
 * @param tSne implementation that runs the reduction.
 * @return the array produced by `tSne.tsne`.
 */
fun Array<DoubleArray>.reduceDim(
    outputDims: Int = 2,
    perplexity: Double = 10.0,
    tSne: TSne = ParallelBHTsne()
): Array<out DoubleArray> {
    val config = TSneUtils.buildConfig(this, outputDims, size - 1, perplexity, 99999)
    return tSne.tsne(config)
}
==============================
/**
 * Entry point: reduces the sample vectors with [reduceDim], prints one
 * `labelLength,x,y` line per label, and writes the plot to a temporary HTML file
 * on which `show()` is then called.
 */
fun main() {
    val (labels, vectors) = fetchOrLoadSampleData()
    val projected = vectors.reduceDim()

    // One CSV-style row per label: its length followed by the first two coordinates.
    for ((row, label) in labels.withIndex()) {
        println("${label.length},${projected[row][0]},${projected[row][1]}")
    }

    val lengthLabels = labels.map { it.length.toString() }
    val plot = plotTsneEmbeddingsWithLabels(projected, lengthLabels)

    val page = File.createTempFile("clusters", ".html")
    page.writeText("<html>$plot</html>")
    page.show()
}
==============================
/**
 * Index operator: returns the entry stored in `hashtable` under [hashCode],
 * or the placeholder `URI("MISSING")` when that lookup yields null.
 */
operator fun get(hashCode: Int) = hashtable[hashCode] ?: URI("MISSING")
==============================
/**
 * Searches for several keywords at once and keeps only hits from documents that match all of them.
 *
 * Each keyword is looked up with the single-keyword `search`. The per-keyword hit lists are
 * then folded left to right: at every step only hits whose `uri` occurs in BOTH the accumulated
 * hits and the next keyword's hits are retained. The surviving hits are grouped by `uri`.
 *
 * @param keywords keywords that must all occur in a document; may be empty.
 * @return the surviving hits grouped by `uri`; an empty map when no keywords are given.
 */
fun KWIndex.search(vararg keywords: String) =
    keywords.map { search(it) }
        // reduceOrNull (rather than reduce) so that an empty keyword list yields no hits
        // instead of throwing.
        .reduceOrNull { acc, results ->
            // Intersect the accumulated URIs with the new ones; intersecting `results` with
            // itself would let documents that match only the later keyword slip through.
            val common = acc.map { it.uri }.intersect(results.map { it.uri })
            (acc + results).filter { it.uri in common }
        }
        .orEmpty()
        .groupBy { it.uri }
==============================
/**
 * Builds a new keyword index from every file found recursively under [rootDir].
 *
 * Files are processed through a parallel stream; each line reported by `indexURI` is added
 * with [indexLine], and a message is printed after each file. The elapsed time of the whole
 * build is printed once it completes.
 *
 * @param rootDir root whose files are indexed.
 * @return the newly populated index.
 */
fun rebuildKWIndex(rootDir: URI): KWIndex {
    val timed = measureTimedValue {
        println("Rebuilding keyword index...")
        val index = KWIndex(DefaultCharArrayNodeFactory())
        val sources = rootDir.allFilesRecursively().toList()
        sources.parallelStream().forEach { src ->
            indexURI(src) { line, location -> index.indexLine(line, location) }
            println("Finished indexing $src")
        }
        index
    }
    println("Built keyword index in ${timed.duration}")
    return timed.value
}
==============================
/**
 * Builds a new vector index from every file found recursively under [origin] and saves it.
 *
 * An `HnswIndex` is configured with [BERT_EMBEDDING_SIZE], [EMD], a capacity argument of
 * 1000000 and the M/ef/efConstruction settings below. Every line reported by `indexURI` is
 * vectorized and added as a [CodeEmbedding]. Indexing is best-effort: a line that cannot be
 * embedded or added is reported on stderr and skipped, so one bad line does not abort the
 * rebuild. The elapsed time is printed in whole minutes, then the index is serialized to
 * [indexFile].
 *
 * @param indexFile destination the finished index is serialized to.
 * @param origin root whose files are indexed.
 * @return the newly populated index.
 */
fun rebuildVecIndex(indexFile: File, origin: URI): VecIndex =
    HnswIndex.newBuilder(BERT_EMBEDDING_SIZE, EMD, 1000000)
        .withM(100).withEf(500).withEfConstruction(500)
        .build<Concordance, CodeEmbedding>().also { idx ->
            println("Rebuilding vector index...")
            measureTimedValue {
                // TODO: Does parallelization really help on single-GPU machine?
                origin.allFilesRecursively().toList().parallelStream().forEach { src ->
                    // var last = ""
                    indexURI(src) { line, loc ->
                        try {
                            // if (loc.uri.suffix() != last)
                            //   last = loc.uri.suffix().also { println(loc.fileSummary()) }
                            idx.add(CodeEmbedding(loc, vectorize(line)))
                        } catch (exception: Exception) {
                            // Keep going, but leave a trace instead of dropping the failure silently.
                            System.err.println("Skipping $loc: failed to embed or index line ($exception)")
                        }
                    }
                }
            }.let { println("Rebuilt vector index in ${it.duration.inWholeMinutes} minutes") }
        }.also { it.serializeTo(indexFile) }
==============================
/**
 * Finds up to [i] distinct neighbours of [v], excluding items whose embedding equals [v].
 *
 * `i + 10` candidates are requested, presumably to leave headroom for the candidates that
 * the filtering below removes. Candidates are then de-duplicated by the `toString()` of
 * their item and truncated to [i].
 *
 * The `if`/`else` is parenthesized on purpose: without the parentheses the post-processing
 * chain attaches only to the `else` branch, and an exact search would return its raw,
 * unfiltered `i + 10` candidates.
 *
 * @param v query vector.
 * @param i maximum number of neighbours to return.
 * @param exact when true, candidates come from [exactKNNSearch] instead of `findNearest`.
 * @return at most [i] search results after filtering and de-duplication.
 */
fun VecIndex.knn(v: DoubleArray, i: Int, exact: Boolean = false) =
    (if (exact) exactKNNSearch(v, i + 10) else findNearest(v, i + 10))
        .filter { !it.item().embedding.contentEquals(v) }
        .distinctBy { it.item().toString() }
        .take(i)
==============================
/**
 * Convenience overload: vectorizes [query] and forwards to the vector-based `knn`.
 *
 * @param query text passed to `vectorize`.
 * @param k maximum number of neighbours to return.
 */
fun VecIndex.knn(query: String, k: Int = 10) =
    vectorize(query).let { queryVector -> knn(queryVector, k) }
==============================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment