Skip to content

Instantly share code, notes, and snippets.

@sugiana
Last active June 17, 2026 06:05
Show Gist options
  • Select an option

  • Save sugiana/a5edc7c2d2e991973bc19da5fcf08c4d to your computer and use it in GitHub Desktop.

Select an option

Save sugiana/a5edc7c2d2e991973bc19da5fcf08c4d to your computer and use it in GitHub Desktop.
Simpan riwayat percakapan dalam JSON ke Chroma
# pip install langchain-text-splitters langchain-chroma langchain-ollama ollama flashrank
"""
Contoh riwayat percakapan dalam file JSON:
[
{
"id": 1,
"timestamp": "2022-07-12 05:55",
"sender": "Owo Sugiana",
"message": "Install ulang WA restore backup kemarin 2,3 MB. Padahal 2 jam sebelumnya tertulis 1 GB 17 Juni. Well ..."
},
{
"id": 2,
"timestamp": "2022-07-12 07:15",
"sender": "Tomy Suryawan",
"message": "Keburu ke backup barangkali om owo"
}
]
"""
import sys
import os
import json
from argparse import ArgumentParser
from time import time
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from flashrank import (
Ranker,
RerankRequest,
)
import ollama
LLM_OPTIONS = dict(temperature=0)
FLASHRANK_DIR = "flashrank"
embedding_model = "nomic-embed-text-v2-moe"
help_embedding = f"default {embedding_model}"
llm_model = "gemma4:e4b"
help_llm = f"default {llm_model}"
prompt = "Dokumen ini tentang apa ?"
help_prompt = f"default \"{prompt}\""
# Untuk Embedding
initial_doc_count = 25
help_initial_doc_count = (
f"default {initial_doc_count}, jumlah dokumen untuk Embedding")
# Untuk LLM usai Rerank mengurutkannya
best_doc_count = 10
help_best_doc_count = (
f"default {best_doc_count}, jumlah dokumen yang paling terkait")
chunk_size = 5000
help_chunk = f"default {chunk_size}, saat sebuah pesan terlalu panjang"
batch_size = 100
help_batch = f"default {batch_size}, jumlah dokumen yang dikirim ke Ollama"
start_id = 1
help_start = f"default {start_id}"
help_source = "JSON file atau Chroma directory"
pars = ArgumentParser()
pars.add_argument("--source", required=True, help=help_source)
pars.add_argument(
"--batch-size", type=int, default=batch_size, help=help_batch)
pars.add_argument(
"--start-id", type=int, default=start_id, help=help_start)
pars.add_argument("--prompt", default=prompt, help=help_prompt)
pars.add_argument(
"--initial-doc-count", type=int, default=initial_doc_count,
help=help_initial_doc_count)
pars.add_argument(
"--best-doc-count", type=int, default=best_doc_count,
help=help_best_doc_count)
pars.add_argument("--show-system-prompt", action="store_true")
pars.add_argument(
"--embedding-model", default=embedding_model, help=help_embedding)
pars.add_argument(
"--chunk-size", type=int, default=chunk_size, help=help_chunk)
pars.add_argument(
"--llm-model", default=llm_model, help=help_llm)
option = pars.parse_args(sys.argv[1:])
if not os.path.exists(option.source):
print(f"File {option.source} tidak ada.")
sys.exit(1)
# Dapatkan kapasitas context LLM
models = ollama.list()
models = models["models"]
for m in models:
name = m["model"]
if name != option.llm_model:
continue
info = ollama.show(name)
for key, val in info.modelinfo.items():
if key.find("context_length") > -1:
LLM_OPTIONS["num_ctx"] = val
break
if "num_ctx" in LLM_OPTIONS:
break
# Apakah ada message ID yang gagal dibaca oleh embedding ?
name, _ = os.path.splitext(option.source)
bad_id_file = f"{name}.bad"
if os.path.exists(bad_id_file):
with open(bad_id_file) as f:
s = f.read()
black_list = [int(x) for x in s.split()]
else:
black_list = []
print(f"Mempersiapkan embbeding model {option.embedding_model} ...")
embed_model = OllamaEmbeddings(model=option.embedding_model)
if os.path.isdir(option.source):
chroma_directory = option.source
print(f"{chroma_directory} dianggap Chroma directory.")
else: # JSON file
print(f"{option.source} dianggap file JSON percakapan.")
short_name = os.path.split(option.source)[-1]
short_name = os.path.splitext(short_name)[0]
chroma_directory = f"rag_{short_name}"
if os.path.exists(chroma_directory):
print(
f"Direktori {chroma_directory} sudah ada, "
"akan digunakan sebagai pengetahuan.")
else:
print(
"Akan disimpan dalam format vektor di "
f"direktori {chroma_directory} ...")
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=option.chunk_size, chunk_overlap=100)
with open(option.source) as f:
d_lines = json.load(f)
print(f"Simpan di direktori {chroma_directory} ...")
vector_store = None
docs = []
for item in d_lines:
message_id = item["id"]
if message_id < option.start_id:
continue
if message_id in black_list:
continue
sender = item["sender"]
message = item["message"]
timestamp = item["timestamp"]
splits = text_splitter.create_documents([message])
if splits[1:]:
for idx, doc in enumerate(splits):
new_message_id = f"{message_id}_{idx}"
content = f"{sender}: {doc.page_content}"
size = len(content)
print("ID", new_message_id, timestamp, size, "byte")
print([content])
metadata = dict(
message_id=new_message_id, timestamp=timestamp,
sender=sender, original_message=message)
new_doc = Document(page_content=content, metadata=metadata)
docs.append(new_doc)
else:
metadata = dict(
message_id=message_id, timestamp=timestamp, sender=sender,
original_message=message)
content = f"{sender}: {message}"
size = len(content)
print("ID", message_id, timestamp, size, "byte")
print([content])
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
doc_count = len(docs)
if doc_count >= option.batch_size:
print(f"\nKirim {doc_count} ke Ollama ...\n")
if vector_store:
vector_store.add_documents(docs)
else:
vector_store = Chroma.from_documents(
documents=docs, embedding=embed_model,
persist_directory=chroma_directory)
docs = []
if docs:
if vector_store:
vector_store.add_documents(docs)
else:
vector_store = Chroma.from_documents(
documents=docs, embedding=embed_model,
persist_directory=chroma_directory)
print("Selesai.")
print(f"\nPertanyaan:\n\n{option.prompt}")
print(f"\nSiapkan embbeding model {option.embedding_model} ...")
vectorstore = Chroma(
persist_directory=chroma_directory,
embedding_function=embed_model)
print(f"Ambil {option.initial_doc_count} dokumen terkait ...")
base_retriever = vectorstore.as_retriever(
search_kwargs=dict(k=option.initial_doc_count))
raw_docs = base_retriever.invoke(option.prompt)
# Inisialisasi ranker (otomatis mengunduh model jika belum ada)
print("Siapkan Ranker ...")
ranker = Ranker(
model_name="ms-marco-MultiBERT-L-12", cache_dir=FLASHRANK_DIR)
# Format dokumen LangChain ke format yang dikenali FlashRank
print(f"Urutkan yang terbaik ...")
passages = [
{"id": idx, "text": d.page_content, "meta": d.metadata}
for idx, d in enumerate(raw_docs)
]
rerank_request = RerankRequest(query=option.prompt, passages=passages)
rerank_results = ranker.rerank(rerank_request)
print(f"Ambil {option.best_doc_count} dokumen terbaik ...")
top_results = rerank_results[:option.best_doc_count]
# Gabungkan teks dokumen yang sudah disaring untuk konteks LLM
context = "\n\n".join([r["text"] for r in top_results])
system_prompt = (
"Jawaban Anda harus semata-mata berasal dari riwayat percakapan ini:"
f"\n\n{context}\n\n"
"Jika informasi tidak tersedia maka jawaban harus berisi "
"kalimat persis ini: "
"\"Jawaban yang Anda cari tidak ditemukan dalam riwayat percakapan.\"")
messages = [
dict(role="system", content=system_prompt),
dict(role="user", content=option.prompt)]
print(f"Sampaikan konteks dan pertanyaan ke LLM model {option.llm_model} ...")
if option.show_system_prompt:
print("\n------ Begin System Prompt -----\n")
print(f"\n{system_prompt}\n")
print("\n------ End System Prompt -----\n")
print("\n------ Begin User Prompt -----\n")
print(f"\n{option.prompt}\n")
print("\n------ End User Prompt -----\n")
begin_time = time()
response = ollama.chat(
model=option.llm_model, messages=messages, options=LLM_OPTIONS)
duration = time() - begin_time
answer = response['message']['content']
print("\nJawaban:\n")
print(answer)
token_prompt = response["prompt_eval_count"]
token_output = response["eval_count"]
total_token = token_prompt + token_output
print(f"\n{total_token} token")
print(f"{int(duration)} detik")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment