Last active
June 17, 2026 06:05
-
-
Save sugiana/a5edc7c2d2e991973bc19da5fcf08c4d to your computer and use it in GitHub Desktop.
Simpan riwayat percakapan dalam JSON ke Chroma
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # pip install langchain-text-splitters langchain-chroma langchain-ollama ollama flashrank | |
| """ | |
| Contoh riwayat percakapan dalam file JSON: | |
| [ | |
| { | |
| "id": 1, | |
| "timestamp": "2022-07-12 05:55", | |
| "sender": "Owo Sugiana", | |
| "message": "Install ulang WA restore backup kemarin 2,3 MB. Padahal 2 jam sebelumnya tertulis 1 GB 17 Juni. Well ..." | |
| }, | |
| { | |
| "id": 2, | |
| "timestamp": "2022-07-12 07:15", | |
| "sender": "Tomy Suryawan", | |
| "message": "Keburu ke backup barangkali om owo" | |
| } | |
| ] | |
| """ | |
| import sys | |
| import os | |
| import json | |
| from argparse import ArgumentParser | |
| from time import time | |
| from langchain_core.documents import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_chroma import Chroma | |
| from langchain_ollama import OllamaEmbeddings | |
| from flashrank import ( | |
| Ranker, | |
| RerankRequest, | |
| ) | |
| import ollama | |
# Options passed to the LLM chat call.
LLM_OPTIONS = {"temperature": 0}
# Cache directory handed to the FlashRank ranker.
FLASHRANK_DIR = "flashrank"

# Default values of the command-line options.
embedding_model = "nomic-embed-text-v2-moe"
llm_model = "gemma4:e4b"
prompt = "Dokumen ini tentang apa ?"
initial_doc_count = 25  # documents fetched by the embedding search
best_doc_count = 10  # documents kept for the LLM once reranked
chunk_size = 5000
batch_size = 100
start_id = 1

# Help texts shown by --help; each one advertises its default.
help_embedding = "default {}".format(embedding_model)
help_llm = "default {}".format(llm_model)
help_prompt = 'default "{}"'.format(prompt)
help_initial_doc_count = (
    "default {}, jumlah dokumen untuk Embedding".format(initial_doc_count))
help_best_doc_count = (
    "default {}, jumlah dokumen yang paling terkait".format(best_doc_count))
help_chunk = "default {}, saat sebuah pesan terlalu panjang".format(
    chunk_size)
help_batch = "default {}, jumlah dokumen yang dikirim ke Ollama".format(
    batch_size)
help_start = "default {}".format(start_id)
help_source = "JSON file atau Chroma directory"

# (flag, add_argument keyword arguments), in the order --help lists them.
_ARGUMENTS = (
    ("--source", dict(required=True, help=help_source)),
    ("--batch-size", dict(type=int, default=batch_size, help=help_batch)),
    ("--start-id", dict(type=int, default=start_id, help=help_start)),
    ("--prompt", dict(default=prompt, help=help_prompt)),
    ("--initial-doc-count", dict(
        type=int, default=initial_doc_count, help=help_initial_doc_count)),
    ("--best-doc-count", dict(
        type=int, default=best_doc_count, help=help_best_doc_count)),
    ("--show-system-prompt", dict(action="store_true")),
    ("--embedding-model", dict(
        default=embedding_model, help=help_embedding)),
    ("--chunk-size", dict(type=int, default=chunk_size, help=help_chunk)),
    ("--llm-model", dict(default=llm_model, help=help_llm)),
)

pars = ArgumentParser()
for _flag, _kwargs in _ARGUMENTS:
    pars.add_argument(_flag, **_kwargs)
# With no explicit argument list, argparse reads sys.argv[1:].
option = pars.parse_args()

# Stop early when the given source path does not exist.
if not os.path.exists(option.source):
    print(f"File {option.source} tidak ada.")
    raise SystemExit(1)
# Look up the context capacity of the selected LLM: find the installed model
# whose name equals --llm-model and copy the first "context_length" entry of
# its model info into LLM_OPTIONS["num_ctx"].
models = ollama.list()["models"]
for m in models:
    name = m["model"]
    if name == option.llm_model:
        info = ollama.show(name)
        context_lengths = [
            val for key, val in info.modelinfo.items()
            if "context_length" in key]
        if context_lengths:
            LLM_OPTIONS["num_ctx"] = context_lengths[0]
            break
# Message IDs that the embedding step failed to read are listed in a
# "<source without extension>.bad" file as whitespace-separated integers.
# They are kept in a set because the only use is a membership test that runs
# once per message during ingestion.
name, _ = os.path.splitext(option.source)
bad_id_file = f"{name}.bad"
black_list = set()
if os.path.exists(bad_id_file):
    with open(bad_id_file) as f:
        black_list = {int(x) for x in f.read().split()}
def _message_documents(item, splitter):
    """Build the Document(s) for one conversation record.

    A message that the splitter breaks into several chunks yields one
    Document per chunk with the ID "<id>_<chunk index>"; any other message
    yields a single Document carrying the original ID and the unsplit text.
    Every Document is prefixed with the sender's name and keeps the full
    original message in its metadata. Each Document is echoed to stdout.

    Args:
        item: dict with the keys "id", "timestamp", "sender" and "message".
        splitter: text splitter providing ``create_documents``.

    Returns:
        The list of Documents created for this record.
    """
    message_id = item["id"]
    sender = item["sender"]
    message = item["message"]
    timestamp = item["timestamp"]
    splits = splitter.create_documents([message])
    if len(splits) > 1:
        chunks = [
            (f"{message_id}_{idx}", part.page_content)
            for idx, part in enumerate(splits)]
    else:
        chunks = [(message_id, message)]
    documents = []
    for chunk_id, text in chunks:
        content = f"{sender}: {text}"
        print("ID", chunk_id, timestamp, len(content), "byte")
        print([content])
        metadata = dict(
            message_id=chunk_id, timestamp=timestamp, sender=sender,
            original_message=message)
        documents.append(Document(page_content=content, metadata=metadata))
    return documents


def _store_documents(store, batch, embedding, directory):
    """Embed *batch* and persist it, creating the Chroma store on first use.

    Args:
        store: the existing Chroma store, or None when none was created yet.
        batch: non-empty list of Documents to embed.
        embedding: embedding model used when the store has to be created.
        directory: Chroma persist directory used when the store is created.

    Returns:
        The store that now contains *batch*.
    """
    print(f"\nKirim {len(batch)} ke Ollama ...\n")
    if store is None:
        return Chroma.from_documents(
            documents=batch, embedding=embedding,
            persist_directory=directory)
    store.add_documents(batch)
    return store


print(f"Mempersiapkan embbeding model {option.embedding_model} ...")
embed_model = OllamaEmbeddings(model=option.embedding_model)
if os.path.isdir(option.source):
    chroma_directory = option.source
    print(f"{chroma_directory} dianggap Chroma directory.")
else:  # JSON file
    print(f"{option.source} dianggap file JSON percakapan.")
    # The store lives in "rag_<JSON file name without extension>".
    short_name = os.path.splitext(os.path.basename(option.source))[0]
    chroma_directory = f"rag_{short_name}"
    if os.path.exists(chroma_directory):
        print(
            f"Direktori {chroma_directory} sudah ada, "
            "akan digunakan sebagai pengetahuan.")
    else:
        print(
            "Akan disimpan dalam format vektor di "
            f"direktori {chroma_directory} ...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=option.chunk_size, chunk_overlap=100)
        # JSON text is UTF-8; do not depend on the platform default encoding,
        # which breaks non-ASCII chat messages on some systems.
        with open(option.source, encoding="utf-8") as f:
            d_lines = json.load(f)
        print(f"Simpan di direktori {chroma_directory} ...")
        vector_store = None
        docs = []
        for item in d_lines:
            message_id = item["id"]
            # Skip messages before --start-id and those listed as unreadable.
            if message_id < option.start_id or message_id in black_list:
                continue
            docs.extend(_message_documents(item, text_splitter))
            # Send the documents to the embedding model in batches.
            if len(docs) >= option.batch_size:
                vector_store = _store_documents(
                    vector_store, docs, embed_model, chroma_directory)
                docs = []
        if docs:  # remaining partial batch
            vector_store = _store_documents(
                vector_store, docs, embed_model, chroma_directory)
        print("Selesai.")
# Retrieve candidate documents for the question from the Chroma store.
print(f"\nPertanyaan:\n\n{option.prompt}")
print(f"\nSiapkan embbeding model {option.embedding_model} ...")
vectorstore = Chroma(
    persist_directory=chroma_directory,
    embedding_function=embed_model)
print(f"Ambil {option.initial_doc_count} dokumen terkait ...")
base_retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=option.initial_doc_count))
raw_docs = base_retriever.invoke(option.prompt)
# Initialise the ranker (the model is downloaded automatically if missing).
print("Siapkan Ranker ...")
ranker = Ranker(
    model_name="ms-marco-MultiBERT-L-12", cache_dir=FLASHRANK_DIR)
# Convert the LangChain documents to the passage format FlashRank expects.
print("Urutkan yang terbaik ...")
passages = [
    {"id": idx, "text": d.page_content, "meta": d.metadata}
    for idx, d in enumerate(raw_docs)
]
if passages:
    rerank_request = RerankRequest(query=option.prompt, passages=passages)
    rerank_results = ranker.rerank(rerank_request)
else:
    # Nothing was retrieved (e.g. an empty store), so there is nothing to
    # order; the reranker is not called with an empty batch.
    rerank_results = []
print(f"Ambil {option.best_doc_count} dokumen terbaik ...")
top_results = rerank_results[:option.best_doc_count]
# Join the text of the selected documents into the context for the LLM.
context = "\n\n".join(r["text"] for r in top_results)
# The retrieved conversation excerpts go into the system prompt; the user's
# question is sent unchanged as the user message.
system_prompt = (
    "Jawaban Anda harus semata-mata berasal dari riwayat percakapan ini:"
    f"\n\n{context}\n\n"
    "Jika informasi tidak tersedia maka jawaban harus berisi "
    "kalimat persis ini: "
    "\"Jawaban yang Anda cari tidak ditemukan dalam riwayat percakapan.\"")
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=option.prompt)]
print(f"Sampaikan konteks dan pertanyaan ke LLM model {option.llm_model} ...")
if option.show_system_prompt:
    print("\n------ Begin System Prompt -----\n")
    print(f"\n{system_prompt}\n")
    print("\n------ End System Prompt -----\n")
    print("\n------ Begin User Prompt -----\n")
    print(f"\n{option.prompt}\n")
    print("\n------ End User Prompt -----\n")
# Time the chat call (wall clock) so the duration can be reported.
begin_time = time()
response = ollama.chat(
    model=option.llm_model, messages=messages, options=LLM_OPTIONS)
duration = time() - begin_time
answer = response['message']['content']
print("\nJawaban:\n")
print(answer)
# The token counters are optional in the response: read them with .get() and
# treat a missing or None value as 0 so the summary cannot raise.
token_prompt = response.get("prompt_eval_count") or 0
token_output = response.get("eval_count") or 0
total_token = token_prompt + token_output
print(f"\n{total_token} token")
print(f"{int(duration)} detik")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment