Skip to content

Instantly share code, notes, and snippets.

@terrymccann
Last active March 30, 2025 15:00
Show Gist options
  • Save terrymccann/7a22f314ded823147bd69cfbcae9c025 to your computer and use it in GitHub Desktop.
Using LangChain, take a Markdown file, chunk it, create embeddings, and save them to a Pinecone vector store.
OPENAI_API_KEY="your openai api key"
PINECONE_API_KEY="your pinecone api key"
PINECONE_INDEX="name of your pinecone index"
import fs from "fs";
import path from "path";
import dotenv from "dotenv";
dotenv.config();
import { MarkdownTextSplitter, RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { OpenAIEmbeddings } from "@langchain/openai";
import { Document } from "langchain/document";
import { PineconeStore } from "@langchain/pinecone";
import { Pinecone } from "@pinecone-database/pinecone";
/**
 * Reads a Markdown file, splits it into chunks, embeds each chunk with
 * OpenAI, and upserts the resulting vectors into a Pinecone index.
 *
 * Required environment variables (see the accompanying .env sample):
 * OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_INDEX.
 *
 * @returns {Promise<void>}
 * @throws {Error} if a required environment variable is missing or the
 *   Markdown file at data/test.md cannot be read.
 */
async function main() {
  // Fail fast with a clear message instead of an opaque SDK error later.
  for (const name of ["OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_INDEX"]) {
    if (!process.env[name]) {
      throw new Error(`Missing required environment variable: ${name}`);
    }
  }

  // 1. Read the Markdown file.
  const markdownPath = path.join(process.cwd(), "data", "test.md");
  const markdownContent = fs.readFileSync(markdownPath, "utf-8");

  // 2. Split along Markdown structure (headers, paragraphs) first, so
  //    chunk boundaries respect the document's own organization.
  const mdSplitter = new MarkdownTextSplitter({
    chunkOverlap: 0,
    chunkSize: 1000, // target chunk size (can be adjusted)
  });
  const initialChunks = await mdSplitter.splitText(markdownContent);

  // 3. Any chunk still longer than 500 chars gets re-split recursively,
  //    with a 50-char overlap so context carries across chunk borders.
  const recursiveSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 500,
    chunkOverlap: 50,
    separators: ["\n\n", "\n", " "],
  });
  const finalChunks = [];
  for (const chunk of initialChunks) {
    if (chunk.length > 500) {
      finalChunks.push(...(await recursiveSplitter.splitText(chunk)));
    } else {
      finalChunks.push(chunk);
    }
  }

  // Wrap each final chunk as a LangChain Document object.
  const documents = finalChunks.map((text) => new Document({ pageContent: text }));
  console.log(`Number of final document chunks: ${documents.length}`);

  // 4. Embedding model applied to every chunk.
  const embeddingsModel = new OpenAIEmbeddings({ modelName: "text-embedding-3-small" });

  // 5. Initialize the Pinecone client and resolve the target index.
  const pinecone = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY,
  });
  const pineconeIndex = pinecone.Index(process.env.PINECONE_INDEX);

  // 6. Embed and upsert all documents into the Pinecone index.
  //    (The returned store was previously assigned to an unused variable.)
  await PineconeStore.fromDocuments(documents, embeddingsModel, {
    pineconeIndex,
    namespace: "aws", // optional; omit or rename as desired
  });

  // BUG FIX: the original interpolated the Index *object* here, which
  // stringifies as "[object Object]". Log the index name instead.
  console.log(`Embeddings upserted successfully into index "${process.env.PINECONE_INDEX}"`);
}
main().catch((err) => {
  // Surface the failure and make the process exit non-zero so callers
  // (CI, shell scripts) can detect it; the original exited 0 on error.
  console.error(err);
  process.exitCode = 1;
});
{
"type": "module",
"dependencies": {
"@langchain/openai": "^0.4.4",
"@langchain/pinecone": "^0.1.3",
"@pinecone-database/pinecone": "^4.1.0",
"dotenv": "^16.4.7",
"langchain": "^0.3.15",
"openai": "^4.85.1"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment