-
-
Save terrymccann/7a22f314ded823147bd69cfbcae9c025 to your computer and use it in GitHub Desktop.
Using LangChain, take a markdown file, chunk it, create embeddings, save it to a pinecone vector store.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
OPENAI_API_KEY="your openai api key"
PINECONE_API_KEY="your pinecone api key"
PINECONE_INDEX="name of your pinecone index"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fs from "fs";
import path from "path";
import dotenv from "dotenv";
dotenv.config();
import { MarkdownTextSplitter, RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { OpenAIEmbeddings } from "@langchain/openai";
import { Document } from "langchain/document";
import { PineconeStore } from "@langchain/pinecone";
import { Pinecone } from "@pinecone-database/pinecone";

/**
 * Read a Markdown file, chunk it (structure-aware first, then size-bounded),
 * embed the chunks with OpenAI, and upsert them into a Pinecone index.
 *
 * Requires OPENAI_API_KEY, PINECONE_API_KEY, and PINECONE_INDEX in the
 * environment (loaded from .env via dotenv above).
 *
 * @throws {Error} if a required environment variable is missing or the
 *   Markdown file cannot be read.
 */
async function main() {
  // Fail fast with a clear message instead of an opaque SDK error later.
  for (const key of ["OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_INDEX"]) {
    if (!process.env[key]) {
      throw new Error(`Missing required environment variable: ${key}`);
    }
  }

  // 1. Read the Markdown file.
  const markdownPath = path.join(process.cwd(), "data", "test.md");
  const markdownContent = fs.readFileSync(markdownPath, "utf-8");

  // 2. First pass: split along Markdown structure (headers, paragraphs)
  //    so chunks follow the document's logical sections.
  const mdSplitter = new MarkdownTextSplitter({
    chunkOverlap: 0,
    chunkSize: 1000, // target chunk size (can be adjusted)
  });
  const initialChunks = await mdSplitter.splitText(markdownContent);

  // 3. Second pass: any chunk still longer than 500 characters is split
  //    again character-wise, with a 50-char overlap so context is not
  //    lost at chunk boundaries.
  const recursiveSplitter = new RecursiveCharacterTextSplitter({
    chunkSize: 500,
    chunkOverlap: 50,
    separators: ["\n\n", "\n", " "],
  });
  const finalChunks = [];
  for (const chunk of initialChunks) {
    if (chunk.length > 500) {
      finalChunks.push(...(await recursiveSplitter.splitText(chunk)));
    } else {
      finalChunks.push(chunk);
    }
  }

  // Wrap each final chunk as a LangChain Document for the vector store.
  const documents = finalChunks.map((text) => new Document({ pageContent: text }));
  console.log(`Number of final document chunks: ${documents.length}`);

  // 4. Embedding model (must match the model used at query time).
  const embeddingsModel = new OpenAIEmbeddings({ modelName: "text-embedding-3-small" });

  // 5. Initialize the Pinecone client and target index.
  const pinecone = new Pinecone({
    apiKey: process.env.PINECONE_API_KEY,
  });
  const indexName = process.env.PINECONE_INDEX;
  const pineconeIndex = pinecone.Index(indexName);

  // 6. Embed the documents and upsert them into the Pinecone index.
  await PineconeStore.fromDocuments(documents, embeddingsModel, {
    pineconeIndex,
    namespace: "aws", // optional — omit or rename as desired
  });

  // BUG FIX: the original interpolated the Index *object* here, which
  // prints "[object Object]"; log the index name instead.
  console.log(`Embeddings upserted successfully into index "${indexName}"`);
}

main().catch(console.error);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "type": "module",
  "dependencies": {
    "@langchain/openai": "^0.4.4",
    "@langchain/pinecone": "^0.1.3",
    "@pinecone-database/pinecone": "^4.1.0",
    "dotenv": "^16.4.7",
    "langchain": "^0.3.15",
    "openai": "^4.85.1"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment