Created
February 3, 2024 13:13
-
-
Save leifermendez/c8705028e3a2a39704a59a395c001cb5 to your computer and use it in GitHub Desktop.
customPDFLoader.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Document } from 'langchain/document'; | |
import { readFile } from 'fs/promises'; | |
import { BaseDocumentLoader } from 'langchain/document_loaders'; | |
/**
 * Document loader base class that reads its whole input into a `Buffer`
 * and delegates the actual conversion to a subclass-provided `parse`.
 *
 * The input is either a file path (read with `readFile`) or a `Blob`
 * (read through `arrayBuffer()`).
 */
export abstract class BufferLoader extends BaseDocumentLoader {
  constructor(public filePathOrBlob: string | Blob) {
    super();
  }

  /**
   * Turns the raw bytes into documents.
   *
   * @param raw - Complete contents of the file or blob.
   * @param metadata - Metadata assembled by `load()` for this input.
   * @returns The documents produced from `raw`.
   */
  protected abstract parse(
    raw: Buffer,
    metadata: Document['metadata'],
  ): Promise<Document[]>;

  /**
   * Reads the configured input and passes its bytes to `parse`.
   *
   * For a path, the metadata is `{ source: <path> }`; for a blob it is
   * `{ source: 'blob', blobType: <blob.type> }`.
   *
   * @returns Whatever `parse` resolves to.
   */
  public async load(): Promise<Document[]> {
    // Path input: read the file from disk and record the path as the source.
    if (typeof this.filePathOrBlob === 'string') {
      const fileBytes = await readFile(this.filePathOrBlob);
      const fileMetadata: Record<string, string> = {
        source: this.filePathOrBlob,
      };
      return this.parse(fileBytes, fileMetadata);
    }

    // Blob input: copy its ArrayBuffer into a Node Buffer.
    const blobContents = await this.filePathOrBlob.arrayBuffer();
    const blobBytes = Buffer.from(blobContents);
    const blobMetadata: Record<string, string> = {
      source: 'blob',
      blobType: this.filePathOrBlob.type,
    };
    return this.parse(blobBytes, blobMetadata);
  }
}
/**
 * Loader that turns a PDF buffer into a single `Document` holding the
 * text returned by the `pdf` parser.
 */
export class CustomPDFLoader extends BufferLoader {
  /**
   * Parses PDF bytes and wraps the result in a one-element document array.
   *
   * @param raw - The PDF contents.
   * @param metadata - Metadata to copy onto the resulting document.
   * @returns A single document whose `pageContent` is the parser's `text`
   *   and whose metadata additionally carries the parser's `numpages`
   *   value under `pdf_numpages`.
   */
  public async parse(
    raw: Buffer,
    metadata: Document['metadata'],
  ): Promise<Document[]> {
    // The parser is loaded on demand rather than at module import time.
    const { pdf: parsePdf } = await PDFLoaderImports();
    const result = await parsePdf(raw);

    const documentMetadata = {
      ...metadata,
      pdf_numpages: result.numpages,
    };
    const document = new Document({
      pageContent: result.text,
      metadata: documentMetadata,
    });

    return [document];
  }
}
/**
 * Dynamically imports the `pdf-parse` implementation module.
 *
 * @returns An object whose `pdf` property is the module's default export.
 * @throws Error with an installation hint when the import fails; the
 *   underlying error is written to `console.error` first.
 */
async function PDFLoaderImports() {
  try {
    // Import the inner module directly: the package's main entrypoint has
    // some debug code that we don't want to import.
    const pdfParseModule = await import('pdf-parse/lib/pdf-parse.js');
    return { pdf: pdfParseModule.default };
  } catch (importError) {
    console.error(importError);
    const message =
      'Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.';
    throw new Error(message);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment