Last active
November 15, 2024 09:58
-
-
Save dahse89/81cfe004d1fc2a2975d68b0b66bb9a68 to your computer and use it in GitHub Desktop.
This script is a Google Apps Script that renames PDF files in a designated folder based on their content. It uses Google Drive OCR and the OpenAI Completions API. To use it, you need an OpenAI API key and must add the Drive API advanced service to the project. It is also recommended to adapt the GPT prompt to your own use case.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// OpenAI API key, sent as the Bearer token on every completion request.
// Replace the '***' placeholder with a real key before running.
const openAiApiKey = '***';

// Endpoint that receives the title-generation prompt.
const openAiCompletionApiUrl = 'https://api.openai.com/v1/completions';

// Maximum number of OCR characters that are embedded in the prompt.
const additionalPromptCharsLimit = 1600;
/**
 * Entry point: walks every file in the source folder, asks
 * `createPdfFileName` for a new name, and stores a copy under that name in
 * the destination folder. The original files are left untouched.
 *
 * Both folder IDs are placeholders and must be filled in before running.
 */
function myFunction() {
  const idOfFolderContainingPdfs = '{ADD ID HERE}';
  const idOfDestinationFolder = '{ADD ID HERE}';

  const sourceFolder = DriveApp.getFolderById(idOfFolderContainingPdfs);
  const destinationFolder = DriveApp.getFolderById(idOfDestinationFolder);

  const files = sourceFolder.getFiles();
  while (files.hasNext()) {
    const file = files.next();
    Logger.log(`Found file ${file.getName()}`);

    const newFileName = createPdfFileName(file);

    // Create the copy directly inside the destination folder. The previous
    // makeCopy(name) + Folder.addFile(copy) pair relied on the deprecated
    // addFile API and first created the copy next to the original.
    file.makeCopy(newFileName, destinationFolder);

    // Optional: to remove the original once it has been copied, enable
    //   file.setTrashed(true);
    Logger.log(`${newFileName} created`);
  }
}
/**
 * Builds a new file name for a PDF from its OCR'd content.
 *
 * The document is OCR'd (language 'en'), whitespace runs are collapsed to a
 * single space, and at most `additionalPromptCharsLimit` characters are
 * passed to `gptCompleteTitleByText`.
 *
 * @param {Object} pdfDocument - File object; passed to `convertPDFToText`,
 *   and `getName()` is read when no title could be generated.
 * @returns {string} The generated title followed by ".pdf", or the
 *   document's current name when no title was produced.
 */
function createPdfFileName(pdfDocument) {
  const ocrText = convertPDFToText(pdfDocument, 'en');

  // `\s` already covers \n, \r and \t; slice() is a no-op for short texts.
  const promptText = ocrText.replace(/\s+/g, ' ').slice(0, additionalPromptCharsLimit);
  Logger.log(`OCR (first ${additionalPromptCharsLimit} chars): ${promptText}`);

  const title = gptCompleteTitleByText(promptText);
  if (!title) {
    // The title lookup returns undefined on failure; without this guard the
    // file would be named "undefined.pdf". Keep the original name instead.
    const originalBaseName = pdfDocument.getName().replace(/\.pdf$/i, '');
    Logger.log(`No title generated, keeping original name: ${originalBaseName}`);
    return `${originalBaseName}.pdf`;
  }

  return `${title}.pdf`;
}
/**
 * Asks the OpenAI completions endpoint for a title describing `text`.
 *
 * The prompt asks for the topic, the date, the company name and, when
 * mentioned, Alice and/or Bob. Adapt the instruction to your own use case.
 *
 * @param {string} text - Document text to summarise; it is trimmed before
 *   being embedded in the prompt.
 * @returns {string|undefined} The first completion, trimmed and with one
 *   leading/trailing quote character removed; undefined when the response
 *   contains no choices (the raw response body is logged in that case).
 */
function gptCompleteTitleByText(text) {
  const prompt = [
    // The trailing space after "and" matters: without it the two halves of
    // the sentence were glued together as "andthe company name".
    'A title should contain the short topic of the text, the date (year is sufficient) and '
      + 'the company name. If Alice or Bob or both are mentioned, it should be in the title.',
    `Text: "${text.trim()}"`,
    'Title:',
  ].join('\n\n');

  const options = {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${openAiApiKey}`,
      'Content-Type': 'application/json',
    },
    payload: JSON.stringify({
      // text-davinci-003 has been retired by OpenAI; gpt-3.5-turbo-instruct
      // is its replacement on the same completions endpoint.
      model: 'gpt-3.5-turbo-instruct',
      prompt,
      max_tokens: 90,
      temperature: 0, // deterministic output: same text, same title
    }),
  };

  // muteHttpExceptions is not set, so a non-2xx response makes fetch throw.
  const response = UrlFetchApp.fetch(openAiCompletionApiUrl, options);
  const responseBody = response.getContentText();
  const responseData = JSON.parse(responseBody);

  const hasChoice = Boolean(responseData)
    && Array.isArray(responseData.choices)
    && responseData.choices.length > 0;
  if (!hasChoice) {
    // Logger.log treats its first argument as a format string, so the body
    // has to be part of that string to actually show up in the log.
    Logger.log(`Failed to generate text: ${responseBody}`);
    return undefined;
  }

  const title = responseData.choices[0].text.trim().replace(/^['"]|['"]$/g, '');
  Logger.log(`GPT title: ${title}`);
  return title;
}
/**
 * Extracts the text of a PDF by letting Google Drive OCR it.
 *
 * The file's blob is uploaded through `Drive.Files.insert` with `ocr: true`,
 * the resulting document is read via `DocumentApp`, and the temporary
 * document is trashed afterwards.
 *
 * NOTE(review): `Drive.Files.insert` with `title`/`ocr` presumably requires
 * the Drive advanced service in version 2 — confirm the project setting.
 *
 * @param {Object} pdfDocument - File object providing `getName()`,
 *   `getMimeType()` and `getBlob()`.
 * @param {string} language - Passed through as `ocrLanguage`, e.g. 'en'.
 * @returns {string} Body text of the OCR result document.
 */
function convertPDFToText(pdfDocument, language) {
  const resource = {
    title: pdfDocument.getName().replace(/\.pdf$/, ''),
    mimeType: pdfDocument.getMimeType() || 'application/pdf',
  };
  const ocrOptions = {
    ocr: true,
    ocrLanguage: language,
    fields: 'id', // only the id of the temporary document is needed
  };

  const { id } = Drive.Files.insert(resource, pdfDocument.getBlob(), ocrOptions);

  try {
    return DocumentApp.openById(id).getBody().getText();
  } finally {
    // Always trash the temporary OCR document, even when reading it fails,
    // so failed runs do not leave stray documents behind.
    DriveApp.getFileById(id).setTrashed(true);
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment