-
-
Save kltng/c25422538e15e155bccef0e289ea3faa to your computer and use it in GitHub Desktop.
Google Apps Script for performing OCR on all JPEGs found in the specified Drive folder. Extracts the text to a Google Sheet, where it is mapped to each JPEG's filename.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Runs OCR on every JPEG in a Drive folder and lists the results in a sheet.
 *
 * For each JPEG found directly in the configured folder, a Google Doc holding
 * the OCR text is created in that same folder (via the Drive advanced service,
 * API v2). The active sheet of the configured spreadsheet is then cleared and
 * rewritten with a "Filename" / "Text" header followed by one row per Google
 * Doc in the folder.
 *
 * Requires the "Drive API" advanced service to be enabled for the script.
 *
 * @throws {Error} If no Drive folder named `folderName` exists.
 */
function extractTextOnOpen() {
  //ADD YOUR VALUES BELOW
  var folderName = "[YOUR PROJECT FOLDER]";
  var sheetId = "[YOUR SHEET ID]";

  //Define folder
  //Guard the iterator: calling next() on an empty iterator fails with the
  //opaque "iterator has reached the end" error instead of naming the cause.
  var folders = DriveApp.getFoldersByName(folderName);
  if (!folders.hasNext()) {
    throw new Error('No Drive folder named "' + folderName + '" was found. ' +
      'Set folderName to the exact name of an existing folder.');
  }
  var folder = folders.next();
  var folderId = folder.getId();

  //Find all jpegs in folder
  var images = folder.getFilesByType("image/jpeg");
  while (images.hasNext()) {
    //Convert each jpeg to a Google Doc with OCR
    var image = images.next();
    var resource = {
      title: stripExtension_(image.getName()),
      mimeType: "image/jpeg",
      //Create the Doc directly inside the project folder. Looking the new
      //file up again by title could pick an unrelated file of the same name.
      parents: [{ id: folderId }]
    };
    Drive.Files.insert(resource, image.getBlob(), { ocr: true });
  }

  //Find all Google Docs in folder
  var docs = folder.getFilesByType("application/vnd.google-apps.document");

  //Set up spreadsheet
  var ss = SpreadsheetApp.openById(sheetId);
  SpreadsheetApp.setActiveSpreadsheet(ss);
  Logger.log('File name: ' + ss.getName());
  var sheet = SpreadsheetApp.getActiveSheet();

  //Collect OCR text first so the sheet is only cleared once every document
  //has been read successfully.
  var rows = [["Filename", "Text"]];
  while (docs.hasNext()) {
    var docFile = docs.next();
    var doc = DocumentApp.openById(docFile.getId());
    rows.push([doc.getName(), doc.getBody().getText()]);
  }

  //Populate spreadsheet with OCR text in a single write
  sheet.clear();
  sheet.getRange(1, 1, rows.length, 2).setValues(rows);
}

/**
 * Removes the final extension from a file name.
 *
 * Only the last ".ext" segment is dropped, so "scan.page1.jpg" becomes
 * "scan.page1". Names without a dot, or whose only dot is the first
 * character, are returned unchanged.
 *
 * @param {string} fileName File name to shorten.
 * @returns {string} The name without its trailing extension.
 */
function stripExtension_(fileName) {
  var lastDot = fileName.lastIndexOf(".");
  return lastDot > 0 ? fileName.substring(0, lastDot) : fileName;
}
Striking. Thanks!
This worked for me thank you! However, I am only wanting to extract certain text from the OCR into my sheet. The JPEG contains lots of information but I want it to selectively put only certain information into the google sheet... does anyone know how I would do this? Thank you
You can try regular expressions if the data has certain patterns.
Hi, can you help me with my error? "Exception: Cannot retrieve the next object: iterator has reached the end."
Not working for me: returns "Exception: Cannot retrieve the next object: iterator has reached the end.
extractTextOnOpen" :(
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
That's great. Thank you.
Do you know if there is a way to extract vertical Japanese text as well? I have managed to extract Japanese, but it is always detected as horizontal text.