Created
May 19, 2019 05:55
-
-
Save korkridake/39d72613961c65c93b066479c305c808 to your computer and use it in GitHub Desktop.
Tesseract Optical Character Recognition (OCR) Engine of Tesco Receipts (Ep.1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##############################################################################
# Tesseract Optical Character Recognition (OCR) Engine of Tesco Receipts (Ep.1)
# Author: @Kyle Akepanidtaworn
# Source Code: Using the Tesseract OCR engine in R (2018)
# Created Date: 5/19/2019
##############################################################################
# One-time setup, if the package is not installed yet:
# install.packages("tesseract")
library(tesseract)

# Create the OCR engine for the English ("eng") language data. The same
# engine object is handed to the OCR calls through `engine = eng`.
eng <- tesseract("eng")
##############################################################################
# Call OCR engine to use with my Tesco receipt
# Extract texts from images
##############################################################################
# Run OCR on the receipt image and keep the recognised text in `text`.
text <- tesseract::ocr("01-Tesco-Receipt-Image.jpg", engine = eng)

# cat() writes the text with its line breaks rendered, rather than as an
# escaped character vector.
cat(text)

# Sample output recorded by the author. It is kept as comments (not as a bare
# string literal) so that it is never evaluated or auto-printed by the script.
#> \
#> II-..
#> BICESTER 2 0845 677 9063
#> Q
#> FRESH MILK 0.81
#> S/BERRV CONSV 0.96
#> VIC PLUM CON 1.65
#> B/BERRV CONSVE 1.38
#> 188 LOAF 0.70
#> ORG AVOCADO‘S 1.69
#> BLUEBERRIES 1.49
#> CARROTS 1K0 059
#> ORG SWT PEPPRS 1.69
#> ONION SHALLOTS 0.63
#> AUBERGINES
#> 0.290 kg 0 £2.99/ kg 0.87
#> GRAPE N.SDLESS
#> 0.595 kg 0 2248/ kg 1.48
#> SUB’TOTAL 13.94
#> MULTIBUY SAVINGS
#> PREPACKED PRODUCE 5 FOR 4 *0.59
#> TOTAL SAVINGS ,059
#> TOTAL TO PM 13.35
#> CASH 20.00
#> CHANGE DUE 6.65
#> SIGN UP FOR CLUBCARD
#> Vnu coqu have earned 13
#> Clubcard pmnts in this transactwon
#> ‘ ‘IHIIIIIIlllllllllllllllllHllllllll
#> HIIHIIIIMIIHII !
#> 24/05/05 13:52 2113 007 1024 1048
##############################################################################
# The ocr_data() function returns all words in the image along
# with a bounding box and confidence rate.
##############################################################################
# Word-level OCR of the same receipt image, using the same engine.
results <- tesseract::ocr_data("01-Tesco-Receipt-Image.jpg", engine = eng)

# Print explicitly: a bare `results` is only auto-printed interactively or
# under Rscript, not when the file is run through source().
print(results)

# Sample output recorded by the author. It is kept as comments (not as a bare
# string literal) so that it is never evaluated or auto-printed by the script.
#> # A tibble: 100 x 3
#>    word     confidence bbox
#>    <chr>         <dbl> <chr>
#>  1 "\\"           65.5 0,0,1,17
#>  2 II-..          68.8 141,150,308,160
#>  3 BICESTER       74.8 118,164,193,179
#>  4 2              82.6 204,165,212,179
#>  5 0845           70.4 224,164,260,179
#>  6 677            80.8 272,164,299,179
#>  7 9063           76.0 310,164,347,179
#>  8 Q              60.2 386,224,403,238
#>  9 FRESH          79.6 49,244,96,259
#> 10 MILK           68.8 107,244,144,258
#> # ... with 90 more rows
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment