Last active
October 16, 2022 21:52
-
-
Save hypernova7/9f3586a681c5aa6fc404ccb909f03f7b to your computer and use it in GitHub Desktop.
Telegram OCR Bot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Telegraf } from 'telegraf' | |
import { getTextFromImage, langs } from './ocr' | |
const bot = new Telegraf(process.env.BOT_TOKEN);

// Strips the leading command, including the optional "@botname" suffix
// (e.g. "/ocr@my_bot eng"), so only the argument remains.
const OCR_COMMAND_PREFIX = /^\/ocr(?:@\w+)?\s*/;

/**
 * /ocr <language> — run OCR on the photo of the message being replied to.
 * /ocr langs      — list every supported language code.
 *
 * The language argument must be one of the keys of `langs`.
 */
bot.command('ocr', async (ctx) => {
  const repliedTo = ctx.message.reply_to_message;
  const query = ctx.message.text.replace(OCR_COMMAND_PREFIX, '').trim();

  if (query === 'langs') {
    const list = Object.entries(langs)
      .map(([code, name]) => `\u2014 <b>${code}</b>: ${name}\n`)
      .join('');
    return ctx.replyWithHTML(`List of available languages:\n\n${list}`);
  }

  // Own-property check so inherited keys such as "constructor" are rejected.
  if (!Object.prototype.hasOwnProperty.call(langs, query)) {
    return ctx.replyWithHTML(
      '<b>Language not found</b>\nSend <code>/ocr langs</code> to see a list of available languages'
    );
  }

  const photos = repliedTo && repliedTo.photo;
  if (!photos || photos.length === 0) {
    return ctx.reply('Reply to a message that contains a photo to extract its text.');
  }

  const replyOptions = {
    reply_to_message_id: repliedTo.message_id,
    allow_sending_without_reply: true
  };

  // The last entry is taken as the best quality one; read it without
  // mutating the message object (the original used pop()).
  const { file_id } = photos[photos.length - 1];
  const link = await ctx.telegram.getFileLink(file_id);
  // getFileLink may resolve to a URL object; the OCR helper expects a string.
  const text = await getTextFromImage(query, String(link));

  // The helper resolves to undefined on failure, and an empty message
  // cannot be sent, so report the problem instead.
  if (!text || text.trim().length === 0) {
    return ctx.reply('No text could be recognized in this image.', replyOptions);
  }

  return ctx.reply(text, replyOptions);
});

bot.launch(); // start bot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { resolve } from 'node:path'; | |
import { readFile } from 'node:fs/promises'; | |
import { createOCRClient } from 'tesseract-wasm/node'; | |
import sharp from 'sharp'; | |
import got from 'got'; | |
/**
 * Languages available for OCR, keyed by language code.
 * Each value is the human-readable name shown to users.
 */
export const langs = {
  afr: 'Afrikaans',
  amh: 'Amharic',
  ara: 'Arabic',
  asm: 'Assamese',
  aze: 'Azerbaijani',
  aze_cyrl: 'Azerbaijani - Cyrillic',
  bel: 'Belarusian',
  ben: 'Bengali',
  bod: 'Tibetan',
  bos: 'Bosnian',
  bre: 'Breton',
  bul: 'Bulgarian',
  cat: 'Catalan; Valencian',
  ceb: 'Cebuano',
  ces: 'Czech',
  chi_sim: 'Chinese - Simplified',
  chi_tra: 'Chinese - Traditional',
  chr: 'Cherokee',
  cos: 'Corsican',
  cym: 'Welsh',
  dan: 'Danish',
  dan_frak: 'Danish - Fraktur (contrib)',
  deu: 'German',
  deu_frak: 'German - Fraktur (contrib)',
  dzo: 'Dzongkha',
  ell: 'Greek, Modern (1453-)',
  eng: 'English',
  enm: 'English, Middle (1100-1500)',
  epo: 'Esperanto',
  equ: 'Math / equation detection module',
  est: 'Estonian',
  eus: 'Basque',
  fao: 'Faroese',
  fas: 'Persian',
  fil: 'Filipino (old - Tagalog)',
  fin: 'Finnish',
  fra: 'French',
  frk: 'German - Fraktur',
  frm: 'French, Middle (ca.1400-1600)',
  fry: 'Western Frisian',
  gla: 'Scottish Gaelic',
  gle: 'Irish',
  glg: 'Galician',
  grc: 'Greek, Ancient (to 1453) (contrib)',
  guj: 'Gujarati',
  hat: 'Haitian; Haitian Creole',
  heb: 'Hebrew',
  hin: 'Hindi',
  hrv: 'Croatian',
  hun: 'Hungarian',
  hye: 'Armenian',
  iku: 'Inuktitut',
  ind: 'Indonesian',
  isl: 'Icelandic',
  ita: 'Italian',
  ita_old: 'Italian - Old',
  jav: 'Javanese',
  jpn: 'Japanese',
  kan: 'Kannada',
  kat: 'Georgian',
  kat_old: 'Georgian - Old',
  kaz: 'Kazakh',
  khm: 'Central Khmer',
  kir: 'Kirghiz; Kyrgyz',
  kmr: 'Kurmanji (Kurdish - Latin Script)',
  kor: 'Korean',
  kor_vert: 'Korean (vertical)',
  kur: 'Kurdish (Arabic Script)',
  lao: 'Lao',
  lat: 'Latin',
  lav: 'Latvian',
  lit: 'Lithuanian',
  ltz: 'Luxembourgish',
  mal: 'Malayalam',
  mar: 'Marathi',
  mkd: 'Macedonian',
  mlt: 'Maltese',
  mon: 'Mongolian',
  mri: 'Maori',
  msa: 'Malay',
  mya: 'Burmese',
  nep: 'Nepali',
  nld: 'Dutch; Flemish',
  nor: 'Norwegian',
  oci: 'Occitan (post 1500)',
  ori: 'Oriya',
  osd: 'Orientation and script detection module',
  pan: 'Panjabi; Punjabi',
  pol: 'Polish',
  por: 'Portuguese',
  pus: 'Pushto; Pashto',
  que: 'Quechua',
  ron: 'Romanian; Moldavian; Moldovan',
  rus: 'Russian',
  san: 'Sanskrit',
  sin: 'Sinhala; Sinhalese',
  slk: 'Slovak',
  slk_frak: 'Slovak - Fraktur (contrib)',
  slv: 'Slovenian',
  snd: 'Sindhi',
  spa: 'Spanish; Castilian',
  spa_old: 'Spanish; Castilian - Old',
  sqi: 'Albanian',
  srp: 'Serbian',
  srp_latn: 'Serbian - Latin',
  sun: 'Sundanese',
  swa: 'Swahili',
  swe: 'Swedish',
  syr: 'Syriac',
  tam: 'Tamil',
  tat: 'Tatar',
  tel: 'Telugu',
  tgk: 'Tajik',
  tgl: 'Tagalog (new - Filipino)',
  tha: 'Thai',
  tir: 'Tigrinya',
  ton: 'Tonga',
  tur: 'Turkish',
  uig: 'Uighur; Uyghur',
  ukr: 'Ukrainian',
  urd: 'Urdu',
  uzb: 'Uzbek',
  uzb_cyrl: 'Uzbek - Cyrillic',
  vie: 'Vietnamese',
  yid: 'Yiddish',
  yor: 'Yoruba'
};
// Lookup table of accepted language codes: every key of `langs` maps to itself.
const abbrLangs = Object.fromEntries(
  Object.keys(langs).map((code) => [code, code])
);
/**
 * Run OCR over an image and return the text that was recognized.
 *
 * @param {string} [lang='eng'] - Language code; any value that is not a key of
 *   `abbrLangs` falls back to 'eng'.
 * @param {string|URL} image - Image location. Values starting with "http" are
 *   downloaded; anything else is read as a file path relative to this module.
 * @returns {Promise<string|undefined>} The recognized text, or `undefined` when
 *   any step (download, decoding, recognition) fails. Failures are logged.
 */
export async function getTextFromImage (lang = 'eng', image) {
  // Own-property check so inherited keys such as "constructor" cannot leak
  // into the model URL.
  const isKnown = typeof lang === 'string' &&
    Object.prototype.hasOwnProperty.call(abbrLangs, lang);
  const code = isKnown ? abbrLangs[lang] : 'eng';
  // Callers may pass a URL object instead of a string; normalize it so the
  // string check below does not throw.
  const source = String(image);
  const modelURL = `https://github.com/tesseract-ocr/tessdata_fast/raw/main/${code}.traineddata`;

  let text;
  const client = createOCRClient(); // initialize tesseract
  try {
    const model = await got(modelURL).buffer(); // download the language model
    // NOTE(review): __dirname only exists in CommonJS. If this file runs as a
    // native ES module the local-file branch throws a ReferenceError —
    // confirm the build setup.
    const buffer = await (source.startsWith('http')
      ? got(source).buffer()
      : readFile(resolve(__dirname, source)));
    // ensureAlpha() adds an alpha channel, so the raw output is 4 channels per
    // pixel — presumably the layout loadImage expects; verify against the
    // tesseract-wasm docs.
    const img = sharp(buffer).ensureAlpha();
    const { width, height } = await img.metadata();
    const data = await img.raw().toBuffer();

    await client.loadModel(model);
    await client.loadImage({
      data,
      width,
      height
    });
    text = await client.getText();
  } catch (err) {
    // Keep the best-effort contract (resolve to undefined) but do not hide
    // the reason for the failure.
    console.error(`OCR failed for language "${code}":`, err);
  } finally {
    client.destroy(); // always release the tesseract worker
  }
  return text;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment