hypernova7 · October 16, 2022 21:52
diff --git a/bot.js b/bot.js
 import { Telegraf } from 'telegraf'
 import { getTextFromImage, langs } from './ocr'

 // Telegram bot instance; the token is read from the BOT_TOKEN environment variable.
 const bot = new Telegraf(process.env.BOT_TOKEN)

 /**
  * `/ocr` command handler.
  *
  * - `/ocr langs` replies with the list of supported language codes.
  * - `/ocr <code>`, sent as a reply to a message containing a photo, runs OCR
  *   on that photo with the requested language and replies with the text.
  *
  * Any other argument (including none) gets a "Language not found" hint.
  */
 bot.command('ocr', async ctx => {
   const reply_to = ctx.message.reply_to_message
   // Drop the command itself, including the optional "@botname" suffix that
   // Telegram appends in group chats, and keep only the argument.
   const query = ctx.message.text.replace(/^\/ocr(?:@\w+)?/, '').trim()

   if (query === 'langs') {
     const list = Object.entries(langs)
       .map(([code, name]) => `\u2014 <b>${code}</b>: ${name}\n`)
       .join('')
     return ctx.replyWithHTML(`List of available languages:\n\n${list}`)
   }

   // Own-property check so inherited keys ("constructor", "toString", ...)
   // are not accepted as language codes.
   if (!Object.prototype.hasOwnProperty.call(langs, query)) {
     return ctx.replyWithHTML(`<b>Language not found</b>\nSend <code>/ocr langs</code> to see a list of available languages`)
   }

   // The command only makes sense as a reply to a message that carries a photo.
   const photos = reply_to && reply_to.photo
   if (!photos || photos.length === 0) {
     return ctx.reply('Reply to a message that contains a photo to extract its text')
   }

   // The last entry is taken as the best quality; read it without mutating the message.
   const { file_id } = photos[photos.length - 1]
   // getFileLink may resolve to a URL object rather than a string, so normalize it.
   const url = String(await ctx.telegram.getFileLink(file_id))
   // Use the language the user asked for (already validated above).
   const text = await getTextFromImage(query, url)

   // getTextFromImage yields undefined on failure, and Telegram rejects empty
   // messages, so fall back to a short notice in both cases.
   const answer = text && text.trim() ? text : 'No text could be recognized in this image'
   return ctx.reply(answer, {
     reply_to_message_id: reply_to.message_id,
     allow_sending_without_reply: true
   })
 })

 bot.launch() // start bot
diff --git a/ocr.js b/ocr.js
 import { resolve } from 'node:path';
 import { readFile } from 'node:fs/promises';
 import { createOCRClient } from 'tesseract-wasm/node';
 import sharp from 'sharp';
 import got from 'got';

 // Supported OCR languages: language code -> human-readable name.
 // Keys are interpolated into the traineddata download URL by getTextFromImage
 // and shown to users by the `/ocr langs` command.
 export const langs = {
  afr: 'Afrikaans',
  amh: 'Amharic',
  ara: 'Arabic',
  asm: 'Assamese',
  aze: 'Azerbaijani',
  aze_cyrl: 'Azerbaijani - Cyrillic',
  bel: 'Belarusian',
  ben: 'Bengali',
  bod: 'Tibetan',
  bos: 'Bosnian',
  bre: 'Breton',
  bul: 'Bulgarian',
  cat: 'Catalan; Valencian',
  ceb: 'Cebuano',
  ces: 'Czech',
  chi_sim: 'Chinese - Simplified',
  chi_tra: 'Chinese - Traditional',
  chr: 'Cherokee',
  cos: 'Corsican',
  cym: 'Welsh',
  dan: 'Danish',
  dan_frak: 'Danish - Fraktur (contrib)',
  deu: 'German',
  deu_frak: 'German - Fraktur (contrib)',
  dzo: 'Dzongkha',
  ell: 'Greek, Modern (1453-)',
  eng: 'English',
  enm: 'English, Middle (1100-1500)',
  epo: 'Esperanto',
  equ: 'Math / equation detection module',
  est: 'Estonian',
  eus: 'Basque',
  fao: 'Faroese',
  fas: 'Persian',
  fil: 'Filipino (old - Tagalog)',
  fin: 'Finnish',
  fra: 'French',
  frk: 'German - Fraktur',
  frm: 'French, Middle (ca.1400-1600)',
  fry: 'Western Frisian',
  gla: 'Scottish Gaelic',
  gle: 'Irish',
  glg: 'Galician',
  grc: 'Greek, Ancient (to 1453) (contrib)',
  guj: 'Gujarati',
  hat: 'Haitian; Haitian Creole',
  heb: 'Hebrew',
  hin: 'Hindi',
  hrv: 'Croatian',
  hun: 'Hungarian',
  hye: 'Armenian',
  iku: 'Inuktitut',
  ind: 'Indonesian',
  isl: 'Icelandic',
  ita: 'Italian',
  ita_old: 'Italian - Old',
  jav: 'Javanese',
  jpn: 'Japanese',
  kan: 'Kannada',
  kat: 'Georgian',
  kat_old: 'Georgian - Old',
  kaz: 'Kazakh',
  khm: 'Central Khmer',
  kir: 'Kirghiz; Kyrgyz',
  kmr: 'Kurmanji (Kurdish - Latin Script)',
  kor: 'Korean',
  kor_vert: 'Korean (vertical)',
  kur: 'Kurdish (Arabic Script)',
  lao: 'Lao',
  lat: 'Latin',
  lav: 'Latvian',
  lit: 'Lithuanian',
  ltz: 'Luxembourgish',
  mal: 'Malayalam',
  mar: 'Marathi',
  mkd: 'Macedonian',
  mlt: 'Maltese',
  mon: 'Mongolian',
  mri: 'Maori',
  msa: 'Malay',
  mya: 'Burmese',
  nep: 'Nepali',
  nld: 'Dutch; Flemish',
  nor: 'Norwegian',
  oci: 'Occitan (post 1500)',
  ori: 'Oriya',
  osd: 'Orientation and script detection module',
  pan: 'Panjabi; Punjabi',
  pol: 'Polish',
  por: 'Portuguese',
  pus: 'Pushto; Pashto',
  que: 'Quechua',
  ron: 'Romanian; Moldavian; Moldovan',
  rus: 'Russian',
  san: 'Sanskrit',
  sin: 'Sinhala; Sinhalese',
  slk: 'Slovak',
  slk_frak: 'Slovak - Fraktur (contrib)',
  slv: 'Slovenian',
  snd: 'Sindhi',
  spa: 'Spanish; Castilian',
  spa_old: 'Spanish; Castilian - Old',
  sqi: 'Albanian',
  srp: 'Serbian',
  srp_latn: 'Serbian - Latin',
  sun: 'Sundanese',
  swa: 'Swahili',
  swe: 'Swedish',
  syr: 'Syriac',
  tam: 'Tamil',
  tat: 'Tatar',
  tel: 'Telugu',
  tgk: 'Tajik',
  tgl: 'Tagalog (new - Filipino)',
  tha: 'Thai',
  tir: 'Tigrinya',
  ton: 'Tonga',
  tur: 'Turkish',
  uig: 'Uighur; Uyghur',
  ukr: 'Ukrainian',
  urd: 'Urdu',
  uzb: 'Uzbek',
  uzb_cyrl: 'Uzbek - Cyrillic',
  vie: 'Vietnamese',
  yid: 'Yiddish',
  yor: 'Yoruba'
 };

 // Identity lookup of every supported language code (code -> code), built
 // from the keys of `langs`; getTextFromImage uses it to validate its `lang`.
 const abbrLangs = Object.fromEntries(
   Object.keys(langs).map((code) => [code, code])
 );

 /**
  * Run OCR on an image and return the recognized text.
  *
  * The language model is downloaded from the tessdata_fast repository on
  * every call, then the image is decoded to raw pixels and handed to the
  * OCR client.
  *
  * @param {string} [lang='eng'] Language code (a key of `langs`). Empty,
  *   missing or unknown values fall back to 'eng'.
  * @param {string|URL} image An http(s) URL, or a file path resolved against
  *   `__dirname`.
  * @returns {Promise<string|undefined>} The recognized text, or `undefined`
  *   when any step fails (the error is logged, not thrown).
  */
 export async function getTextFromImage (lang = 'eng', image) {
   // Own-property lookup so inherited keys such as "constructor" cannot be
   // mistaken for a language code.
   const code = Object.prototype.hasOwnProperty.call(abbrLangs, lang)
     ? abbrLangs[lang]
     : 'eng';
   // Callers may pass a URL object instead of a string; normalize so the
   // `startsWith` check below works for both.
   const source = String(image);

   let text;
   const client = createOCRClient(); // initialize tesseract
   const modelURL = `https://github.com/tesseract-ocr/tessdata_fast/raw/main/${code}.traineddata`;

   try {
     // The model and the image are independent, so fetch them concurrently.
     // NOTE(review): `__dirname` is not defined under native ES modules;
     // confirm this file is transpiled, otherwise the local-file branch throws.
     const [model, buffer] = await Promise.all([
       got(modelURL).buffer(),
       source.startsWith('http')
         ? got(source).buffer()
         : readFile(resolve(__dirname, source))
     ]);

     // ensureAlpha() adds an alpha channel when the image has none, so the raw
     // buffer always has 4 channels per pixel (presumably the RGBA layout the
     // OCR client expects — confirm against tesseract-wasm).
     // Width/height are taken from the raw output itself so they always match `data`.
     const { data, info } = await sharp(buffer)
       .ensureAlpha()
       .raw()
       .toBuffer({ resolveWithObject: true });

     await client.loadModel(model); // load model
     // load image
     await client.loadImage({
       data,
       width: info.width,
       height: info.height
     });

     text = await client.getText(); // get text from image
   } catch (e) {
     // Best-effort contract: callers get `undefined`, but the failure is no
     // longer silent.
     console.error(`getTextFromImage failed (lang=${code}):`, e);
   } finally {
     client.destroy(); // destroy tesseract process
   }
   return text;
 }
	import { Telegraf } from 'telegraf'
	import { getTextFromImage, langs } from './ocr'

	// Telegram bot instance; the token is read from the BOT_TOKEN environment variable.
	const bot = new Telegraf(process.env.BOT_TOKEN)

	/**
	 * `/ocr` command handler.
	 *
	 * - `/ocr langs` replies with the list of supported language codes.
	 * - `/ocr <code>`, sent as a reply to a message containing a photo, runs OCR
	 *   on that photo with the requested language and replies with the text.
	 *
	 * Any other argument (including none) gets a "Language not found" hint.
	 */
	bot.command('ocr', async ctx => {
	  const reply_to = ctx.message.reply_to_message
	  // Drop the command itself, including the optional "@botname" suffix that
	  // Telegram appends in group chats, and keep only the argument.
	  const query = ctx.message.text.replace(/^\/ocr(?:@\w+)?/, '').trim()

	  if (query === 'langs') {
	    const list = Object.entries(langs)
	      .map(([code, name]) => `\u2014 <b>${code}</b>: ${name}\n`)
	      .join('')
	    return ctx.replyWithHTML(`List of available languages:\n\n${list}`)
	  }

	  // Own-property check so inherited keys ("constructor", "toString", ...)
	  // are not accepted as language codes.
	  if (!Object.prototype.hasOwnProperty.call(langs, query)) {
	    return ctx.replyWithHTML(`<b>Language not found</b>\nSend <code>/ocr langs</code> to see a list of available languages`)
	  }

	  // The command only makes sense as a reply to a message that carries a photo.
	  const photos = reply_to && reply_to.photo
	  if (!photos || photos.length === 0) {
	    return ctx.reply('Reply to a message that contains a photo to extract its text')
	  }

	  // The last entry is taken as the best quality; read it without mutating the message.
	  const { file_id } = photos[photos.length - 1]
	  // getFileLink may resolve to a URL object rather than a string, so normalize it.
	  const url = String(await ctx.telegram.getFileLink(file_id))
	  // Use the language the user asked for (already validated above).
	  const text = await getTextFromImage(query, url)

	  // getTextFromImage yields undefined on failure, and Telegram rejects empty
	  // messages, so fall back to a short notice in both cases.
	  const answer = text && text.trim() ? text : 'No text could be recognized in this image'
	  return ctx.reply(answer, {
	    reply_to_message_id: reply_to.message_id,
	    allow_sending_without_reply: true
	  })
	})

	bot.launch() // start bot
	import { resolve } from 'node:path';
	import { readFile } from 'node:fs/promises';
	import { createOCRClient } from 'tesseract-wasm/node';
	import sharp from 'sharp';
	import got from 'got';

	// Supported OCR languages: language code -> human-readable name.
	// Keys are interpolated into the traineddata download URL by getTextFromImage
	// and shown to users by the `/ocr langs` command.
	export const langs = {
	  afr: 'Afrikaans',
	  amh: 'Amharic',
	  ara: 'Arabic',
	  asm: 'Assamese',
	  aze: 'Azerbaijani',
	  aze_cyrl: 'Azerbaijani - Cyrillic',
	  bel: 'Belarusian',
	  ben: 'Bengali',
	  bod: 'Tibetan',
	  bos: 'Bosnian',
	  bre: 'Breton',
	  bul: 'Bulgarian',
	  cat: 'Catalan; Valencian',
	  ceb: 'Cebuano',
	  ces: 'Czech',
	  chi_sim: 'Chinese - Simplified',
	  chi_tra: 'Chinese - Traditional',
	  chr: 'Cherokee',
	  cos: 'Corsican',
	  cym: 'Welsh',
	  dan: 'Danish',
	  dan_frak: 'Danish - Fraktur (contrib)',
	  deu: 'German',
	  deu_frak: 'German - Fraktur (contrib)',
	  dzo: 'Dzongkha',
	  ell: 'Greek, Modern (1453-)',
	  eng: 'English',
	  enm: 'English, Middle (1100-1500)',
	  epo: 'Esperanto',
	  equ: 'Math / equation detection module',
	  est: 'Estonian',
	  eus: 'Basque',
	  fao: 'Faroese',
	  fas: 'Persian',
	  fil: 'Filipino (old - Tagalog)',
	  fin: 'Finnish',
	  fra: 'French',
	  frk: 'German - Fraktur',
	  frm: 'French, Middle (ca.1400-1600)',
	  fry: 'Western Frisian',
	  gla: 'Scottish Gaelic',
	  gle: 'Irish',
	  glg: 'Galician',
	  grc: 'Greek, Ancient (to 1453) (contrib)',
	  guj: 'Gujarati',
	  hat: 'Haitian; Haitian Creole',
	  heb: 'Hebrew',
	  hin: 'Hindi',
	  hrv: 'Croatian',
	  hun: 'Hungarian',
	  hye: 'Armenian',
	  iku: 'Inuktitut',
	  ind: 'Indonesian',
	  isl: 'Icelandic',
	  ita: 'Italian',
	  ita_old: 'Italian - Old',
	  jav: 'Javanese',
	  jpn: 'Japanese',
	  kan: 'Kannada',
	  kat: 'Georgian',
	  kat_old: 'Georgian - Old',
	  kaz: 'Kazakh',
	  khm: 'Central Khmer',
	  kir: 'Kirghiz; Kyrgyz',
	  kmr: 'Kurmanji (Kurdish - Latin Script)',
	  kor: 'Korean',
	  kor_vert: 'Korean (vertical)',
	  kur: 'Kurdish (Arabic Script)',
	  lao: 'Lao',
	  lat: 'Latin',
	  lav: 'Latvian',
	  lit: 'Lithuanian',
	  ltz: 'Luxembourgish',
	  mal: 'Malayalam',
	  mar: 'Marathi',
	  mkd: 'Macedonian',
	  mlt: 'Maltese',
	  mon: 'Mongolian',
	  mri: 'Maori',
	  msa: 'Malay',
	  mya: 'Burmese',
	  nep: 'Nepali',
	  nld: 'Dutch; Flemish',
	  nor: 'Norwegian',
	  oci: 'Occitan (post 1500)',
	  ori: 'Oriya',
	  osd: 'Orientation and script detection module',
	  pan: 'Panjabi; Punjabi',
	  pol: 'Polish',
	  por: 'Portuguese',
	  pus: 'Pushto; Pashto',
	  que: 'Quechua',
	  ron: 'Romanian; Moldavian; Moldovan',
	  rus: 'Russian',
	  san: 'Sanskrit',
	  sin: 'Sinhala; Sinhalese',
	  slk: 'Slovak',
	  slk_frak: 'Slovak - Fraktur (contrib)',
	  slv: 'Slovenian',
	  snd: 'Sindhi',
	  spa: 'Spanish; Castilian',
	  spa_old: 'Spanish; Castilian - Old',
	  sqi: 'Albanian',
	  srp: 'Serbian',
	  srp_latn: 'Serbian - Latin',
	  sun: 'Sundanese',
	  swa: 'Swahili',
	  swe: 'Swedish',
	  syr: 'Syriac',
	  tam: 'Tamil',
	  tat: 'Tatar',
	  tel: 'Telugu',
	  tgk: 'Tajik',
	  tgl: 'Tagalog (new - Filipino)',
	  tha: 'Thai',
	  tir: 'Tigrinya',
	  ton: 'Tonga',
	  tur: 'Turkish',
	  uig: 'Uighur; Uyghur',
	  ukr: 'Ukrainian',
	  urd: 'Urdu',
	  uzb: 'Uzbek',
	  uzb_cyrl: 'Uzbek - Cyrillic',
	  vie: 'Vietnamese',
	  yid: 'Yiddish',
	  yor: 'Yoruba'
	};

	// Identity lookup of every supported language code (code -> code), built
	// from the keys of `langs`; getTextFromImage uses it to validate its `lang`.
	const abbrLangs = Object.fromEntries(
	  Object.keys(langs).map((code) => [code, code])
	);

	/**
	 * Run OCR on an image and return the recognized text.
	 *
	 * The language model is downloaded from the tessdata_fast repository on
	 * every call, then the image is decoded to raw pixels and handed to the
	 * OCR client.
	 *
	 * @param {string} [lang='eng'] Language code (a key of `langs`). Empty,
	 *   missing or unknown values fall back to 'eng'.
	 * @param {string|URL} image An http(s) URL, or a file path resolved against
	 *   `__dirname`.
	 * @returns {Promise<string|undefined>} The recognized text, or `undefined`
	 *   when any step fails (the error is logged, not thrown).
	 */
	export async function getTextFromImage (lang = 'eng', image) {
	  // Own-property lookup so inherited keys such as "constructor" cannot be
	  // mistaken for a language code.
	  const code = Object.prototype.hasOwnProperty.call(abbrLangs, lang)
	    ? abbrLangs[lang]
	    : 'eng';
	  // Callers may pass a URL object instead of a string; normalize so the
	  // `startsWith` check below works for both.
	  const source = String(image);

	  let text;
	  const client = createOCRClient(); // initialize tesseract
	  const modelURL = `https://github.com/tesseract-ocr/tessdata_fast/raw/main/${code}.traineddata`;

	  try {
	    // The model and the image are independent, so fetch them concurrently.
	    // NOTE(review): `__dirname` is not defined under native ES modules;
	    // confirm this file is transpiled, otherwise the local-file branch throws.
	    const [model, buffer] = await Promise.all([
	      got(modelURL).buffer(),
	      source.startsWith('http')
	        ? got(source).buffer()
	        : readFile(resolve(__dirname, source))
	    ]);

	    // ensureAlpha() adds an alpha channel when the image has none, so the raw
	    // buffer always has 4 channels per pixel (presumably the RGBA layout the
	    // OCR client expects — confirm against tesseract-wasm).
	    // Width/height are taken from the raw output itself so they always match `data`.
	    const { data, info } = await sharp(buffer)
	      .ensureAlpha()
	      .raw()
	      .toBuffer({ resolveWithObject: true });

	    await client.loadModel(model); // load model
	    // load image
	    await client.loadImage({
	      data,
	      width: info.width,
	      height: info.height
	    });

	    text = await client.getText(); // get text from image
	  } catch (e) {
	    // Best-effort contract: callers get `undefined`, but the failure is no
	    // longer silent.
	    console.error(`getTextFromImage failed (lang=${code}):`, e);
	  } finally {
	    client.destroy(); // destroy tesseract process
	  }
	  return text;
	}