Skip to content

Instantly share code, notes, and snippets.

@armanokka
Created November 29, 2022 05:54
Show Gist options
  • Save armanokka/1071f9d7f17b4a6599a9d5f7cdf6d932 to your computer and use it in GitHub Desktop.
Save armanokka/1071f9d7f17b4a6599a9d5f7cdf6d932 to your computer and use it in GitHub Desktop.
Apply entities HTML | Convert telegram entities to HTML | Golang
import (
"github.com/go-telegram-bot-api/telegram-bot-api"
"unicode/utf16"
"golang.org/x/text/unicode/norm"
)
// indexDelim returns how many leading UTF-16 code units of text belong to the
// next chunk.
//
// If text already fits into limit code units, or limit is not positive (no
// usable bound), the whole of text is taken. Otherwise the cut is placed right
// after the last code unit within the first limit units that matches one of
// delims; when none matches, the cut falls at limit itself. Matching is done
// per UTF-16 code unit.
//
// The result is always at least 1 for non-empty text, so a caller that
// advances by the returned value always makes progress.
func indexDelim(text []uint16, limit int, delims string) (offset int) {
	const (
		highSurrogateMin = 0xD800
		highSurrogateMax = 0xDBFF
		lowSurrogateMin  = 0xDC00
		lowSurrogateMax  = 0xDFFF
	)

	if limit <= 0 || len(text) <= limit {
		return len(text)
	}

	// Do not cut between the two halves of a surrogate pair: decoding the
	// halves separately would turn the character into two U+FFFD. With
	// limit == 1 the pair cannot be kept together without returning 0, so the
	// cut is left where it is.
	if limit > 1 &&
		text[limit-1] >= highSurrogateMin && text[limit-1] <= highSurrogateMax &&
		text[limit] >= lowSurrogateMin && text[limit] <= lowSurrogateMax {
		limit--
	}
	text = text[:limit]

	delimiters := utf16.Encode([]rune(delims))
	// Scan backwards so the chunk is as long as possible.
	for i := len(text) - 1; i >= 0; i-- {
		for _, d := range delimiters {
			if text[i] == d {
				return i + 1
			}
		}
	}
	return len(text)
}
// SplitIntoChunksBySentences cuts text into consecutive chunks of at most
// limit UTF-16 code units each, preferring to cut right after one of the
// delimiter characters ".!?;\r\n\t\f\v*)" (see indexDelim).
//
// Nothing is dropped between chunks, so the output can be merged back with "".
// A non-positive limit means "no limit" and yields the whole text as a single
// chunk; previously limit == 0 panicked with a division by zero.
func SplitIntoChunksBySentences(text string, limit int) []string {
	if limit <= 0 {
		return []string{text}
	}

	// The limit is expressed in UTF-16 code units, so measure the text in the
	// same unit rather than in bytes.
	points := utf16.Encode([]rune(text))
	if len(points) <= limit {
		return []string{text}
	}

	chunks := make([]string, 0, len(points)/limit+1)
	for i := 0; i < len(points); {
		offset := indexDelim(points[i:], limit, ".!?;\r\n\t\f\v*)")
		chunks = append(chunks, string(utf16.Decode(points[i:i+offset])))
		i += offset
	}
	return chunks
}
// ApplyEntitiesHtml splits text with SplitIntoChunksBySentences (using
// messageLengthLimit as the limit) and renders entities as HTML tags inside
// every chunk. Entity offsets and lengths are interpreted as UTF-16 code units
// relative to the whole text. Code, links, mentions and the other entity types
// listed in the switch below are additionally wrapped in
// <notranslate></notranslate>.
//
// Every returned chunk is well-formed on its own: an entity that crosses a
// chunk boundary is closed at the end of one chunk and reopened at the start
// of the next, and tags are always closed in the reverse order of opening.
// '<', '>' and '&' in the text are escaped through htmlEscape; when there are
// no entities at all the chunks are escaped with html.EscapeString instead.
// Entities of unknown type, with a non-positive length, or a text_mention
// without a user are ignored.
func ApplyEntitiesHtml(text string, entities []tgbotapi.MessageEntity, messageLengthLimit int) []string {
	chunks := SplitIntoChunksBySentences(text, messageLengthLimit)
	if len(entities) == 0 {
		for i, chunk := range chunks {
			chunks[i] = html.EscapeString(chunk)
		}
		return chunks
	}

	// Work on a copy ordered outermost-first (start ascending, longer entity
	// first on ties) so opening tags nest correctly and closing tags can be
	// emitted in reverse. The insertion sort is stable: entities with identical
	// bounds keep the caller's order, and the caller's slice is left untouched.
	ordered := make([]tgbotapi.MessageEntity, len(entities))
	copy(ordered, entities)
	for i := 1; i < len(ordered); i++ {
		for j := i; j > 0; j-- {
			prev, cur := ordered[j-1], ordered[j]
			if prev.Offset < cur.Offset || (prev.Offset == cur.Offset && prev.Length >= cur.Length) {
				break
			}
			ordered[j-1], ordered[j] = cur, prev
		}
	}

	var chunkOffset int
	for i := range chunks {
		chunk := utf16.Encode([]rune(chunks[i]))
		chunkEnd := chunkOffset + len(chunk)

		// Tags keyed by their position inside the chunk, in UTF-16 code units.
		// Opening and closing tags are kept apart so that, at any position,
		// everything that ends there is written before anything that starts.
		opening := make(map[int][]string)
		closing := make(map[int][]string)

		for _, entity := range ordered {
			entityStart := entity.Offset
			entityEnd := entityStart + entity.Length
			// Skip empty entities and entities that do not overlap this chunk.
			// An entity that merely touches a chunk edge belongs to the
			// neighbouring chunk and must not leave an empty tag pair here.
			if entity.Length <= 0 || entityEnd <= chunkOffset || entityStart >= chunkEnd {
				continue
			}

			var before, after string
			switch entity.Type {
			case "code", "pre":
				before, after = `<notranslate><code>`, `</code></notranslate>`
			case "bold":
				before, after = `<b>`, `</b>`
			case "italic":
				before, after = `<i>`, `</i>`
			case "underline":
				before, after = `<u>`, `</u>`
			case "strikethrough":
				before, after = `<s>`, `</s>`
			case "text_link":
				// The URL comes from the message; escape it so it cannot break
				// out of the attribute.
				before, after = `<notranslate><a href="`+html.EscapeString(entity.URL)+`">`, `</a></notranslate>`
			case "text_mention":
				if entity.User == nil {
					continue
				}
				// The int64 conversion is a no-op for an int64 ID.
				// NOTE(review): it should also keep this compiling against
				// library versions where ID is a plain int — confirm.
				before, after = `<notranslate><a href="tg://user?id=`+strconv.FormatInt(int64(entity.User.ID), 10)+`">`, `</a></notranslate>`
			case "spoiler":
				before, after = "<span class=\"tg-spoiler\">", "</span>"
			case "mention", "hashtag", "cashtag", "bot_command", "url", "email", "phone_number", "custom_emoji":
				before, after = "<notranslate>", "</notranslate>"
			default:
				continue
			}

			// Clamp the entity to the chunk: a tag that started in an earlier
			// chunk reopens at position 0, one that continues into a later
			// chunk closes at the chunk end.
			start, end := entityStart-chunkOffset, entityEnd-chunkOffset
			if start < 0 {
				start = 0
			}
			if end > len(chunk) {
				end = len(chunk)
			}
			opening[start] = append(opening[start], before)
			// Prepend: the entity opened last must be closed first.
			closing[end] = append([]string{after}, closing[end]...)
		}

		out := make([]uint16, 0, len(chunk))
		for j, unit := range chunk {
			for _, tag := range closing[j] {
				out = append(out, utf16.Encode([]rune(tag))...)
			}
			for _, tag := range opening[j] {
				out = append(out, utf16.Encode([]rune(tag))...)
			}
			if escaped, ok := htmlEscape[unit]; ok {
				out = append(out, escaped...)
			} else {
				out = append(out, unit)
			}
		}
		for _, tag := range closing[len(chunk)] {
			out = append(out, utf16.Encode([]rune(tag))...)
		}

		// The "<br>" replacement is kept from the original. None of the tags
		// produced above is "<br>", and a literal "<br>" in the text has
		// already been escaped by this point.
		chunks[i] = norm.NFKC.String(strings.ReplaceAll(string(utf16.Decode(out)), "<br>", "\n"))
		chunkOffset = chunkEnd
	}
	return chunks
}
@apepenkov
Copy link

apepenkov commented Apr 12, 2024

Your code has a problem: it doesn't close tags in the required order. If two entities have the same bounds, it produces
"<b><u>You down to ride or what? </b></u>"
instead of
"<b><u>You down to ride or what? </u></b>"
Revised code (I also removed notranslate, as it was causing some problems):

package entities_html_unparsing

import (
	tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
	"golang.org/x/text/unicode/norm"
	"html"
	"strconv"
	"strings"
	"unicode/utf16"
)

// YOINKED from https://gist.github.com/armanokka/1071f9d7f17b4a6599a9d5f7cdf6d932
// ty, kind stranger
// I'd go mad if I had to write this myself (I tried)

// htmlEscape maps the UTF-16 code units of '&', '<' and '>' to the UTF-16
// encoding of the HTML entity that replaces them.
var htmlEscape = func() map[uint16][]uint16 {
	replacements := []struct {
		unit   uint16
		entity string
	}{
		{'&', "&amp;"},
		{'<', "&lt;"},
		{'>', "&gt;"},
	}

	table := make(map[uint16][]uint16, len(replacements))
	for _, r := range replacements {
		table[r.unit] = utf16.Encode([]rune(r.entity))
	}
	return table
}()

// in reports whether val occurs anywhere in arr.
func in(arr []uint16, val uint16) bool {
	for i := 0; i < len(arr); i++ {
		if arr[i] == val {
			return true
		}
	}
	return false
}

// indexDelim returns how many leading UTF-16 code units of text belong to the
// next chunk.
//
// If text already fits into limit code units, or limit is not positive (no
// usable bound), the whole of text is taken. Otherwise the cut is placed right
// after the last code unit within the first limit units that matches one of
// delims; when none matches, the cut falls at limit itself. Matching is done
// per UTF-16 code unit.
//
// The result is always at least 1 for non-empty text, so a caller that
// advances by the returned value always makes progress.
func indexDelim(text []uint16, limit int, delims string) (offset int) {
	const (
		highSurrogateMin = 0xD800
		highSurrogateMax = 0xDBFF
		lowSurrogateMin  = 0xDC00
		lowSurrogateMax  = 0xDFFF
	)

	if limit <= 0 || len(text) <= limit {
		return len(text)
	}

	// Do not cut between the two halves of a surrogate pair: decoding the
	// halves separately would turn the character into two U+FFFD. With
	// limit == 1 the pair cannot be kept together without returning 0, so the
	// cut is left where it is.
	if limit > 1 &&
		text[limit-1] >= highSurrogateMin && text[limit-1] <= highSurrogateMax &&
		text[limit] >= lowSurrogateMin && text[limit] <= lowSurrogateMax {
		limit--
	}
	text = text[:limit]

	delimiters := utf16.Encode([]rune(delims))
	// Scan backwards so the chunk is as long as possible.
	for i := len(text) - 1; i >= 0; i-- {
		if in(delimiters, text[i]) {
			return i + 1
		}
	}
	return len(text)
}

// SplitIntoChunksBySentences cuts text into consecutive chunks of at most
// limit UTF-16 code units each, preferring to cut right after one of the
// delimiter characters ".!?;\r\n\t\f\v*)" (see indexDelim).
//
// Nothing is dropped between chunks, so the output can be merged back with "".
// A non-positive limit means "no limit" and yields the whole text as a single
// chunk; previously limit == 0 panicked with a division by zero.
func SplitIntoChunksBySentences(text string, limit int) []string {
	if limit <= 0 {
		return []string{text}
	}

	// The limit is expressed in UTF-16 code units, so measure the text in the
	// same unit rather than in bytes.
	points := utf16.Encode([]rune(text))
	if len(points) <= limit {
		return []string{text}
	}

	chunks := make([]string, 0, len(points)/limit+1)
	for i := 0; i < len(points); {
		offset := indexDelim(points[i:], limit, ".!?;\r\n\t\f\v*)")
		chunks = append(chunks, string(utf16.Decode(points[i:i+offset])))
		i += offset
	}
	return chunks
}

// ApplyEntitiesHtml splits text with SplitIntoChunksBySentences (using
// messageLengthLimit as the limit) and renders entities as HTML tags inside
// every chunk. Entity offsets and lengths are interpreted as UTF-16 code units
// relative to the whole text.
//
// Every returned chunk is well-formed on its own: an entity that crosses a
// chunk boundary is closed at the end of one chunk and reopened at the start
// of the next, and tags are always closed in the reverse order of opening.
// '<', '>' and '&' in the text are escaped through htmlEscape; when there are
// no entities at all the chunks are escaped with html.EscapeString instead.
// Entities of a type without an HTML tag here, with a non-positive length, or
// a text_mention without a user are ignored.
func ApplyEntitiesHtml(text string, entities []tgbotapi.MessageEntity, messageLengthLimit int) []string {
	chunks := SplitIntoChunksBySentences(text, messageLengthLimit)
	if len(entities) == 0 {
		for i, chunk := range chunks {
			chunks[i] = html.EscapeString(chunk)
		}
		return chunks
	}

	// Work on a copy ordered outermost-first (start ascending, longer entity
	// first on ties) so opening tags nest correctly and closing tags can be
	// emitted in reverse. The insertion sort is stable: entities with identical
	// bounds keep the caller's order, and the caller's slice is left untouched.
	ordered := make([]tgbotapi.MessageEntity, len(entities))
	copy(ordered, entities)
	for i := 1; i < len(ordered); i++ {
		for j := i; j > 0; j-- {
			prev, cur := ordered[j-1], ordered[j]
			if prev.Offset < cur.Offset || (prev.Offset == cur.Offset && prev.Length >= cur.Length) {
				break
			}
			ordered[j-1], ordered[j] = cur, prev
		}
	}

	var chunkOffset int
	for i := range chunks {
		chunk := utf16.Encode([]rune(chunks[i]))
		chunkEnd := chunkOffset + len(chunk)

		// Tags keyed by their position inside the chunk, in UTF-16 code units.
		// Opening and closing tags are kept apart so that, at any position,
		// everything that ends there is written before anything that starts.
		opening := make(map[int][]string)
		closing := make(map[int][]string)

		for _, entity := range ordered {
			entityStart := entity.Offset
			entityEnd := entityStart + entity.Length
			// Skip empty entities and entities that do not overlap this chunk.
			// An entity that merely touches a chunk edge belongs to the
			// neighbouring chunk; letting it through used to emit "</b><b>".
			if entity.Length <= 0 || entityEnd <= chunkOffset || entityStart >= chunkEnd {
				continue
			}

			var before, after string
			switch entity.Type {
			case "code", "pre":
				before, after = `<code>`, `</code>`
			case "bold":
				before, after = `<b>`, `</b>`
			case "italic":
				before, after = `<i>`, `</i>`
			case "underline":
				before, after = `<u>`, `</u>`
			case "strikethrough":
				before, after = `<s>`, `</s>`
			case "text_link":
				// The URL comes from the message; escape it so it cannot break
				// out of the attribute.
				before, after = `<a href="`+html.EscapeString(entity.URL)+`">`, `</a>`
			case "text_mention":
				if entity.User == nil {
					continue
				}
				before, after = `<a href="tg://user?id=`+strconv.FormatInt(entity.User.ID, 10)+`">`, `</a>`
			case "spoiler":
				before, after = "<span class=\"tg-spoiler\">", "</span>"
			default:
				// No tag for this entity type: its text is emitted as-is.
				continue
			}

			// Clamp the entity to the chunk: a tag that started in an earlier
			// chunk reopens at position 0, one that continues into a later
			// chunk closes at the chunk end.
			start, end := entityStart-chunkOffset, entityEnd-chunkOffset
			if start < 0 {
				start = 0
			}
			if end > len(chunk) {
				end = len(chunk)
			}
			opening[start] = append(opening[start], before)
			// Prepend: the entity opened last must be closed first.
			closing[end] = append([]string{after}, closing[end]...)
		}

		out := make([]uint16, 0, len(chunk))
		for j, unit := range chunk {
			for _, tag := range closing[j] {
				out = append(out, utf16.Encode([]rune(tag))...)
			}
			for _, tag := range opening[j] {
				out = append(out, utf16.Encode([]rune(tag))...)
			}
			if escaped, ok := htmlEscape[unit]; ok {
				out = append(out, escaped...)
			} else {
				out = append(out, unit)
			}
		}
		for _, tag := range closing[len(chunk)] {
			out = append(out, utf16.Encode([]rune(tag))...)
		}

		// The "<br>" replacement is kept from the original. None of the tags
		// produced above is "<br>", and a literal "<br>" in the text has
		// already been escaped by this point.
		chunks[i] = norm.NFKC.String(strings.ReplaceAll(string(utf16.Decode(out)), "<br>", "\n"))
		chunkOffset = chunkEnd
	}
	return chunks
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment