Created
November 29, 2022 05:54
-
-
Save armanokka/1071f9d7f17b4a6599a9d5f7cdf6d932 to your computer and use it in GitHub Desktop.
Apply entities HTML | Convert telegram entities to HTML | Golang
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import (
	"html"
	"strconv"
	"strings"
	"unicode/utf16"

	"github.com/go-telegram-bot-api/telegram-bot-api"
	"golang.org/x/text/unicode/norm"
)
// indexDelim returns how many leading UTF-16 code units of text should go
// into the next chunk so that the chunk is at most limit units long.
//
// If text already fits within limit (or limit is not positive, which is
// treated as "no limit") the whole of text is taken. Otherwise the cut is
// placed right after the last code unit within the first limit units that
// matches one of the characters in delims. When no delimiter is found the cut
// falls at limit, moved back by one unit if it would otherwise separate the
// two halves of a surrogate pair.
//
// delims is matched per UTF-16 code unit, so it is expected to contain only
// characters from the Basic Multilingual Plane.
func indexDelim(text []uint16, limit int, delims string) int {
	if limit <= 0 || len(text) <= limit {
		return len(text)
	}

	// Cutting between a high and a low surrogate would make both halves
	// decode to U+FFFD. Only step back when that still leaves a non-empty
	// chunk, so the caller always makes progress.
	if limit > 1 &&
		text[limit-1] >= 0xD800 && text[limit-1] <= 0xDBFF &&
		text[limit] >= 0xDC00 && text[limit] <= 0xDFFF {
		limit--
	}

	delimiters := utf16.Encode([]rune(delims))
	for i := limit - 1; i >= 0; i-- {
		for _, d := range delimiters {
			if text[i] == d {
				// Keep the delimiter at the end of the current chunk.
				return i + 1
			}
		}
	}
	return limit
}
// SplitIntoChunksBySentences. You can merge output by "" | |
func SplitIntoChunksBySentences(text string, limit int) []string { | |
if len(text) < limit { | |
return []string{text} | |
} | |
chunks := make([]string, 0, len(text)/limit+1) | |
points := utf16.Encode([]rune(text)) | |
for i := 0; i < len(points); { | |
offset := indexDelim(points[i:], limit, ".!?;\r\n\t\f\v*)") | |
ch := string(utf16.Decode(points[i : i+offset])) | |
chunks = append(chunks, ch) | |
i += offset | |
} | |
return chunks | |
} | |
// ApplyEntitiesHtml adds <notranslate></notranslate> to some types of entities | |
func ApplyEntitiesHtml(text string, entities []tgbotapi.MessageEntity, messageLengthLimit int) []string { | |
chunks := SplitIntoChunksBySentences(text, messageLengthLimit) | |
if len(entities) == 0 { | |
for i, chunk := range chunks { | |
chunks[i] = html.EscapeString(chunk) | |
} | |
return chunks | |
} | |
var chunkOffset int | |
for i := 0; i < len(chunks); i += 1 { | |
chunk := utf16.Encode([]rune(chunks[i])) | |
pointers := make(map[int]string) | |
for _, entity := range entities { | |
entityStart := entity.Offset | |
entityEnd := entityStart + entity.Length | |
if entityEnd < chunkOffset || entityStart > chunkOffset+len(chunk) { | |
continue | |
} | |
var before, after string | |
switch entity.Type { | |
case "code", "pre": | |
before, after = `<notranslate><code>`, `</code></notranslate>` | |
case "bold": | |
before, after = `<b>`, `</b>` | |
case "italic": | |
before, after = `<i>`, `</i>` | |
case "underline": | |
before, after = `<u>`, `</u>` | |
case "strikethrough": | |
before, after = `<s>`, `</s>` | |
case "text_link": | |
before, after = `<notranslate><a href="`+entity.URL+`">`, `</a></notranslate>` | |
case "text_mention": | |
before, after = `<notranslate><a href="tg://user?id=`+strconv.FormatInt(entity.User.ID, 10)+`">`, `</a></notranslate>` | |
case "spoiler": | |
before, after = "<span class=\"tg-spoiler\">", "</span>" | |
case "mention", "hashtag", "cashtag", "bot_command", "url", "email", "phone_number", "custom_emoji": | |
before, after = "<notranslate>", "</notranslate>" | |
} | |
//pointers[entity.Offset] += before | |
//pointers[entity.Offset+entity.Length] = after + pointers[entity.Offset+entity.Length] | |
// All scenarios: | |
// 1. tag starts before chunk and ends before/in/after chunk | |
// 2. tag starts in chunk and ends in/after chunk | |
// 3. tag starts after chunk and ends after chunk | |
// Scenarios we have to handle: | |
// 1. tag ends after chunk [*] | |
// 2. tag starts before chunk and ends after chunk | |
if entityStart < chunkOffset { | |
pointers[0] += before | |
} else { | |
pointers[entityStart-chunkOffset] += before | |
} | |
if entityEnd > chunkOffset+len(chunk) { | |
pointers[len(chunk)] += after | |
} else { | |
pointers[entityEnd-chunkOffset] += after | |
} | |
} | |
var out = make([]uint16, 0, len(chunk)) | |
for i, ch := range chunk { | |
if m, ok := pointers[i]; ok { | |
out = append(out, utf16.Encode([]rune(m))...) | |
} | |
if escaped, ok := htmlEscape[ch]; ok { | |
out = append(out, escaped...) | |
} else { | |
out = append(out, ch) | |
} | |
} | |
if m, ok := pointers[len(chunk)]; ok { | |
out = append(out, utf16.Encode([]rune(m))...) | |
} | |
chunks[i] = norm.NFKC.String(strings.ReplaceAll(string(utf16.Decode(out)), "<br>", "\n")) | |
chunkOffset += len(chunk) | |
} | |
return chunks | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Your code has a problem: it doesn't close the tags in the required order. If two entities have the same bounds, it produces
"<b><u>You down to ride or what? </b></u>"
instead of
"<b><u>You down to ride or what? </u></b>"
Revised code (I also removed notranslate, as it was causing some problems):