Skip to content

Instantly share code, notes, and snippets.

@chuv1
Last active November 18, 2023 09:37
Show Gist options
  • Save chuv1/9641abc8a5a1a9b3bb8c9177fb7ffa9e to your computer and use it in GitHub Desktop.
Save chuv1/9641abc8a5a1a9b3bb8c9177fb7ffa9e to your computer and use it in GitHub Desktop.
Test function to convert telegram message entities into HTML markup.
/**
 * Render the message text as HTML using its message entities.
 *
 * Differences from a naive "concatenate every entity" approach:
 *  - text located between two entities is kept (it is emitted before the
 *    next entity starts);
 *  - entity offsets/lengths are interpreted as UTF-16 code units, which is
 *    how the Telegram Bot API specifies them, so emoji and other
 *    non-BMP characters do not shift the markup;
 *  - all text and the text_link URL are HTML-escaped.
 *
 * Limitation: an entity that starts before the previous one has ended
 * (nested or overlapping formatting) is skipped so that its text is not
 * emitted twice; only the outer entity is rendered.
 *
 * NOTE(review): assumes getEntities() yields entities ordered by offset and
 * getText() returns UTF-8 - confirm against the enclosing class.
 *
 * @param bool $without_cmd Forwarded to getText() only when the message has no entities.
 * @return mixed HTML string; when there are no entities and getText() does not
 *               return a string, that value is returned unchanged.
 */
public function getHTML($without_cmd = false){
    $entities = $this->getEntities();
    if(empty($entities)){
        $plain = $this->getText($without_cmd);
        return \is_string($plain) ? htmlspecialchars($plain, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') : $plain;
    }

    // Work on a UTF-16LE copy: one code unit == two bytes, so entity offsets map to byte offsets * 2.
    $utf16 = mb_convert_encoding((string)$this->getText(), 'UTF-16LE', 'UTF-8');
    $total = (int)(\strlen($utf16) / 2);

    // Cut $length code units starting at $offset and return them as escaped UTF-8.
    $slice = function($offset, $length) use ($utf16){
        if($length <= 0){
            return '';
        }
        $chunk = (string)substr($utf16, $offset * 2, $length * 2);
        return htmlspecialchars(mb_convert_encoding($chunk, 'UTF-8', 'UTF-16LE'), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $html = '';
    $cursor = 0; // first code unit that has not been emitted yet
    foreach($entities as $entity){
        $offset = $entity->getOffset();
        $length = $entity->getLength();
        if($offset < $cursor){
            // Nested/overlapping entity: its text was already emitted by the outer entity.
            continue;
        }

        // Plain text between the previous entity (or the start) and this one.
        $html .= $slice($cursor, $offset - $cursor);
        $inner = $slice($offset, $length);

        switch($entity->getType()){
            case 'text_mention':
                $html .= '<a href="tg://user?id='.htmlspecialchars((string)$entity->getUser()->getId(), ENT_QUOTES, 'UTF-8').'">'.$inner.'</a>';
                break;
            case 'text_link':
                $html .= '<a href="'.htmlspecialchars((string)$entity->getUrl(), ENT_QUOTES, 'UTF-8').'">'.$inner.'</a>';
                break;
            case 'bold':
                $html .= '<b>'.$inner.'</b>';
                break;
            case 'italic':
                $html .= '<i>'.$inner.'</i>';
                break;
            case 'code':
                $html .= '<code>'.$inner.'</code>';
                break;
            case 'pre':
                $html .= '<pre>'.$inner.'</pre>';
                break;
            default:
                // mention, hashtag, cashtag, bot_command, url, email, phone_number
                // and any unknown type are rendered as plain text.
                $html .= $inner;
                break;
        }
        $cursor = $offset + $length;
    }

    // Remaining text after the last rendered entity.
    $html .= $slice($cursor, $total - $cursor);
    return $html;
}
@rozhok
Copy link

rozhok commented Mar 13, 2021

This won't work when entities are nested, for instance a bold link.

@rozhok
Copy link

rozhok commented Mar 14, 2021

It also won't work if the message contains emojis, because entity offsets are specified in UTF-16 code units. Here is a Ruby implementation which should handle all corner cases:

  # Interleaves HTML tags with the message text, working on UTF-16 bytes.
  #
  # message - Hash with a "text" String and an optional "entities" Array of
  #           Hashes carrying "offset" and "length" (multiplied by two here to
  #           turn them into byte positions inside the UTF-16 byte list).
  #
  # The rendered markup is printed to stdout (after a "=== RESULT ===" line).
  # Returns the Array of bytes, prefixed with the UTF-16 big-endian BOM.
  def to_html(message)
    entities = message.key?("entities") ? message["entities"] : []

    # Byte position => entities opening / closing at that position.
    openers = entities.group_by { |entity| entity["offset"] * 2 }
    closers = entities.group_by { |entity| (entity["offset"] + entity["length"]) * 2 }

    # encode("UTF-16") prepends a two-byte BOM, which [2..] strips again.
    units = String.new(message["text"]).encode("UTF-16").bytes[2..]
    out = []

    units.each_with_index do |byte, pos|
      openers.fetch(pos, []).each do |entity|
        out.push(*start_tag_to_text(entity, units).encode("UTF-16").bytes[2..])
      end

      out << byte

      # Close in reverse order so that tags opened later are closed first.
      closers.fetch(pos + 1, []).reverse.each do |entity|
        out.push(*end_tag_to_text(entity).encode("UTF-16").bytes[2..])
      end
    end

    # UTF-16 BOM
    out.unshift(254, 255)
    puts("=== RESULT ===")
    puts(out.pack("C*").force_encoding("UTF-16").encode("UTF-8"))
    out
  end

  # Returns the opening HTML tag for an entity, or "" for unsupported types.
  #
  # tag  - entity Hash with "type", "offset", "length" (in UTF-16 code units)
  #        and, for "text_link", "url".
  # text - the message text, either as an Array of UTF-16 big-endian bytes
  #        without BOM (what to_html passes) or as a String.
  #
  # The entity text is cut out by byte position (code unit * 2) and decoded
  # back to UTF-8 before being placed into the href, and attribute values
  # are HTML-escaped because they originate from user-authored messages.
  def start_tag_to_text(tag, text)
    escape = lambda do |value|
      value.to_s.gsub("&", "&amp;").gsub('"', "&quot;").gsub("<", "&lt;").gsub(">", "&gt;")
    end

    # Entity text as UTF-8, skipping the first `skip` code units (the "@" of a mention).
    entity_text = lambda do |skip|
      units = text.is_a?(String) ? text.encode("UTF-16BE").bytes : text
      first = (tag["offset"] + skip) * 2
      count = (tag["length"] - skip) * 2
      bytes = count > 0 ? (units[first, count] || []) : []
      bytes.pack("C*").force_encoding("UTF-16BE").encode("UTF-8", invalid: :replace, undef: :replace)
    end

    case tag["type"]
    when "bold"
      "<b>"
    when "italic"
      "<i>"
    when "underline"
      "<u>"
    when "strikethrough"
      "<s>"
    when "code"
      "<code>"
    when "pre"
      "<pre>"
    when "text_link"
      "<a href=\"#{escape.call(tag["url"])}\">"
    when "mention"
      "<a href=\"https://t.me/#{escape.call(entity_text.call(1))}\">"
    when "url"
      "<a href=\"#{escape.call(entity_text.call(0))}\">"
    else
      ""
    end
  end

  # Returns the closing HTML tag matching the entity's "type",
  # or an empty String when the type has no HTML counterpart here.
  def end_tag_to_text(tag)
    case tag["type"]
    when "bold"          then "</b>"
    when "italic"        then "</i>"
    when "underline"     then "</u>"
    when "strikethrough" then "</s>"
    when "code"          then "</code>"
    when "pre"           then "</pre>"
    when "text_link", "mention", "url"
      "</a>"
    else
      ""
    end
  end

@roymckenzie
Copy link

roymckenzie commented Jun 27, 2022

I was looking for a javascript implementation of this code. Thanks to your example I was able to develop a basic version. Thank you!

/**
 * Convert a Telegram update's text (or caption) plus its entities to HTML.
 *
 * - All text and attribute values are HTML-escaped.
 * - Nested entities covering the same text (e.g. a bold link) produce nested
 *   tags instead of repeating the text once per entity.
 * - An empty `entities` array yields the (escaped) text rather than ''.
 * - Entity types without a mapping below are rendered as plain text.
 *
 * Offsets/lengths are used as UTF-16 code units, which is what JavaScript
 * string indices are, so emoji do not shift the markup. Entities are expected
 * to be either disjoint or properly nested; zero-length entities are ignored.
 *
 * @param {{message: {text?: string, caption?: string, entities?: object[], caption_entities?: object[]}}} telegramData
 * @returns {string|undefined} HTML markup, or the text/caption value unchanged when it is not a string.
 */
const parseTelegramMessage = function (telegramData) {
  const { message } = telegramData
  const text = message.text || message.caption
  const entities = message.entities || message.caption_entities

  if (typeof text !== 'string') {
    return text
  }

  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch])

  if (!entities || entities.length === 0) {
    return escapeHtml(text)
  }

  // Returns [openTag, closeTag] for supported entity types, otherwise null.
  const tagsFor = (entity, entityText) => {
    switch (entity.type) {
      case 'bold':
        return ['<strong>', '</strong>']
      case 'italic':
        return ['<em>', '</em>']
      case 'text_link':
        return [`<a href="${escapeHtml(entity.url)}" target="_blank">`, '</a>']
      case 'url':
        return [`<a href="${escapeHtml(entityText)}" target="_blank">`, '</a>']
      case 'mention':
        return [`<a href="https://t.me/${escapeHtml(entityText.replace('@', ''))}" target="_blank">`, '</a>']
      case 'email':
        return [`<a href="mailto:${escapeHtml(entityText)}">`, '</a>']
      case 'phone_number':
        return [`<a href="tel:${escapeHtml(entityText)}">`, '</a>']
      default:
        return null
    }
  }

  const opening = new Map() // position -> tags opened there, outermost first
  const closing = new Map() // position -> tags closed there, innermost first

  // Outer entities first, so that their tags wrap the inner ones.
  const ordered = [...entities].sort((a, b) => a.offset - b.offset || b.length - a.length)

  for (const entity of ordered) {
    const start = Math.max(0, Math.min(entity.offset, text.length))
    const end = Math.max(start, Math.min(entity.offset + entity.length, text.length))
    if (end === start) {
      continue
    }

    const pair = tagsFor(entity, text.slice(start, end))
    if (pair === null) {
      continue
    }

    opening.set(start, (opening.get(start) || '') + pair[0])
    closing.set(end, pair[1] + (closing.get(end) || ''))
  }

  // Walk the text once, from one tag boundary to the next.
  const cuts = [...new Set([0, text.length, ...opening.keys(), ...closing.keys()])].sort((a, b) => a - b)

  let html = ''
  cuts.forEach((position, index) => {
    html += (closing.get(position) || '') + (opening.get(position) || '')
    if (index + 1 < cuts.length) {
      html += escapeHtml(text.slice(position, cuts[index + 1]))
    }
  })

  return html
}

@azkadev
Copy link

azkadev commented Sep 8, 2022

Wow, nice! Can someone give me an example of how to do the reverse in JavaScript, i.e. convert a formatted string into Telegram message entities?

@grischka
Copy link

grischka commented Dec 1, 2022

This is awesome and helped me a lot. Unfortunately, it causes problems with nested entities.

Example: a URL that should be displayed bold. Currently, it's being parsed like this
<a href="...">URL</a><strong>URL</strong>
It should be this
<a href="..."><strong>URL</strong></a>

Any ideas how to improve this?

@LeonidShastel
Copy link

Rewrote the code, now you can use multiple styles and attach links

/**
 * Convert a Telegram update's text (or caption) plus its entities to HTML,
 * supporting several entities over the same text (e.g. a bold link).
 *
 * Opening tags come from `getTag(entity, text)`; the closing tag is derived
 * from the opening one. Entities for which `getTag` returns nothing
 * (unsupported types such as `bot_command` or `hashtag`) are rendered as
 * plain text instead of raising a TypeError. Message text is HTML-escaped.
 *
 * @param {{message: {text?: string, caption?: string, entities?: object[], caption_entities?: object[]}}} telegramData
 * @returns {string|undefined} HTML markup, or the text/caption value unchanged when it is not a string.
 */
const parseTelegramMessage = (telegramData) => {
    const { message } = telegramData;
    const text = message.text || message.caption;
    const entities = message.entities || message.caption_entities;

    if (typeof text !== 'string') {
        return text;
    }

    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => value.replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    if (!entities || entities.length === 0) {
        return escapeHtml(text);
    }

    // Position -> concatenated tags; Maps give O(1) lookup per character
    // instead of filtering the whole tag list at every index.
    const openAt = new Map();
    const closeAt = new Map();

    for (const entity of entities) {
        const startTag = getTag(entity, text);
        if (!startTag) {
            // Unsupported entity type: leave its text unformatted.
            continue;
        }

        const closeTag = startTag.startsWith('<a ') ? '</a>' : `</${startTag.slice(1)}`;
        // Clamp to the text so that a tag is never opened without being closed.
        const start = Math.min(entity.offset, text.length);
        const end = Math.min(entity.offset + entity.length, text.length);

        // Later entities open inside earlier ones and therefore close before them.
        openAt.set(start, (openAt.get(start) || '') + startTag);
        closeAt.set(end, closeTag + (closeAt.get(end) || ''));
    }

    let html = '';
    // `<=` so that tags closing exactly at the end of the text are emitted too.
    for (let i = 0; i <= text.length; i++) {
        html += closeAt.get(i) || '';
        html += openAt.get(i) || '';
        if (i < text.length) {
            html += escapeHtml(text[i]);
        }
    }

    return html;
};

/**
 * Build the opening HTML tag for a Telegram message entity.
 *
 * Values placed inside `href` (the entity URL or the entity's own text) are
 * HTML-escaped, so a quote or ampersand coming from the message cannot break
 * out of the attribute.
 *
 * @param {{type: string, offset: number, length: number, url?: string}} entity
 * @param {string} text Full message text; the entity text is cut from it.
 * @returns {string|undefined} The opening tag, or `undefined` for entity
 *   types that have no mapping here.
 */
const getTag = (entity, text) => {
    const ATTR_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeAttr = (value) => String(value).replace(/[&<>"']/g, (ch) => ATTR_ESCAPES[ch]);

    const entityText = text.slice(entity.offset, entity.offset + entity.length);

    switch (entity.type) {
        case 'bold':
            return '<strong>';
        case 'italic':
            return '<em>';
        case 'code':
            return '<code>';
        case 'strikethrough':
            return '<s>';
        case 'underline':
            return '<u>';
        case 'pre':
            return '<pre>';
        case 'text_link':
            return `<a href="${escapeAttr(entity.url)}" target="_blank">`;
        case 'url':
            return `<a href="${escapeAttr(entityText)}" target="_blank">`;
        case 'mention':
            // Drop the leading "@" to get the username part of the t.me link.
            return `<a href="https://t.me/${escapeAttr(entityText.replace('@', ''))}" target="_blank">`;
        case 'email':
            return `<a href="mailto:${escapeAttr(entityText)}">`;
        case 'phone_number':
            return `<a href="tel:${escapeAttr(entityText)}">`;
        default:
            return undefined;
    }
};

@survtur
Copy link

survtur commented Dec 2, 2022

Guys, your code gave me the basic idea, and I made my own Python 3 solution. It works with nested entities too.

https://github.com/survtur/TelethonMessageToHtmlConverter

@ak4zh
Copy link

ak4zh commented Dec 19, 2022

Thanks, @LeonidShastel , used your code and added typescript types to use with grammY

import type { Context } from 'grammy';
import type { MessageEntity } from 'grammy/out/types.node';

/**
 * Convert the text of the current message and its entities to HTML,
 * supporting several entities over the same text (e.g. a bold link).
 *
 * Opening tags come from `getTag(entity, text)`; the closing tag is derived
 * from the opening one. Entities for which `getTag` returns `undefined`
 * (unsupported types such as `bot_command` or `hashtag`) are rendered as
 * plain text, so the strings "undefined" / "</undefined" never end up in the
 * output. Message text is HTML-escaped.
 *
 * @param ctx Context whose `msg.text` and `msg.entities` are read.
 * @returns HTML markup, or the text value unchanged when it is empty or missing.
 */
export const parseTelegramMessage = (ctx: Context) => {
	const text = ctx.msg?.text;
	const entities = ctx.msg?.entities;

	if (!text) {
		return text;
	}

	const HTML_ESCAPES: Record<string, string> = {
		'&': '&amp;',
		'<': '&lt;',
		'>': '&gt;',
		'"': '&quot;',
		"'": '&#39;'
	};
	const escapeHtml = (value: string) => value.replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

	if (!entities || entities.length === 0) {
		return escapeHtml(text);
	}

	// Position -> concatenated tags; Maps give O(1) lookup per character
	// instead of filtering the whole tag list at every index.
	const openAt = new Map<number, string>();
	const closeAt = new Map<number, string>();

	for (const entity of entities) {
		const startTag = getTag(entity, text);
		if (!startTag) {
			// Unsupported entity type: leave its text unformatted.
			continue;
		}

		const closeTag = startTag.startsWith('<a ') ? '</a>' : `</${startTag.slice(1)}`;
		// Clamp to the text so that a tag is never opened without being closed.
		const start = Math.min(entity.offset, text.length);
		const end = Math.min(entity.offset + entity.length, text.length);

		// Later entities open inside earlier ones and therefore close before them.
		openAt.set(start, (openAt.get(start) || '') + startTag);
		closeAt.set(end, closeTag + (closeAt.get(end) || ''));
	}

	let html = '';
	// `<=` so that tags closing exactly at the end of the text are emitted too.
	for (let i = 0; i <= text.length; i++) {
		html += closeAt.get(i) || '';
		html += openAt.get(i) || '';
		if (i < text.length) {
			html += escapeHtml(text[i]);
		}
	}

	return html;
};

/**
 * Build the opening HTML tag for a message entity.
 *
 * Values placed inside `href` (the entity URL or the entity's own text) are
 * HTML-escaped, so a quote or ampersand coming from the message cannot break
 * out of the attribute.
 *
 * @param entity Entity whose `type`, `offset`, `length` (and `url` for `text_link`) are read.
 * @param text Full message text; the entity text is cut from it.
 * @returns The opening tag, or `undefined` for entity types that have no mapping here.
 */
const getTag = (entity: MessageEntity, text: string) => {
	const ATTR_ESCAPES: Record<string, string> = {
		'&': '&amp;',
		'<': '&lt;',
		'>': '&gt;',
		'"': '&quot;',
		"'": '&#39;'
	};
	const escapeAttr = (value: string) =>
		String(value).replace(/[&<>"']/g, (ch) => ATTR_ESCAPES[ch]);

	const entityText = text.slice(entity.offset, entity.offset + entity.length);

	switch (entity.type) {
		case 'bold':
			return '<strong>';
		case 'italic':
			return '<em>';
		case 'code':
			return '<code>';
		case 'strikethrough':
			return '<s>';
		case 'underline':
			return '<u>';
		case 'pre':
			return '<pre>';
		case 'text_link':
			return `<a href="${escapeAttr(entity.url)}" target="_blank">`;
		case 'url':
			return `<a href="${escapeAttr(entityText)}" target="_blank">`;
		case 'mention':
			// Drop the leading "@" to get the username part of the t.me link.
			return `<a href="https://t.me/${escapeAttr(entityText.replace('@', ''))}" target="_blank">`;
		case 'email':
			return `<a href="mailto:${escapeAttr(entityText)}">`;
		case 'phone_number':
			return `<a href="tel:${escapeAttr(entityText)}">`;
		default:
			return undefined;
	}
};

@kvalood
Copy link

kvalood commented Jul 19, 2023

Function for @mtproto/core, for parsing messages

/**
 * Convert an MTProto-style message object (text in `message`/`caption`,
 * entities tagged by their `_` constructor name) to HTML.
 *
 * - `messageEntityUrl` links to the entity's own text; only
 *   `messageEntityTextUrl` reads `entity.url`.
 * - All text and attribute values are HTML-escaped.
 * - Nested entities covering the same text produce nested tags instead of
 *   repeating the text once per entity.
 * - An empty `entities` array yields the (escaped) text rather than "".
 * - Entity constructors without a mapping below are rendered as plain text.
 *
 * Entities are expected to be either disjoint or properly nested;
 * zero-length entities are ignored.
 *
 * @param {{message?: string, caption?: string, entities?: object[], caption_entities?: object[]}} msg
 * @returns {string|undefined} HTML markup, or the text value unchanged when it is not a string.
 */
export const parseTelegramMessage = function (msg) {
	const text = msg.message || msg.caption;
	const entities = msg.entities || msg.caption_entities;

	if (typeof text !== "string") {
		return text;
	}

	const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
	const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

	if (!entities || entities.length === 0) {
		return escapeHtml(text);
	}

	// Returns [openTag, closeTag] for supported entity constructors, otherwise null.
	const tagsFor = (entity, entityText) => {
		switch (entity._) {
			case "messageEntityBold":
				return ["<strong>", "</strong>"];
			case "messageEntityItalic":
				return ["<em>", "</em>"];
			case "messageEntityPre":
				return ["<pre>", "</pre>"];
			case "messageEntityCode":
				return ["<code>", "</code>"];
			case "messageEntityStrike":
				return ["<s>", "</s>"];
			case "messageEntityUnderline":
				return ["<u>", "</u>"];
			case "messageEntitySpoiler":
				return ['<span class="tg-spoiler">', "</span>"];
			case "messageEntityUrl":
				// The URL is the entity text itself; there is no separate `url` field to read.
				return [`<a href="${escapeHtml(entityText)}" target="_blank">`, "</a>"];
			case "messageEntityTextUrl":
				return [`<a href="${escapeHtml(entity.url)}" target="_blank">`, "</a>"];
			case "messageEntityMention":
				return [
					`<a href="https://t.me/${escapeHtml(entityText.replace("@", ""))}" target="_blank">`,
					"</a>",
				];
			case "messageEntityEmail":
				return [`<a href="mailto:${escapeHtml(entityText)}">`, "</a>"];
			case "messageEntityPhone":
				return [`<a href="tel:${escapeHtml(entityText)}">`, "</a>"];
			default:
				return null;
		}
	};

	const opening = new Map(); // position -> tags opened there, outermost first
	const closing = new Map(); // position -> tags closed there, innermost first

	// Outer entities first, so that their tags wrap the inner ones.
	const ordered = [...entities].sort((a, b) => a.offset - b.offset || b.length - a.length);

	for (const entity of ordered) {
		const start = Math.max(0, Math.min(entity.offset, text.length));
		const end = Math.max(start, Math.min(entity.offset + entity.length, text.length));
		if (end === start) {
			continue;
		}

		const pair = tagsFor(entity, text.slice(start, end));
		if (pair === null) {
			continue;
		}

		opening.set(start, (opening.get(start) || "") + pair[0]);
		closing.set(end, pair[1] + (closing.get(end) || ""));
	}

	// Walk the text once, from one tag boundary to the next.
	const cuts = [...new Set([0, text.length, ...opening.keys(), ...closing.keys()])].sort(
		(a, b) => a - b
	);

	let html = "";
	cuts.forEach((position, index) => {
		html += (closing.get(position) || "") + (opening.get(position) || "");
		if (index + 1 < cuts.length) {
			html += escapeHtml(text.slice(position, cuts[index + 1]));
		}
	});

	return html;
};

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment