chuv1/tgEntitiesToHTML.php

rozhok · 2021-03-13T16:47:13Z

This won't work when you have, for instance, bold link.

rozhok · 2021-03-14T23:06:32Z

Also it won't work if message contains emojis, because offsets are specified in UTF-16 format. Here ruby implementation which should handle all corner cases:

  def to_html(message)
    start_tags = {}
    end_tags = {}
    if message.key?("entities")
      start_tags = message["entities"].group_by {|e| e["offset"] * 2}
      end_tags = message["entities"].group_by {|e| (e["offset"] + e["length"]) * 2}
    end
    html = []
    text = String.new(message["text"]).encode("UTF-16").bytes[2..]
    text.each_with_index do |c, i|
      if start_tags.key?(i)
        start_tags[i].each do |e|
          html.push *start_tag_to_text(e, text).encode("UTF-16").bytes[2..]
        end
      end
      html << c
      if end_tags.key?(i + 1)
        end_tags[i + 1].reverse.each do |e|
          html.push *end_tag_to_text(e).encode("UTF-16").bytes[2..]
        end
      end
    end
    # UTF-16 BOM
    html.insert(0, 254, 255)
    puts("=== RESULT ===")
    puts(html.pack("C*").force_encoding("UTF-16").encode("UTF-8"))
    html
  end

  def start_tag_to_text(tag, text)
    case tag["type"]
    when "bold"
      "<b>"
    when "italic"
      "<i>"
    when "underline"
      "<u>"
    when "strikethrough"
      "<s>"
    when "code"
      "<code>"
    when "pre"
      "<pre>"
    when "text_link"
      "<a href=\"#{tag["url"]}\">"
    when "mention"
      "<a href=\"https://t.me/#{text[tag["offset"] + 1..tag["offset"] + tag["length"]]}\">"
    when "url"
      "<a href=\"#{text[tag["offset"]..tag["offset"] + tag["length"] - 1]}\">"
    else
      ""
    end
  end

  def end_tag_to_text(tag)
    tags = {
        "bold" => "</b>",
        "italic" => "</i>",
        "underline" => "</u>",
        "strikethrough" => "</s>",
        "code" => "</code>",
        "pre" => "</pre>",
        "text_link" => "</a>",
        "mention" => "</a>",
        "url" => "</a>"
    }
    tags.fetch(tag["type"], "")
  end

roymckenzie · 2022-06-27T15:17:17Z

I was looking for a javascript implementation of this code. Thanks to your example I was able to develop a basic version. Thank you!

const parseTelegramMessage = function (telegramData) {
  const text = telegramData.message.text || telegramData.message.caption
  const entities = telegramData.message.entities || telegramData.message.caption_entities

  if (!entities) {
    return text
  }

  let html = ''

  entities.forEach((entity, index) => {
    // Characters before entity
    if (index === 0) {
      html += text.slice(0, entity.offset)
    }

    // Handle entity transformation
    const entityText = text.slice(entity.offset, entity.offset + entity.length)

    switch (entity.type) {
      case 'bold':
        html += `<strong>${entityText}</strong>`
        break
      case 'text_link':
        html += `<a href="${entity.url}" target="_blank">${entityText}</a>`
        break
      case 'url':
        html += `<a href="${entityText}" target="_blank">${entityText}</a>`
        break
      case 'italic':
        html += `<em>${entityText}</em>`
        break
      case 'mention':
        html += `<a href="https://t.me/${entityText.replace('@', '')}" target="_blank">${entityText}</a>`
        break
      case 'email':
        html += `<a href="mailto:${entityText}">${entityText}</a>`
        break
      case 'phone_number':
        html += `<a href="tel:${entityText}">${entityText}</a>`
        break
      default:
        html += `${entityText}`
    }

    // Characters after entity but before next entity
    if (entities.length > index + 1) {
      html += text.slice(entity.offset + entity.length, entities[index + 1].offset)
    }

    // Last characters after last entity
    if (entities.length === index + 1) {
      html += text.slice(entity.offset + entity.length)
    }
  })

  return html
}

azkadev · 2022-09-08T11:49:28Z

Wow nice can someone give me an example of how to string messages to telegram entities? in javascript

grischka · 2022-12-01T02:18:19Z

This is awesome and helped me a lot. Unfortunately, it causes problems with nested entities.

Example: a URL that should be displayed bold. Currently, it's being parsed like this
<a href="...">URL</a><strong>URL</strong>
It should be this
<a href="..."><strong>URL</strong></a>

Any ideas how to improve this?

LeonidShastel · 2022-12-01T14:50:45Z

Rewrote the code, now you can use multiple styles and attach links

const parseTelegramMessage = (telegramData) => {
    const text = telegramData.message.text || telegramData.message.caption
    const entities = telegramData.message.entities || telegramData.message.caption_entities

    if (!entities) {
        return text
    }

    let tags = [];

    entities.forEach((entity) => {
        const startTag = getTag(entity, text);
        let searchTag = tags.filter(tag=>tag.index===entity.offset)
        if (searchTag.length>0)
            searchTag[0].tag+=startTag;
        else
            tags.push({
                index: entity.offset,
                tag: startTag
            });

        const closeTag = startTag.indexOf("<a ")===0 ? "</a>" : "</"+startTag.slice(1);
        searchTag = tags.filter(tag=>tag.index===entity.offset+entity.length)
        if (searchTag.length>0)
            searchTag[0].tag = closeTag+searchTag[0].tag;
        else
            tags.push({
                index: entity.offset+entity.length,
                tag: closeTag
            })
    })
    let html = "";
    for (let i = 0; i<text.length; i++){
        const tag = tags.filter(tag=>tag.index===i);
        tags = tags.filter(tag=>tag.index!==i);
        if (tag.length>0)
            html+=tag[0].tag;
        html+=text[i];
    }
    if (tags.length>0)
        html+=tags[0].tag

    return html;
}

const getTag = (entity, text) => {
    const entityText = text.slice(entity.offset, entity.offset + entity.length)

    switch (entity.type) {
        case 'bold':
            return `<strong>`
        case 'text_link':
            return `<a href="${entity.url}" target="_blank">`
        case 'url':
            return `<a href="${entityText}" target="_blank">`
        case 'italic':
            return `<em>`
        case "code":
            return `<code>`
        case "strikethrough":
            return `<s>`
        case "underline":
            return `<u>`
        case "pre":
            return `<pre>`
        case 'mention':
            return `<a href="https://t.me/${entityText.replace('@', '')}" target="_blank">`
        case 'email':
            return `<a href="mailto:${entityText}">`
        case 'phone_number':
            return `<a href="tel:${entityText}">`
    }
}

survtur · 2022-12-02T18:45:54Z

Guys, your codes gave me basic ideas and I made my own Python3 solution. It works with nested entities too.

https://github.com/survtur/TelethonMessageToHtmlConverter

ak4zh · 2022-12-19T17:18:58Z

Thanks, @LeonidShastel , used your code and added typescript types to use with grammY

import type { Context } from 'grammy';
import type { MessageEntity } from 'grammy/out/types.node';

export const parseTelegramMessage = (ctx: Context) => {
	const text = ctx.msg?.text;
	const entities = ctx.msg?.entities;

	if (!entities || !text) {
		return text;
	}

	let tags: { index: number; tag: string | undefined }[] = [];

	entities.forEach((entity) => {
		const startTag = getTag(entity, text);
		let searchTag = tags.filter((tag) => tag.index === entity.offset);
		if (searchTag.length > 0 && startTag) searchTag[0].tag += startTag;
		else
			tags.push({
				index: entity.offset,
				tag: startTag
			});

		const closeTag = startTag?.indexOf('<a ') === 0 ? '</a>' : '</' + startTag?.slice(1);
		searchTag = tags.filter((tag) => tag.index === entity.offset + entity.length);
		if (searchTag.length > 0) searchTag[0].tag = closeTag + searchTag[0].tag;
		else
			tags.push({
				index: entity.offset + entity.length,
				tag: closeTag
			});
	});
	let html = '';
	for (let i = 0; i < text.length; i++) {
		const tag = tags.filter((tag) => tag.index === i);
		tags = tags.filter((tag) => tag.index !== i);
		if (tag.length > 0) html += tag[0].tag;
		html += text[i];
	}
	if (tags.length > 0) html += tags[0].tag;

	return html;
};

const getTag = (entity: MessageEntity, text: string) => {
	const entityText = text.slice(entity.offset, entity.offset + entity.length);

	switch (entity.type) {
		case 'bold':
			return `<strong>`;
		case 'text_link':
			return `<a href="${entity.url}" target="_blank">`;
		case 'url':
			return `<a href="${entityText}" target="_blank">`;
		case 'italic':
			return `<em>`;
		case 'code':
			return `<code>`;
		case 'strikethrough':
			return `<s>`;
		case 'underline':
			return `<u>`;
		case 'pre':
			return `<pre>`;
		case 'mention':
			return `<a href="https://t.me/${entityText.replace('@', '')}" target="_blank">`;
		case 'email':
			return `<a href="mailto:${entityText}">`;
		case 'phone_number':
			return `<a href="tel:${entityText}">`;
	}
};

kvalood · 2023-07-19T04:22:20Z

Function for @mtproto/core, for parsing messages

export const parseTelegramMessage = function (msg) {
	const text = msg.message || msg.caption;
	const entities = msg.entities || msg.caption_entities;

	if (!entities) {
		return text;
	}

	let html = "";

	entities.forEach((entity, index) => {
		// Characters before entity
		if (index === 0) {
			html += text.slice(0, entity.offset);
		}

		// Handle entity transformation
		const entityText = text.slice(
			entity.offset,
			entity.offset + entity.length
		);

		switch (entity._) {
			case "messageEntityBold":
				html += `<strong>${entityText}</strong>`;
				break;
			case "messageEntityPre":
				html += `<pre>${entityText}</pre>`;
				break;
			case "messageEntityCode":
				html += `<code>${entityText}</code>`;
				break;
			case "messageEntityStrike":
				html += `<s>${entityText}</s>`;
				break;
			case "messageEntityUnderline":
				html += `<u>${entityText}</u>`;
				break;
			case "messageEntitySpoiler":
				html += `<span class="tg-spoiler">${entityText}</span>`;
				break;
            case "messageEntityUrl":
			case "messageEntityTextUrl":
				html += `<a href="${entity.url}" target="_blank">${entityText}</a>`;
				break;
			case "messageEntityItalic":
				html += `<em>${entityText}</em>`;
				break;
			case "messageEntityMention":
				html += `<a href="https://t.me/${entityText.replace(
					"@",
					""
				)}" target="_blank">${entityText}</a>`;
				break;
			case "messageEntityEmail":
				html += `<a href="mailto:${entityText}">${entityText}</a>`;
				break;
			case "messageEntityPhone":
				html += `<a href="tel:${entityText}">${entityText}</a>`;
				break;
			default:
				html += `${entityText}`;
		}

		// Characters after entity but before next entity
		if (entities.length > index + 1) {
			html += text.slice(
				entity.offset + entity.length,
				entities[index + 1].offset
			);
		}

		// Last characters after last entity
		if (entities.length === index + 1) {
			html += text.slice(entity.offset + entity.length);
		}
	});

	return html;
};

	public function getHTML($without_cmd = false){

	if(empty($this->getEntities())){
	return $this->getText($without_cmd);
	}

	$text = $this->getText();
	$html = '';
	$entities_count = \count($this->getEntities())-1;

	foreach($this->getEntities() as $k => $entity){

	if($k === 0){

	$html .= mb_substr($text, 0, $entity->getOffset());

	}

	switch($entity->getType()){

	default:
	case 'mention':
	case 'hashtag':
	case 'cashtag':
	case 'bot_command':
	case 'url':
	case 'email':
	case 'phone_number':

	$html .= mb_substr($text, $entity->getOffset(), $entity->getLength());

	break;
	case 'text_mention':

	$html .= '<a href="tg://user?id='.$entity->getUser()->getId().'">'.mb_substr($text, $entity->getOffset(), $entity->getLength()).'</a>';

	break;
	case 'text_link':

	$html .= '<a href="'.$entity->getUrl().'">'.mb_substr($text, $entity->getOffset(), $entity->getLength()).'</a>';

	break;

	case 'bold':

	$html .= '<b>' . mb_substr($text, $entity->getOffset(), $entity->getLength()) . '</b>';

	break;

	case 'italic':

	$html .= '<i>' . mb_substr($text, $entity->getOffset(), $entity->getLength()) . '</i>';

	break;
	case 'code':

	$html .= '<code>' . mb_substr($text, $entity->getOffset(), $entity->getLength()) . '</code>';

	break;
	case 'pre':

	$html .= '<pre>' . mb_substr($text, $entity->getOffset(), $entity->getLength()) . '</pre>';

	break;

	}

	if($k === $entities_count){

	$html .= mb_substr($text, $entity->getOffset() + $entity->getLength());

	}

	}

	return $html;

	}

chuv1/tgEntitiesToHTML.php

rozhok commented Mar 13, 2021

Uh oh!

rozhok commented Mar 14, 2021

Uh oh!

roymckenzie commented Jun 27, 2022 •

edited

Loading

Uh oh!

azkadev commented Sep 8, 2022

Uh oh!

grischka commented Dec 1, 2022

Uh oh!

LeonidShastel commented Dec 1, 2022

Uh oh!

survtur commented Dec 2, 2022 •

edited

Loading

Uh oh!

ak4zh commented Dec 19, 2022

Uh oh!

kvalood commented Jul 19, 2023 •

edited

Loading

Uh oh!

chuv1/tgEntitiesToHTML.php

rozhok commented Mar 13, 2021

Uh oh!

rozhok commented Mar 14, 2021

Uh oh!

roymckenzie commented Jun 27, 2022 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

azkadev commented Sep 8, 2022

Uh oh!

grischka commented Dec 1, 2022

Uh oh!

LeonidShastel commented Dec 1, 2022

Uh oh!

survtur commented Dec 2, 2022 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

ak4zh commented Dec 19, 2022

Uh oh!

kvalood commented Jul 19, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

roymckenzie commented Jun 27, 2022 •

edited

Loading

survtur commented Dec 2, 2022 •

edited

Loading

kvalood commented Jul 19, 2023 •

edited

Loading