jh0ker · April 9, 2021 00:07
diff --git a/telegram_example.py b/telegram_example.py
 """
 The original reason I wrote this.
 Also provides a more real-world example on how to use the result.

 Uses python-telegram-bot for sending messages to Telegram.
 """

 import html
 from typing import List

 from telegram import Bot

 from tokenizer import Kind, Token, make_api_client, tokenize

 # Token handed to Bot() in the __main__ demo below.
 # Empty placeholder here; it has to be filled in before the demo can send anything.
 TELEGRAM_TOKEN = ""


 def render_for_telegram(tokens: List[Token]) -> str:
    """
    Example renderer producing Telegram HTML-style markup from a list of tokens.

    - User mentions become links to the twitter profile of the mentioned user.
    - Regular URLs become links to the expanded URL, labelled with the display URL.
    - Media URLs are ignored.
    - Every other token is emitted as escaped plain text.

    Every value that is interpolated into the markup is HTML-escaped, so that
    characters like &, <, > or " inside a URL or name cannot break the markup.
    See https://core.telegram.org/bots/api#html-style

    Returns the rendered message text as a single string.
    """

    # Collect the pieces and join once at the end instead of growing a string
    parts = []

    for token in tokens:
        if token.kind == Kind.USER_MENTION:
            # Format user mentions using HTML-style formatting
            screen_name = html.escape(token.data["screen_name"])
            label = html.escape(token.text)
            parts.append(f'<a href="https://twitter.com/{screen_name}">{label}</a>')

        elif token.kind == Kind.URL:
            # Instead of ugly t.co shortlinks, we can use the display_url
            # and expanded_url to create a nice link in Telegram.
            # html.escape also escapes quotes, which keeps the href attribute intact.
            display_url = html.escape(token.data["display_url"])
            expanded_url = html.escape(token.data["expanded_url"])
            parts.append(f'<a href="{expanded_url}">{display_url}</a>')

        elif token.kind == Kind.MEDIA:
            # For media URLs, simply do nothing
            continue

        else:
            # Make sure we escape all other text to prevent parsing errors, see link above
            parts.append(html.escape(token.text))

    return "".join(parts)


 if __name__ == "__main__":
    # Demo: fetch one tweet, tokenize it, print the Telegram rendering
    # and send the same rendering as a Telegram message.
    api = make_api_client()

    # Tweet with some entities and special characters (Emoji, <, > and &)
    tweet = api.get_status("1265545575203774465", tweet_mode="extended")

    # Tweet with a url entity
    # tweet = api.get_status("1349440375711215618", tweet_mode="extended")

    # Tweet with media
    # tweet = api.get_status("903108707571171328", tweet_mode="extended")

    tokens = tokenize(tweet)

    # Render once and reuse the result for both the console output and the message
    rendered = render_for_telegram(tokens)

    print("Custom render of tweet for Telegram:")
    print(rendered)

    # Make Telegram bot instance
    bot = Bot(TELEGRAM_TOKEN)

    # Send rendered message to Telegram with HTML parse_mode enabled
    bot.send_message(
        chat_id="10049375",
        text=rendered,
        parse_mode="HTML",
        disable_web_page_preview=True,
    )
diff --git a/tokenizer.py b/tokenizer.py
 from typing import List
 import html
 from dataclasses import dataclass
 from enum import Enum

 import tweepy

 # Credentials passed to tweepy.AppAuthHandler in make_api_client().
 # Both are empty placeholders here.
 API_KEY = ""
 API_SECRET = ""


 ### Utility classes ###


 class Kind(Enum):
    """
    Describes the kind of entity/token.
    String value follows Twitter API names.

    get_entities() converts the keys of tweet.entities into members
    by value, i.e. Kind("urls") is Kind.URL.
    """

    TEXT = "text"  # Used for regular text between entities
    # The remaining members are used for entities taken from the API response
    HASHTAG = "hashtags"
    SYMBOL = "symbols"
    USER_MENTION = "user_mentions"
    URL = "urls"
    MEDIA = "media"


 @dataclass
 class Entity:
    """ Extracted entity from Tweet JSON """

    # Entity dict as found in tweet.entities; the code in this file reads data["indices"]
    data: dict
    # Which list of tweet.entities the entity came from
    kind: Kind


 @dataclass
 class Token:
    """ Part of the tokenized Tweet """

    # The slice of the tweet text covered by this token
    text: str
    # Entity data the token was built from; tokenize() passes None for Kind.TEXT tokens
    data: dict
    # Kind.TEXT for regular text, otherwise the kind of the underlying entity
    kind: Kind


 ### Algorithm for tokenizing tweet ###


 def entity_sorter(entity: Entity) -> int:
    """ Sort key for entities: the first element of the entity's "indices" """
    indices = entity.data["indices"]
    return indices[0]


 def get_entities(tweet: tweepy.Status) -> List[Entity]:
    """
    Put all the different kinds of entities from the tweet into one list,
    wrapped in Entity objects and sorted by their start index.
    Each Entity object has a flag about which kind it is, depending on which list it came from.

    Keys of tweet.entities that do not correspond to a member of Kind are
    skipped instead of raising a ValueError, so an entity list this module
    does not know about cannot break tokenizing.
    """
    all_entities = []

    for entity_kind, entities in tweet.entities.items():
        try:
            kind = Kind(entity_kind)
        except ValueError:
            # Not an entity kind we know how to handle
            continue

        all_entities.extend(
            Entity(data=entity_data, kind=kind) for entity_data in entities
        )

    # list.sort is stable, so entities with the same start index keep their order
    all_entities.sort(key=entity_sorter)

    return all_entities


 def tokenize(tweet: tweepy.Status) -> List[Token]:
    """
    Uses the entities in the API response to break up the tweet text into parts, called Tokens.
    Each token has the text of that part, the original entity data from the API response
    (None for regular text) and the kind (regular text, hashtag, mention etc.)

    Regular text between entities is HTML-unescaped, entity text is taken
    from the tweet text as-is.

    Entities that start before the end of the previously emitted entity
    (overlapping or repeated index ranges) are skipped, so that no part of
    the tweet text ends up in the result twice.

    Reads tweet.full_text, so the tweet has to provide that attribute.
    """
    tweet_text = tweet.full_text
    entities = get_entities(tweet)

    tokens = []
    current_index = 0

    for entity in entities:
        start_index, end_index = entity.data["indices"]

        # This range was (at least partly) covered by a previous entity already
        if start_index < current_index:
            continue

        # Check if there is regular text between the last
        # entity (or the start of the tweet) and this one
        # and create a Token for it
        if start_index > current_index:
            tokens.append(
                Token(
                    text=html.unescape(tweet_text[current_index:start_index]),
                    data=None,
                    kind=Kind.TEXT,
                )
            )

        # Convert Entity objects into Token objects by extracting their text
        tokens.append(
            Token(
                text=tweet_text[start_index:end_index],
                data=entity.data,
                kind=entity.kind,
            )
        )

        current_index = end_index

    # Make sure we get the rest of the tweet as a text token if the tweet either
    # - Doesn't end with an entity, or
    # - Doesn't have any entities at all
    if current_index < len(tweet_text):
        tokens.append(
            Token(
                text=html.unescape(tweet_text[current_index:]),
                data=None,
                kind=Kind.TEXT,
            )
        )

    return tokens


 ### Demo ###


 def render_simple(tokens: List[Token]) -> str:
    """
    Simplest possible renderer: concatenates the text of all tokens,
    in the order they appear in the list.
    """
    pieces = [token.text for token in tokens]
    return "".join(pieces)


 def make_api_client():
    """ Build the tweepy API client, authenticated via AppAuthHandler with API_KEY and API_SECRET """
    return tweepy.API(tweepy.AppAuthHandler(API_KEY, API_SECRET))


 if __name__ == "__main__":
    # Demo: fetch one tweet, tokenize it, then dump the tokens and a plain render

    # Tweet with some entities and special characters (Emoji, <, > and &)
    status_id = "1265545575203774465"

    # Tweet with no entities at all
    # status_id = "1379120489474498560"

    api = make_api_client()
    tweet = api.get_status(status_id, tweet_mode="extended")

    # Run tweet tokenizer
    tokens = tokenize(tweet)

    # One token per paragraph, so they are easy to tell apart
    for token in tokens:
        print(token, "\n")

    print("Simple render of tweet:")
    print(render_simple(tokens))
	"""
	The original reason I wrote this.
	Also provides a more real-world example on how to use the result.

	Uses python-telegram-bot for sending messages to Telegram.
	"""

	import html
	from typing import List

	from telegram import Bot

	from tokenizer import Kind, Token, make_api_client, tokenize

	# Token handed to Bot() in the __main__ demo below.
	# Empty placeholder here; it has to be filled in before the demo can send anything.
	TELEGRAM_TOKEN = ""


	def render_for_telegram(tokens: List[Token]) -> str:
	    """
	    Example renderer producing Telegram HTML-style markup from a list of tokens.

	    - User mentions become links to the twitter profile of the mentioned user.
	    - Regular URLs become links to the expanded URL, labelled with the display URL.
	    - Media URLs are ignored.
	    - Every other token is emitted as escaped plain text.

	    Every value that is interpolated into the markup is HTML-escaped, so that
	    characters like &, <, > or " inside a URL or name cannot break the markup.
	    See https://core.telegram.org/bots/api#html-style

	    Returns the rendered message text as a single string.
	    """

	    # Collect the pieces and join once at the end instead of growing a string
	    parts = []

	    for token in tokens:
	        if token.kind == Kind.USER_MENTION:
	            # Format user mentions using HTML-style formatting
	            screen_name = html.escape(token.data["screen_name"])
	            label = html.escape(token.text)
	            parts.append(f'<a href="https://twitter.com/{screen_name}">{label}</a>')

	        elif token.kind == Kind.URL:
	            # Instead of ugly t.co shortlinks, we can use the display_url
	            # and expanded_url to create a nice link in Telegram.
	            # html.escape also escapes quotes, which keeps the href attribute intact.
	            display_url = html.escape(token.data["display_url"])
	            expanded_url = html.escape(token.data["expanded_url"])
	            parts.append(f'<a href="{expanded_url}">{display_url}</a>')

	        elif token.kind == Kind.MEDIA:
	            # For media URLs, simply do nothing
	            continue

	        else:
	            # Make sure we escape all other text to prevent parsing errors, see link above
	            parts.append(html.escape(token.text))

	    return "".join(parts)


	if __name__ == "__main__":
	    # Demo: fetch one tweet, tokenize it, print the Telegram rendering
	    # and send the same rendering as a Telegram message.
	    api = make_api_client()

	    # Tweet with some entities and special characters (Emoji, <, > and &)
	    tweet = api.get_status("1265545575203774465", tweet_mode="extended")

	    # Tweet with a url entity
	    # tweet = api.get_status("1349440375711215618", tweet_mode="extended")

	    # Tweet with media
	    # tweet = api.get_status("903108707571171328", tweet_mode="extended")

	    tokens = tokenize(tweet)

	    # Render once and reuse the result for both the console output and the message
	    rendered = render_for_telegram(tokens)

	    print("Custom render of tweet for Telegram:")
	    print(rendered)

	    # Make Telegram bot instance
	    bot = Bot(TELEGRAM_TOKEN)

	    # Send rendered message to Telegram with HTML parse_mode enabled
	    bot.send_message(
	        chat_id="10049375",
	        text=rendered,
	        parse_mode="HTML",
	        disable_web_page_preview=True,
	    )
	from typing import List
	import html
	from dataclasses import dataclass
	from enum import Enum

	import tweepy

	# Credentials passed to tweepy.AppAuthHandler in make_api_client().
	# Both are empty placeholders here.
	API_KEY = ""
	API_SECRET = ""


	### Utility classes ###


	class Kind(Enum):
	    """
	    Describes the kind of entity/token.
	    String value follows Twitter API names.

	    Members can be looked up by value, i.e. Kind("urls") is Kind.URL.
	    """

	    TEXT = "text"  # Used for regular text between entities
	    # The remaining members are used for entities taken from the API response
	    HASHTAG = "hashtags"
	    SYMBOL = "symbols"
	    USER_MENTION = "user_mentions"
	    URL = "urls"
	    MEDIA = "media"


	@dataclass
	class Entity:
	    """ Extracted entity from Tweet JSON """

	    # Entity dict as found in tweet.entities; the code in this file reads data["indices"]
	    data: dict
	    # Which list of tweet.entities the entity came from
	    kind: Kind


	@dataclass
	class Token:
	    """ Part of the tokenized Tweet """

	    # The slice of the tweet text covered by this token
	    text: str
	    # Entity data the token was built from; tokenize() passes None for Kind.TEXT tokens
	    data: dict
	    # Kind.TEXT for regular text, otherwise the kind of the underlying entity
	    kind: Kind


	### Algorithm for tokenizing tweet ###


	def entity_sorter(entity: Entity) -> int:
	    """ Returns the first (starting) index of the entity for sorting """
	    return entity.data["indices"][0]


	def get_entities(tweet: tweepy.Status) -> List[Entity]:
	    """
	    Put all the different kinds of entities from the tweet into one list,
	    wrapped in Entity objects and sorted by their start index.
	    Each Entity object has a flag about which kind it is, depending on which list it came from.

	    Keys of tweet.entities that do not correspond to a member of Kind are
	    skipped instead of raising a ValueError, so an entity list this module
	    does not know about cannot break tokenizing.
	    """
	    all_entities = []

	    for entity_kind, entities in tweet.entities.items():
	        try:
	            kind = Kind(entity_kind)
	        except ValueError:
	            # Not an entity kind we know how to handle
	            continue

	        all_entities.extend(
	            Entity(data=entity_data, kind=kind) for entity_data in entities
	        )

	    # list.sort is stable, so entities with the same start index keep their order
	    all_entities.sort(key=entity_sorter)

	    return all_entities


	def tokenize(tweet: tweepy.Status) -> List[Token]:
	    """
	    Uses the entities in the API response to break up the tweet text into parts, called Tokens.
	    Each token has the text of that part, the original entity data from the API response
	    (None for regular text) and the kind (regular text, hashtag, mention etc.)

	    Regular text between entities is HTML-unescaped, entity text is taken
	    from the tweet text as-is.

	    Entities that start before the end of the previously emitted entity
	    (overlapping or repeated index ranges) are skipped, so that no part of
	    the tweet text ends up in the result twice.

	    Reads tweet.full_text, so the tweet has to provide that attribute.
	    """
	    tweet_text = tweet.full_text
	    entities = get_entities(tweet)

	    tokens = []
	    current_index = 0

	    for entity in entities:
	        start_index, end_index = entity.data["indices"]

	        # This range was (at least partly) covered by a previous entity already
	        if start_index < current_index:
	            continue

	        # Check if there is regular text between the last
	        # entity (or the start of the tweet) and this one
	        # and create a Token for it
	        if start_index > current_index:
	            tokens.append(
	                Token(
	                    text=html.unescape(tweet_text[current_index:start_index]),
	                    data=None,
	                    kind=Kind.TEXT,
	                )
	            )

	        # Convert Entity objects into Token objects by extracting their text
	        tokens.append(
	            Token(
	                text=tweet_text[start_index:end_index],
	                data=entity.data,
	                kind=entity.kind,
	            )
	        )

	        current_index = end_index

	    # Make sure we get the rest of the tweet as a text token if the tweet either
	    # - Doesn't end with an entity, or
	    # - Doesn't have any entities at all
	    if current_index < len(tweet_text):
	        tokens.append(
	            Token(
	                text=html.unescape(tweet_text[current_index:]),
	                data=None,
	                kind=Kind.TEXT,
	            )
	        )

	    return tokens


	### Demo ###


	def render_simple(tokens: List[Token]) -> str:
	    """
	    Most basic example of rendering text from a list of tokens:
	    concatenates the text of all tokens in list order.
	    """
	    return "".join(token.text for token in tokens)


	def make_api_client():
	    """ Set up API client for Twitter, authenticated via AppAuthHandler with API_KEY and API_SECRET """
	    auth = tweepy.AppAuthHandler(API_KEY, API_SECRET)
	    api = tweepy.API(auth)

	    return api


	if __name__ == "__main__":
	    # Demo: fetch one tweet, tokenize it, then dump the tokens and a plain render
	    api = make_api_client()

	    # Tweet with some entities and special characters (Emoji, <, > and &)
	    tweet = api.get_status("1265545575203774465", tweet_mode="extended")

	    # Tweet with no entities at all
	    # tweet = api.get_status("1379120489474498560", tweet_mode="extended")

	    # Run tweet tokenizer
	    tokens = tokenize(tweet)

	    # See what the tokens look like
	    for token in tokens:
	        print(token, "\n")

	    print("Simple render of tweet:")
	    print(render_simple(tokens))