Created
April 9, 2021 00:07
-
-
Save jh0ker/6aaf581c484a2008e03bea91df012396 to your computer and use it in GitHub Desktop.
Tokenize Tweets using entities provided by Twitter API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
The original reason I wrote this. | |
Also provides a more real-world example on how to use the result. | |
Uses python-telegram-bot for sending messages to Telegram. | |
""" | |
import html | |
from typing import List | |
from telegram import Bot | |
from tokenizer import Kind, Token, make_api_client, tokenize | |
TELEGRAM_TOKEN = "" | |
def render_for_telegram(tokens: List[Token]) -> str:
    """
    Example renderer that produces Telegram HTML-style markup from tokens.

    Media tokens are dropped, URL tokens become links that show the display
    URL and point at the expanded URL, and user mentions link to the Twitter
    profile of the mentioned user. All other tokens are emitted as escaped text.

    Every value interpolated into the markup is passed through html.escape,
    including the ones placed inside href attributes, so that characters such
    as &, <, > or quotes in the entity data cannot break the markup.

    Takes the tokens in tweet order and returns the message text, meant to be
    sent with parse_mode="HTML".
    """
    parts = []

    for token in tokens:
        if token.kind == Kind.USER_MENTION:
            # Format user mentions using HTML-style formatting
            # See https://core.telegram.org/bots/api#html-style
            screen_name = html.escape(token.data["screen_name"])
            label = html.escape(token.text)
            parts.append(f'<a href="https://twitter.com/{screen_name}">{label}</a>')
        elif token.kind == Kind.URL:
            # Instead of ugly t.co shortlinks, we can use the display_url
            # and expanded_url to create a nice link in Telegram
            display_url = html.escape(token.data["display_url"])
            expanded_url = html.escape(token.data["expanded_url"])
            parts.append(f'<a href="{expanded_url}">{display_url}</a>')
        elif token.kind == Kind.MEDIA:
            # For media URLs, simply emit nothing
            continue
        else:
            # Make sure we escape all other text to prevent parsing errors, see link above
            parts.append(html.escape(token.text))

    # Join once at the end instead of growing a string inside the loop
    return "".join(parts)
if __name__ == "__main__":
    api = make_api_client()

    # Tweet with some entities and special characters (Emoji, <, > and &)
    tweet = api.get_status("1265545575203774465", tweet_mode="extended")

    # Tweet with a url entity
    # tweet = api.get_status("1349440375711215618", tweet_mode="extended")

    # Tweet with media
    # tweet = api.get_status("903108707571171328", tweet_mode="extended")

    tokens = tokenize(tweet)

    # Render once, so the text printed here is exactly the text that is sent
    message = render_for_telegram(tokens)

    print("Custom render of tweet for Telegram:")
    print(message)

    # Make Telegram bot instance
    bot = Bot(TELEGRAM_TOKEN)

    # Send rendered message to Telegram with HTML parse_mode enabled
    bot.send_message(
        chat_id="10049375",
        text=message,
        parse_mode="HTML",
        disable_web_page_preview=True,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
import html | |
from dataclasses import dataclass | |
from enum import Enum | |
import tweepy | |
API_KEY = "" | |
API_SECRET = "" | |
### Utility classes ### | |
class Kind(Enum):
    """
    The kind of an entity or token.
    The string values follow the names used by the Twitter API.
    """

    # Not an API entity: marks the regular text between entities
    TEXT = "text"

    HASHTAG = "hashtags"
    SYMBOL = "symbols"
    USER_MENTION = "user_mentions"
    URL = "urls"
    MEDIA = "media"
@dataclass
class Entity:
    """ A single entity taken from the Tweet JSON, tagged with its kind """

    # Entity data as found in the API response; its "indices" entry is read
    # when sorting entities and when slicing the tweet text
    data: dict
    # Which entity list this entity came from
    kind: Kind
@dataclass
class Token:
    """ One piece of a tokenized Tweet """

    # The slice of the tweet text this token covers
    text: str
    # Entity data from the API response; tokenize() passes None for TEXT tokens
    data: dict
    # Regular text, hashtag, mention etc.
    kind: Kind
### Algorithm for tokenizing tweet ### | |
def entity_sorter(entity: Entity) -> int:
    """ Sort key for entities: the first (starting) index of the entity """
    indices = entity.data["indices"]
    return indices[0]
def get_entities(tweet: tweepy.Status) -> List[Entity]:
    """
    Put all the different kinds of entities from the tweet into one list,
    wrapped in Entity objects and sorted by their start index.
    Each Entity object has a flag about which kind it is, depending on which list it came from.

    Entity lists whose name is not a value of Kind are skipped instead of
    raising ValueError, so an entity type this module does not model cannot
    abort the whole tokenization.
    """
    all_entities = []

    for entity_kind, entities in tweet.entities.items():
        # Resolve the kind once per list rather than once per entity
        try:
            kind = Kind(entity_kind)
        except ValueError:
            # Unknown entity list: leave it out rather than fail
            continue

        all_entities.extend(
            Entity(data=entity_data, kind=kind) for entity_data in entities
        )

    all_entities.sort(key=entity_sorter)
    return all_entities
def tokenize(tweet: tweepy.Status) -> List[Token]:
    """
    Uses the entities in the API response to break up the tweet text into parts, called Tokens.
    Each token has the text of that part, the original entity data from the API response
    and the kind (regular text, hashtag, mention etc.)

    Text between entities becomes a TEXT token with data=None and with HTML
    character references unescaped. Entity tokens keep the raw slice of the
    tweet text.

    Entities are processed in order of their start index. If an entity starts
    before the end of the previous one (duplicate or overlapping indices), no
    empty TEXT token is emitted for it and the cursor never moves backwards,
    so text that was already consumed is not emitted again as a TEXT token.
    """
    tweet_text = tweet.full_text
    entities = get_entities(tweet)

    tokens = []
    current_index = 0

    for entity in entities:
        start_index, end_index = entity.data["indices"]

        # Check if there is regular text between the last
        # entity (or the start of the tweet) and this one
        # and create a Token for it.
        # "<" rather than "!=": an entity starting before the cursor has no
        # gap in front of it, and slicing backwards would yield an empty token
        if current_index < start_index:
            tokens.append(
                Token(
                    text=html.unescape(tweet_text[current_index:start_index]),
                    data=None,
                    kind=Kind.TEXT,
                )
            )

        # Convert Entity objects into Token objects by extracting their text
        tokens.append(
            Token(
                text=tweet_text[start_index:end_index],
                data=entity.data,
                kind=entity.kind,
            )
        )

        # Never rewind: an entity nested inside the previous one must not
        # cause the text after it to be emitted a second time
        current_index = max(current_index, end_index)

    # Make sure we get the rest of the tweet as a text token if the tweet either
    # - Doesn't end with an entity, or
    # - Doesn't have any entities at all
    if current_index < len(tweet_text):
        tokens.append(
            Token(
                text=html.unescape(tweet_text[current_index:]),
                data=None,
                kind=Kind.TEXT,
            )
        )

    return tokens
### Demo ### | |
def render_simple(tokens: List[Token]) -> str:
    """
    The most basic way to render a list of tokens back into text:
    the text of every token, concatenated in order.
    Should always re-create the original text of the tweet.
    """
    pieces = [token.text for token in tokens]
    return "".join(pieces)
def make_api_client():
    """ Create the Twitter API client, authenticated with API_KEY and API_SECRET """
    return tweepy.API(tweepy.AppAuthHandler(API_KEY, API_SECRET))
if __name__ == "__main__":
    api = make_api_client()

    # Example: a tweet with some entities and special characters (Emoji, <, > and &)
    tweet = api.get_status("1265545575203774465", tweet_mode="extended")

    # Example: a tweet with no entities at all
    # tweet = api.get_status("1379120489474498560", tweet_mode="extended")

    # Break the tweet up into tokens
    tokens = tokenize(tweet)

    # Dump every token, each followed by a blank line
    for part in tokens:
        print(part, "\n")

    # Put the tokens back together into plain text
    print("Simple render of tweet:")
    print(render_simple(tokens))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment