Last active: August 2, 2019 18:59
Save sismetanin/4eac7523810b75f6939d9871f52f19c6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ekphrasis.classes.preprocessor import TextPreProcessor | |
from ekphrasis.classes.tokenizer import SocialTokenizer | |
from ekphrasis.dicts.emoticons import emoticons | |
import numpy as np | |
import re | |
import io | |
# Mapping between numeric class ids and emotion names for the
# four-class emotion task (others / happy / sad / angry).
label2emotion = {0: "others", 1: "happy", 2: "sad", 3: "angry"}
# Inverse lookup, derived from label2emotion so the two mappings
# can never drift out of sync.
emotion2label = {name: idx for idx, name in label2emotion.items()}

# Extra emoticon -> sentiment-tag replacements, applied on top of the
# standard ekphrasis emoticon dictionary (one pair per line).
emoticons_additional = {
    '(^・^)': '<happy>',
    ':‑c': '<sad>',
    '=‑d': '<happy>',
    ":'‑)": '<happy>',
    ':‑d': '<laugh>',
    ':‑(': '<sad>',
    ';‑)': '<happy>',
    ':‑)': '<happy>',
    ':\\/': '<sad>',
    'd=<': '<annoyed>',
    ':‑/': '<annoyed>',
    ';‑]': '<happy>',
    '(^�^)': '<happy>',  # NOTE(review): key looks mojibake-garbled — confirm against original gist
    'angru': 'angry',
    "d‑':": '<annoyed>',
    ":'‑(": '<sad>',
    ":‑[": '<annoyed>',
    '(�?�)': '<happy>',  # NOTE(review): key looks mojibake-garbled — confirm against original gist
    'x‑d': '<laugh>',
}
# Shared ekphrasis pre-processing pipeline used by `tokenize` below.
# It normalizes noisy social-media entities to placeholder tokens,
# annotates stylistic cues, segments/spell-corrects hashtags and
# elongated words using Twitter word statistics, and replaces
# emoticons via the two dictionaries passed in `dicts`.
text_processor = TextPreProcessor(
    # terms that will be normalized (replaced by <tag> placeholders);
    # fix: 'url' was listed twice in the original — deduplicated
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated (kept, but wrapped with <tag>...</tag>)
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens (e.g. &amp; -> &)
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    # select a tokenizer. You can use SocialTokenizer, or pass your own;
    # the tokenizer should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons, emoticons_additional]
)
def tokenize(text):
    """Run *text* through the shared ekphrasis pipeline.

    Returns the pre-processed tokens joined back into a single
    space-separated string.
    """
    tokens = text_processor.pre_process_doc(text)
    return " ".join(tokens)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.