Created
April 3, 2017 09:57
-
-
Save GINK03/28630674405219fe152fb4d5e92557cb to your computer and use it in GitHub Desktop.
char level 絵文字を意識したfasttextによるベクトル化
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cut-off for the input scan in make_char(): the loop stops as soon as the
# enumerate index of the current input file exceeds this value, so at most
# DATA_SIZE + 1 files are examined.
DATA_SIZE = 300000
def _make_char_index():
    """Build a token-to-index table from ``./char_level.txt`` and exit.

    The corpus is split on whitespace. Every token that does not start with
    an emoji and has not been seen before is given the next free integer id
    (ids start at 0, in first-seen order) and is echoed to stdout. The
    resulting dict is pickled to ``char_index.pkl`` in the current directory.

    This function does not return: it terminates the process through
    ``sys.exit()`` once the pickle has been written.
    """
    # Compiled once; the pattern is identical for every token.
    emoji = re.compile(u'['
                       u'\U0001F300-\U0001F5FF'
                       u'\U0001F600-\U0001F64F'
                       u'\U0001F680-\U0001F6FF'
                       u'\u2600-\u26FF\u2700-\u27BF]+',
                       re.UNICODE)

    # The corpus contains emoji, so decode it as UTF-8 explicitly instead of
    # relying on the locale default; the handle is closed right after reading.
    with open('./char_level.txt', 'r', encoding='utf-8') as corpus:
        tokens = corpus.read().replace('\n', ' ').split()

    char_index = {}
    for char in tokens:
        # match() anchors at the start, so any token that begins with an
        # emoji is left out of the index.
        if emoji.match(char) is not None:
            continue
        if char in char_index:
            continue
        char_index[char] = len(char_index)
        print(char)

    with open('char_index.pkl', 'wb') as out:
        out.write(pickle.dumps(char_index))
    sys.exit()
def make_char():
    """Write the character-level corpus and train a fastText skipgram model.

    Steps, all relative to the current working directory:

    1. Recreate an empty ``./dataset`` directory.
    2. Walk the files matched by ``../out20170325/*`` (stopping once the
       enumerate index exceeds ``DATA_SIZE``), parse each one as JSON and
       read its ``'txt'`` field. Files that cannot be read or parsed are
       skipped, as are texts that contain no emoji.
    3. For every remaining text, write one line to ``char_level.txt``: the
       emoji runs found in the text first, followed by every other character
       of the text (spaces, newlines and emoji removed), all separated by
       single spaces.
    4. Run ``./fasttext skipgram`` on ``char_level.txt``.

    A parsed record without a ``'txt'`` key raises ``KeyError``.
    """
    os.system('rm -rf ./dataset')
    os.system('mkdir dataset')

    # Compiled once; the pattern does not depend on the file being processed.
    emoji = re.compile(u'['
                       u'\U0001F300-\U0001F5FF'
                       u'\U0001F600-\U0001F64F'
                       u'\U0001F680-\U0001F6FF'
                       u'\u2600-\u26FF\u2700-\u27BF]+',
                       re.UNICODE)

    # The corpus must be flushed and closed before fastText reads it, hence
    # the context manager. UTF-8 is forced because the lines contain emoji.
    with open('char_level.txt', 'w', encoding='utf-8') as f:
        for ni, name in enumerate(glob.glob('../out20170325/*')):
            if ni % 10000 == 0:
                print("iter %d" % ni, file=sys.stderr)
            if ni > DATA_SIZE:
                break
            try:
                with open(name, encoding='utf-8') as src:
                    obj = json.loads(src.read())
            except (OSError, ValueError):
                # Unreadable, undecodable or malformed JSON: skip the file.
                # (JSON and Unicode decode errors are ValueError subclasses.)
                continue
            text = obj['txt']

            # Each match is a run of one or more consecutive emoji.
            emojis = emoji.findall(text)
            if not emojis:
                continue

            # Drop emoji, spaces and newlines, then emit what is left one
            # character at a time. The removals are applied cumulatively;
            # substituting on the untouched text each time would keep only
            # the effect of the last pattern.
            no_emoji = emoji.sub('', text)
            emojis.extend(ch for ch in no_emoji if ch not in (' ', '\n'))
            f.write("%s\n" % ' '.join(emojis))

    # Train 256-dimensional skipgram vectors, keeping tokens that occur once.
    os.system("./fasttext skipgram -input char_level.txt -output model -dim 256 -minCount 1")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment