Last active
November 23, 2020 14:47
-
-
Save vipmax/27800a7beca430adf37fef49b292a07a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Add custom words to the Russian sentiment dictionary package.

Steps performed when run as a script:

1. Extract the pickled dictionary from the ``.tar.bz2`` package.
2. Read ``word,value`` rows from a CSV file.
3. Append every word that is not in the dictionary yet.
4. Write the pickle back and re-create the package around it.
"""
import io
import pickle
import sys
import tarfile

if sys.version_info[0] < 3:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec so that
    # printing and comparing non-ASCII words does not raise.
    reload(sys)  # Reload does the trick!
    sys.setdefaultencoding('UTF8')

PACKAGE_PATH = "polyglot_data/sentiment2/ru/ru.sent.pkl.tar.bz2"
DICTIONARY_PATH = 'data/tmp/sentiment/ru/ru.sent.pkl'
NEW_WORDS_PATH = 'project/data.csv'


def extract_package(package_path):
    """Extract every member of the archive into the current directory."""
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only run this on a package obtained from a trusted source.
    with tarfile.open(package_path) as archive:
        archive.extractall()


def read_new_words(csv_path):
    """Read ``word,value`` rows from a UTF-8 CSV file.

    Returns a ``(words, values)`` pair of parallel lists; each value is a
    one-element list holding the integer from the second column.
    Blank lines are skipped instead of raising ``IndexError``.
    """
    words = []
    values = []
    with io.open(csv_path, 'r', encoding='utf8') as csv_file:
        for line in csv_file:
            if not line.strip():
                continue
            fields = line.split(',')
            words.append(fields[0])
            # int() ignores surrounding whitespace, including the newline.
            values.append([int(fields[1])])
    return words, values


def load_dictionary(pickle_path):
    """Load the pickled ``(words, values)`` pair and return it as two lists."""
    # NOTE(review): unpickling executes code embedded in the file; the
    # dictionary must come from a trusted package.
    # Pickle data is binary, so the file is opened with 'rb'.
    with open(pickle_path, 'rb') as pickle_file:
        words, values = pickle.load(pickle_file)
    return list(words), list(values)


def _to_text(word):
    """Return *word* decoded from UTF-8 when it is a byte string.

    Byte strings and text with the same content hash differently, so keys
    are normalised before they go into the lookup set.
    """
    if isinstance(word, bytes):
        try:
            return word.decode('utf-8')
        except UnicodeDecodeError:
            return word
    return word


def merge_new_words(words, values, new_words, new_values):
    """Append the new words that are not in the dictionary yet.

    The inputs are left untouched; a new ``(words, values)`` pair of lists
    is returned. A word that is already present (or repeated in
    *new_words*) is reported and skipped.
    """
    merged_words = list(words)
    merged_values = list(values)
    # A set gives constant-time membership tests instead of rescanning
    # the whole word list for every new word.
    known = set(_to_text(word) for word in merged_words)
    for word, value in zip(new_words, new_values):
        key = _to_text(word)
        if key in known:
            print('already exist word = %s' % (word,))
            continue
        print('adding  %s %s' % (word, value))
        merged_words.append(word)
        merged_values.append(value)
        known.add(key)
    return merged_words, merged_values


def save_dictionary(pickle_path, words, values):
    """Pickle the ``(words, values)`` pair to *pickle_path*."""
    # Binary mode: text mode corrupts pickles on Windows and is rejected
    # by pickle.dump on Python 3.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump((words, values), pickle_file)


def repack(package_path, pickle_path):
    """Re-create the package so that it contains only *pickle_path*."""
    # "w:bz2" actually compresses the archive, matching its .tar.bz2 name;
    # plain "w" would write an uncompressed tar.
    with tarfile.open(package_path, "w:bz2") as archive:
        archive.add(pickle_path)


def main():
    """Run the whole update against the default paths."""
    # extract dictionary package
    extract_package(PACKAGE_PATH)

    # read all new words
    new_words, new_values = read_new_words(NEW_WORDS_PATH)
    for word, value in zip(new_words, new_values):
        print('%s %s' % (word, value))

    # open dictionary
    words, values = load_dictionary(DICTIONARY_PATH)
    for word, value in zip(words, values):
        print('%s %s' % (word, value))

    # add new words to dictionary
    words, values = merge_new_words(words, values, new_words, new_values)

    # write new dictionary to pickle file
    save_dictionary(DICTIONARY_PATH, words, values)

    # add dictionary to package
    repack(PACKAGE_PATH, DICTIONARY_PATH)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the system packages polyglot builds against, then polyglot itself.
sudo apt-get install python-pip python-dev python-numpy libicu-dev -y
sudo pip install polyglot
# Download the English and Russian sentiment dictionaries.
polyglot download sentiment2.en
polyglot download sentiment2.ru
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Print the polarity of every word in a Russian sentence.
#
# The coding declaration above is required on Python 2 because this file
# contains a non-ASCII string literal.
import sys

if sys.version_info[0] < 3:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec so the
    # byte-string literal and the formatted words convert without errors.
    reload(sys)  # Reload does the trick!
    sys.setdefaultencoding('UTF8')

from polyglot.text import Text

text = Text("Нужно построить зиккурат")

# Header row followed by a separator line.
print("{:<16}{}".format("Word", "Polarity") + "\n" + "-" * 30)
for w in text.words:
    print("{:<16}{:>2}".format(w, w.polarity))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Print the polarity of every word in a Russian sentence without touching
# the interpreter's default encoding.
import sys

from polyglot.text import Text

text = Text(u"Нужно построить зиккурат")

# Header row followed by a separator line.
print("{:<16}{}".format("Word", "Polarity") + "\n" + "-" * 30)
for w in text.words:
    # Pad while the word is still text: padding the UTF-8 bytes would count
    # two bytes per Cyrillic letter and break the column alignment.
    line = u"{:<16}{:>2}".format(w, w.polarity)
    if sys.version_info[0] < 3:
        # Python 2: encode explicitly so printing also works when stdout
        # has no encoding (e.g. when piped).
        line = line.encode('utf-8')
    print(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment