{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas_profiling import ProfileReport as profile\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import scipy.stats as stats\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import string\n",
    "\n",
    "from bs4 import BeautifulSoup\n",
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer, StandardScaler\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, cross_validate\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.decomposition import IncrementalPCA as iPCA\n",
    "\n",
    "import nltk\n",
    "#nltk.download('stopwords')\n",
    "#nltk.download('wordnet')\n",
    "from nltk.util import bigrams, trigrams\n",
    "from nltk import word_tokenize\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "import pickle\n",
    "\n",
    "cachedStopWords = stopwords.words(\"english\")\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# strip HTML markup (e.g. <br /> tags in the IMDB reviews) and keep only the text\n",
    "def remove_tags(text):\n",
    "    return BeautifulSoup(text, 'html.parser').get_text()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loading and preprocessing all of the texts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = []\n",
    "labels = []\n",
    "\n",
    "for filename in os.listdir('aclImdb\\\\train\\\\pos\\\\'):\n",
    "    with open('aclImdb\\\\train\\\\pos\\\\' + filename, 'r', encoding='utf8') as f:\n",
    "        docs.append(remove_tags(f.read()))\n",
    "        labels.append(1)\n",
    "\n",
    "for filename in os.listdir('aclImdb\\\\train\\\\neg\\\\'):\n",
    "    with open('aclImdb\\\\train\\\\neg\\\\' + filename, 'r', encoding='utf8') as f:\n",
    "        docs.append(remove_tags(f.read()))\n",
    "        labels.append(0)\n",
    "\n",
    "for filename in os.listdir('aclImdb\\\\test\\\\pos\\\\'):\n",
    "    with open('aclImdb\\\\test\\\\pos\\\\' + filename, 'r', encoding='utf8') as f:\n",
    "        docs.append(remove_tags(f.read()))\n",
    "        labels.append(1)\n",
    "\n",
    "for filename in os.listdir('aclImdb\\\\test\\\\neg\\\\'):\n",
    "    with open('aclImdb\\\\test\\\\neg\\\\' + filename, 'r', encoding='utf8') as f:\n",
    "        docs.append(remove_tags(f.read()))\n",
    "        labels.append(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dataset is not exactly small, so the preprocessing takes a while. To avoid recomputing it every time, the results can be pickled."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dump(obj, filename):\n",
    "    with open(filename, 'wb') as file:\n",
    "        pickle.dump(obj, file)\n",
    "\n",
    "def load(filename):\n",
    "    with open(filename, 'rb') as file:\n",
    "        obj = pickle.load(file)\n",
    "    return obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "dump(docs, 'docs.dump')\n",
    "dump(labels, 'labels.dump')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "They can be unpickled later like this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = load('docs.dump')\n",
    "labels = load('labels.dump')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "positive_emoticons = [':)', '=)', '(=', '(:', ':‑)', 'xD', ':D', '=D', ';)', '<3', ':3']\n",
    "negative_emoticons = [':(', '=(', 'x_x', ':|']\n",
    "with open('bad-words.txt', 'r') as bw_file:\n",
    "    bad_words = [w for w in bw_file.read().split('\\n')]\n",
    "\n",
    "# check if char is a letter or a space\n",
    "def check_char(c):\n",
    "    return (c in string.ascii_lowercase) or \\\n",
    "           (c in string.ascii_uppercase) or \\\n",
    "           (c == ' ')\n",
    "\n",
    "# total number of occurrences of any of the tokens in the text\n",
    "def count(text, tokens):\n",
    "    cnt = 0\n",
    "    for token in tokens:\n",
    "        cnt += text.count(token)\n",
    "    return cnt\n",
    "\n",
    "def extract_features(text):\n",
    "    pos_emo_cnt = count(text, positive_emoticons)\n",
    "    neg_emo_cnt = count(text, negative_emoticons)\n",
    "    excl_cnt = text.count('!')\n",
    "    question_cnt = text.count('?')\n",
    "    bad_words_cnt = count(text, bad_words)\n",
    "    \n",
    "    return pos_emo_cnt, neg_emo_cnt, excl_cnt, question_cnt, bad_words_cnt\n",
    "\n",
    "def tokenize(text):\n",
    "    t_filtered = ''.join([c.lower() for c in text if check_char(c)])\n",
    "    words = t_filtered.split(' ')\n",
    "    \n",
    "    wn_lemmatizer = WordNetLemmatizer()\n",
    "    \n",
    "    tokens = list(map(lambda token: wn_lemmatizer.lemmatize(token),\n",
    "                      words))\n",
    "    \n",
    "    bigrams_ = [' '.join(p) for p in bigrams(tokens)]\n",
    "    trigrams_ = [' '.join(p) for p in trigrams(tokens)]\n",
    "    \n",
    "    return tokens + bigrams_ + trigrams_"
   ]
  },
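  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `tokenize` on a made-up sentence (not a document from the corpus): it lowercases, drops everything but letters and spaces, lemmatizes, and appends bigrams and trigrams. Illustrative only; it assumes the NLTK WordNet data downloaded above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: a made-up sentence, not taken from the dataset\n",
    "tokenize('This movie was surprisingly good and I loved every minute of it')[:12]"
   ]
  },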
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "docs_train, docs_test, labels_train, labels_test = train_test_split(docs, labels, test_size=0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tf_idf(docs):\n",
    "    tfidf = TfidfVectorizer(tokenizer=tokenize,\n",
    "                            min_df=3,\n",
    "                            max_df=0.90,\n",
    "                            max_features=3000,\n",
    "                            use_idf=True,\n",
    "                            sublinear_tf=True,\n",
    "                            norm='l2',\n",
    "                            stop_words=cachedStopWords)\n",
    "    tfidf.fit(docs)\n",
    "    return tfidf\n",
    "\n",
    "def feature_values(doc, representer):\n",
    "    doc_representation = representer.transform([doc])\n",
    "    features = representer.get_feature_names()\n",
    "    return [(features[index], doc_representation[0, index])\n",
    "            for index in doc_representation.nonzero()[1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#representer = tf_idf(docs_train)\n",
    "#dump(representer, 'tfidf.dump')\n",
    "\n",
    "representer = load('tfidf.dump')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It can be pickled too :)"
   ]
  },
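  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With the fitted vectorizer loaded, `feature_values` (defined above but not exercised anywhere else in the notebook) can be used to peek at the non-zero tf-idf weights of a single review. A minimal illustrative sketch, assuming `docs_train` and `representer` from the cells above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative only: the ten heaviest tf-idf features of one training review\n",
    "sorted(feature_values(docs_train[0], representer), key=lambda fv: -fv[1])[:10]"
   ]
  },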
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#vec_docs_train = representer.transform(docs_train)\n",
    "#vec_docs_test = representer.transform(docs_test)\n",
    "#dump(vec_docs_train, 'vec_docs_train.dump')\n",
    "#dump(vec_docs_test, 'vec_docs_test.dump')\n",
    "\n",
    "vec_docs_train = load('vec_docs_train.dump')\n",
    "vec_docs_test = load('vec_docs_test.dump')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9386397948446905\n",
      "CPU times: user 27min 5s, sys: 2min 25s, total: 29min 31s\n",
      "Wall time: 8min 5s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "n = vec_docs_train.shape[0]\n",
    "\n",
    "# partial_fit requires every batch to contain at least n_components samples,\n",
    "# so the chunk size is set slightly above it; the short final chunk is skipped\n",
    "n_components = 2400\n",
    "chunk_size = n_components + 100\n",
    "\n",
    "ipca = iPCA(n_components=n_components)\n",
    "\n",
    "vec_docs_train_arr = vec_docs_train.toarray()\n",
    "vec_docs_test_arr = vec_docs_test.toarray()\n",
    "\n",
    "for i in range(0, n // chunk_size):\n",
    "    ipca.partial_fit(vec_docs_train_arr[i*chunk_size : (i+1)*chunk_size])\n",
    "    \n",
    "print(np.sum(ipca.explained_variance_ratio_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(35000, 2400) (15000, 2400)\n",
      "CPU times: user 2min 32s, sys: 1.42 s, total: 2min 34s\n",
      "Wall time: 39.5 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "vec_docs_train_ipca = ipca.transform(vec_docs_train_arr)\n",
    "vec_docs_test_ipca = ipca.transform(vec_docs_test_arr)\n",
    "\n",
    "print(np.shape(vec_docs_train_ipca), np.shape(vec_docs_test_ipca))"
   ]
  },
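  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The imports at the top bring in `SGDClassifier` and `f1_score`, but the notebook stops after the dimensionality reduction. Below is a minimal sketch of how the reduced features could be fed to a linear classifier; it is not part of the original pipeline, and the (default) hyperparameters are arbitrary and untuned."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative sketch: train a linear classifier on the PCA-reduced tf-idf features\n",
    "clf = SGDClassifier(random_state=42)\n",
    "clf.fit(vec_docs_train_ipca, labels_train)\n",
    "\n",
    "# evaluate on the held-out split with the F1 score imported above\n",
    "print(f1_score(labels_test, clf.predict(vec_docs_test_ipca)))"
   ]
  }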
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}