print('start')

import pandas as pd
from sklearn.linear_model import LogisticRegression          # import the logistic regression model
from sklearn.feature_extraction.text import CountVectorizer  # import the bag-of-words feature extractor

# Read the files and drop the columns that are not needed
df_train = pd.read_csv('/Users/apple/Documents/ST/python/competition/DataCastel/text_intelligence/new_data/train_set.csv')
df_test = pd.read_csv('/Users/apple/Documents/ST/python/competition/DataCastel/text_intelligence/new_data/test_set.csv')
df_train.drop(columns=['article', 'id'], inplace=True)  # Q1: why drop these two columns? 'id' carries no signal and 'article' is not used, so both are removed
df_test.drop(columns=['article'], inplace=True)

# Build the feature vectors
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, max_features=100000)  # extract unigram and bigram count features
vectorizer.fit(df_train['word_seg'])                  # Q2: why fit on this column? the vectorizer must first learn the vocabulary and document frequencies (DF) from the whole training corpus
x_train = vectorizer.transform(df_train['word_seg'])  # convert the text into sparse feature vectors
x_test = vectorizer.transform(df_test['word_seg'])
y_train = df_train['class'] - 1                       # Q3: why subtract 1 from every class label? it is a coding convention so that classes are counted from 0

lg = LogisticRegression(C=4, dual=True, solver='liblinear')  # initialize logistic regression; dual=True requires the liblinear solver
lg.fit(x_train, y_train)                              # train; the fitted model is stored in lg
y_test = lg.predict(x_test)                           # use the model to predict the test set

df_test['class'] = y_test.tolist()                    # convert the predictions to a list and store them as the test set's class. Q5: the test set has no 'class' column; this assignment automatically adds one
df_test['class'] = df_test['class'] + 1               # Q4: why add 1 back to every class label? to undo the earlier shift and restore the original class numbering
df_result = df_test.loc[:, ['id', 'class']]           # keep only the 'id' and 'class' columns from the test set
df_result.to_csv('/Users/apple/Documents/ST/python/competition/DataCastel/text_intelligence/new_data/result.csv', index=False)  # write the results in the submission CSV format
print('end')
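
The ngram_range=(1, 2) setting above means each document is represented by counts of both single tokens and adjacent token pairs, while min_df and max_df prune very rare and very common terms. Below is a minimal sketch of that behavior on a made-up toy corpus (the token strings are invented for illustration and only assume the same space-separated format as the 'word_seg' column); the frequency cut-offs are omitted so every term survives.

# Illustration only: what ngram_range=(1, 2) produces on a tiny toy corpus
toy_docs = ['12 34 56', '12 34 78', '56 78 90']
toy_vec = CountVectorizer(ngram_range=(1, 2))
toy_matrix = toy_vec.fit_transform(toy_docs)
print(sorted(toy_vec.vocabulary_))  # unigrams ('12', '34', ...) plus bigrams ('12 34', '34 56', ...)
print(toy_matrix.toarray())         # one row per document, one column per term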
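
Since the script trains on the full training set and submits blind predictions, a quick hold-out evaluation can help sanity-check the C and n-gram settings before writing result.csv. This is a minimal sketch, not part of the original gist: it reuses x_train and y_train from above, and macro-F1 is only an assumed stand-in for whatever metric the competition actually uses.

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Hold out 10% of the training documents for validation.
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

val_lg = LogisticRegression(C=4, dual=True, solver='liblinear')
val_lg.fit(x_tr, y_tr)
val_pred = val_lg.predict(x_val)
print('validation macro-F1:', f1_score(y_val, val_pred, average='macro'))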