The use of TF-IDF and LinearSVC is copied nearly verbatim from the scikit-learn text-analysis tutorial, applied here to about 5,000 columns gathered across 11 NYT columnists (for example, Maureen Dowd's columns as listed on /column/maureen-dowd).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle=True)
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)
sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)
# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted,
      target_names=sh_dataset.target_names))

Initial results:
                   precision    recall  f1-score   support
   charles-m-blow       0.99      0.94      0.96        81
     david-brooks       0.98      0.98      0.98       169
      frank-bruni       1.00      0.98      0.99        64
     gail-collins       0.99      0.98      0.98       167
       joe-nocera       0.95      0.95      0.95        76
     maureen-dowd       0.95      0.98      0.96       125
 nicholas-kristof       0.93      0.96      0.95       134
     paul-krugman       0.98      0.99      0.98       157
      roger-cohen       0.99      0.99      0.99       115
     ross-douthat       1.00      0.94      0.97        49
thomas-l-friedman       0.98      0.98      0.98       126
      avg / total       0.97      0.97      0.97      1263
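Because random_state=None, each run draws a different 75/25 split, so the numbers above will wobble a bit from run to run. One quick sanity check (not part of the original tutorial; the 5-fold count is an arbitrary choice) is to cross-validate the same pipeline:

from sklearn.model_selection import cross_val_score
# score the same TF-IDF + LinearSVC pipeline across 5 random folds
scores = cross_val_score(sh_pipeline, sh_dataset.data, sh_dataset.target, cv=5)
print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))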
import numpy as np
# pull the fitted vectorizer and classifier back out of the pipeline
clf = sh_pipeline.named_steps['clf']
vect = sh_pipeline.named_steps['vect']
feature_names = vect.get_feature_names_out()
class_labels = sh_dataset.target_names
for i, class_label in enumerate(class_labels):
    # the 20 features with the largest coefficients for this class
    topt = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label,
          " ".join(feature_names[j] for j in topt)))

Results:
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today
Let's just do Naive Bayes with a plain old bag of words, keeping only words that appear in at least 50% of the documents:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.5)),  # keep only words in at least 50% of documents
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted,
      target_names=dataset.target_names))

Precision metrics, then the most informative features. Not super accurate, yet surprisingly accurate given how little the classifier has to work with:
                   precision    recall  f1-score   support
   charles-m-blow       0.59      0.58      0.58        78
     david-brooks       0.78      0.61      0.68       199
      frank-bruni       0.71      0.63      0.67        75
     gail-collins       0.77      0.74      0.76       158
       joe-nocera       0.64      0.63      0.63        70
     maureen-dowd       0.57      0.74      0.65       121
 nicholas-kristof       0.84      0.75      0.79       115
     paul-krugman       0.76      0.81      0.78       153
      roger-cohen       0.60      0.73      0.66       112
     ross-douthat       0.71      0.59      0.64        61
thomas-l-friedman       0.69      0.77      0.73       121
      avg / total       0.71      0.70      0.70      1263
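The snippet that produced the per-author lists below isn't shown; presumably it mirrors the coefficient loop from the LinearSVC model. For MultinomialNB the analogous attribute is feature_log_prob_ (the per-class log probabilities of each word), so a minimal sketch might look like:

import numpy as np
nb = pipeline.named_steps['clf']
bow = pipeline.named_steps['vect']
words = bow.get_feature_names_out()
for i, author in enumerate(dataset.target_names):
    # the 20 words with the highest log probability for this author
    top = np.argsort(nb.feature_log_prob_[i])[-20:]
    print("%s: %s" % (author, " ".join(words[j] for j in top)))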
charles-m-blow: they we have with but be was are on this as for it is that in to and of the
david-brooks: as be with this you on have for he but they are it is that in and of to the
frank-bruni: at we be they but was is his as with on for it he in that to of and the
gail-collins: with have his we this who be you on he was it for is that and in of to the
joe-nocera: but his be has with had they on as for was he it is and in that of to the
maureen-dowd: at not be you who for as with was is his on it he that in of and to the
nicholas-kristof: by be have he was we with are on as but it for is that in of and to the
paul-krugman: with has they was this are be have as on but for it is in and that of to the
roger-cohen: an this be but he was not as has with on for it that is in and to of the
ross-douthat: was by are have this more with be on as but is it for that in to of and the
thomas-l-friedman: they you this not are be have but on with we for it is that in of to and the
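No surprise that those "informative" words are nearly all function words: with min_df=0.5, a word has to appear in at least half of the roughly 5,000 columns to make it into the vocabulary, and little besides stopwords does. A quick check of the surviving vocabulary (not part of the original code):

bow = pipeline.named_steps['vect']
# min_df=0.5 shrinks the vocabulary to the handful of words that
# appear in at least 50% of all documents
print("vocabulary size: %d" % len(bow.vocabulary_))
print(sorted(bow.vocabulary_.keys()))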