The use of TF-IDF and LinearSVC is copied nearly verbatim from the scikit-learn text-analysis tutorial, applied here to about 5,000 columns gathered across 11 NYT columnists (for example, Maureen Dowd's columns as listed on /column/maureen-dowd).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle=True)
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)
sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)
# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted,
      target_names=sh_dataset.target_names))

Initial results:
                   precision    recall  f1-score   support
   charles-m-blow       0.99      0.94      0.96        81
     david-brooks       0.98      0.98      0.98       169
      frank-bruni       1.00      0.98      0.99        64
     gail-collins       0.99      0.98      0.98       167
       joe-nocera       0.95      0.95      0.95        76
     maureen-dowd       0.95      0.98      0.96       125
 nicholas-kristof       0.93      0.96      0.95       134
     paul-krugman       0.98      0.99      0.98       157
      roger-cohen       0.99      0.99      0.99       115
     ross-douthat       1.00      0.94      0.97        49
thomas-l-friedman       0.98      0.98      0.98       126
      avg / total       0.97      0.97      0.97      1263
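Because random_state=None, each run draws a different 75/25 split, so the numbers above will wobble a bit from run to run. One quick sanity check (not part of the original tutorial; the 5-fold count is an arbitrary choice) is to cross-validate the same pipeline:

from sklearn.model_selection import cross_val_score
# score the same TF-IDF + LinearSVC pipeline across 5 random folds
scores = cross_val_score(sh_pipeline, sh_dataset.data, sh_dataset.target, cv=5)
print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))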
import numpy as np
# pull the fitted vectorizer and classifier back out of the pipeline
clf = sh_pipeline.named_steps['clf']
vect = sh_pipeline.named_steps['vect']
feature_names = vect.get_feature_names_out()
class_labels = sh_dataset.target_names
for i, class_label in enumerate(class_labels):
    # the 20 features with the largest coefficients for this class
    topt = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label,
          " ".join(feature_names[j] for j in topt)))

Results:
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today
Let's just do Naive Bayes with a plain old bag of words, keeping only words that appear in at least 50% of the documents:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.5)),  # keep only words in at least 50% of documents
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)
y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(y_test, y_predicted,
      target_names=dataset.target_names))

Precision metrics, then the most informative features. Not super accurate, yet surprisingly accurate given how little the classifier has to work with:
                   precision    recall  f1-score   support
   charles-m-blow       0.59      0.58      0.58        78
     david-brooks       0.78      0.61      0.68       199
      frank-bruni       0.71      0.63      0.67        75
     gail-collins       0.77      0.74      0.76       158
       joe-nocera       0.64      0.63      0.63        70
     maureen-dowd       0.57      0.74      0.65       121
 nicholas-kristof       0.84      0.75      0.79       115
     paul-krugman       0.76      0.81      0.78       153
      roger-cohen       0.60      0.73      0.66       112
     ross-douthat       0.71      0.59      0.64        61
thomas-l-friedman       0.69      0.77      0.73       121
      avg / total       0.71      0.70      0.70      1263
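The snippet that produced the per-author lists below isn't shown; presumably it mirrors the coefficient loop from the LinearSVC model. For MultinomialNB the analogous attribute is feature_log_prob_ (the per-class log probabilities of each word), so a minimal sketch might look like:

import numpy as np
nb = pipeline.named_steps['clf']
bow = pipeline.named_steps['vect']
words = bow.get_feature_names_out()
for i, author in enumerate(dataset.target_names):
    # the 20 words with the highest log probability for this author
    top = np.argsort(nb.feature_log_prob_[i])[-20:]
    print("%s: %s" % (author, " ".join(words[j] for j in top)))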
charles-m-blow: they we have with but be was are on this as for it is that in to and of the
david-brooks: as be with this you on have for he but they are it is that in and of to the
frank-bruni: at we be they but was is his as with on for it he in that to of and the
gail-collins: with have his we this who be you on he was it for is that and in of to the
joe-nocera: but his be has with had they on as for was he it is and in that of to the
maureen-dowd: at not be you who for as with was is his on it he that in of and to the
nicholas-kristof: by be have he was we with are on as but it for is that in of and to the
paul-krugman: with has they was this are be have as on but for it is in and that of to the
roger-cohen: an this be but he was not as has with on for it that is in and to of the
ross-douthat: was by are have this more with be on as but is it for that in to of and the
thomas-l-friedman: they you this not are be have but on with we for it is that in of to and the
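No surprise that those "informative" words are nearly all function words: with min_df=0.5, a word has to appear in at least half of the roughly 5,000 columns to make it into the vocabulary, and little besides stopwords does. A quick check of the surviving vocabulary (not part of the original code):

bow = pipeline.named_steps['vect']
# min_df=0.5 shrinks the vocabulary to the handful of words that
# appear in at least 50% of all documents
print("vocabulary size: %d" % len(bow.vocabulary_))
print(sorted(bow.vocabulary_.keys()))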