Simon Jonassen s-j

😀

Better than ever

herrfz / Reuters.py

Last active October 18, 2021 19:27

Reuters-21578 keyword extraction

	# Reuters-21578 dataset downloader and parser
	#
	# Author: Eustache Diemert <eustache@diemert.fr>
	# http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
	#
	# Modified by @herrfz to build a pandas DataFrame from the original SGML
	# License: BSD 3 clause

	from __future__ import print_function

miguelmalvarez / run.py

Created March 20, 2015 09:32

Represent Reuters21578

	from nltk import word_tokenize
	from nltk.corpus import reuters
	from sklearn.feature_extraction.text import TfidfVectorizer
	from nltk.stem.porter import PorterStemmer
	import re
	from nltk.corpus import stopwords

	# Load NLTK's English stop-word list once, at import time.
	# NOTE(review): the name suggests this is reused elsewhere rather than
	# re-read per call — confirm against the (truncated) code that follows.
	cachedStopWords = stopwords.words("english")

	def tokenize(text):

bbengfort / sentiment.py

Last active December 27, 2022 05:17

An end-to-end demonstration of a Scikit-Learn SVM classifier trained on the positive and negative movie reviews corpus in NLTK.

	import os
	import time
	import string
	import pickle

	from operator import itemgetter

	from nltk.corpus import stopwords as sw
	from nltk.corpus import wordnet as wn
	from nltk import wordpunct_tokenize