spiderChow · October 21, 2018 01:45
diff --git a/Feature Extraction.py b/Feature Extraction.py

 """Text feature extraction with scikit-learn.

 The sklearn.feature_extraction.text submodule gathers utilities to build
 feature vectors from text documents:

 * CountVectorizer: convert a collection of text documents to a matrix of
   token counts.
 * HashingVectorizer: convert a collection of text documents to a matrix of
   token occurrences.
 * TfidfTransformer: transform a count matrix to a normalized tf or tf-idf
   representation.
 * TfidfVectorizer: convert a collection of raw documents to a matrix of
   TF-IDF features.
 """

 ## CountVectorizer
 # Convert a collection of text documents to a matrix of token counts.
 # http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
 from sklearn.feature_extraction.text import CountVectorizer

 vectorizer = CountVectorizer()
 corpus = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?',
 ]
 # Learn the vocabulary and build the document-term count matrix in one step.
 X = vectorizer.fit_transform(corpus)
 # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
 # prefer get_feature_names_out() and fall back only on older releases.
 # Note the new method returns an ndarray of strings rather than a list.
 feature_names = (
     vectorizer.get_feature_names_out()
     if hasattr(vectorizer, 'get_feature_names_out')
     else vectorizer.get_feature_names()
 )
 # Dense copy of the count matrix, kept in a name so it can be inspected.
 counts = X.toarray()

 ## TfidfTransformer
 from sklearn.feature_extraction.text import TfidfTransformer

 # NOTE: the transformer is only constructed here; this script never fits it
 # or applies it to X.
 transformer = TfidfTransformer(smooth_idf=False)

 ## TfidfVectorizer combines all the options of CountVectorizer and
 ## TfidfTransformer in a single model.
 from sklearn.feature_extraction.text import TfidfVectorizer

 # Rebinds `vectorizer`: from here on it is the TfidfVectorizer, not the
 # CountVectorizer created above.
 vectorizer = TfidfVectorizer()
 tfidf = vectorizer.fit_transform(corpus)

	"""Text feature extraction with scikit-learn.

	The sklearn.feature_extraction.text submodule gathers utilities to build
	feature vectors from text documents:

	* CountVectorizer: convert a collection of text documents to a matrix of
	  token counts.
	* HashingVectorizer: convert a collection of text documents to a matrix of
	  token occurrences.
	* TfidfTransformer: transform a count matrix to a normalized tf or tf-idf
	  representation.
	* TfidfVectorizer: convert a collection of raw documents to a matrix of
	  TF-IDF features.
	"""

	## CountVectorizer
	# Convert a collection of text documents to a matrix of token counts.
	# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
	from sklearn.feature_extraction.text import CountVectorizer

	vectorizer = CountVectorizer()
	corpus = [
	    'This is the first document.',
	    'This is the second second document.',
	    'And the third one.',
	    'Is this the first document?',
	]
	# Learn the vocabulary and build the document-term count matrix in one step.
	X = vectorizer.fit_transform(corpus)
	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
	# prefer get_feature_names_out() and fall back only on older releases.
	# Note the new method returns an ndarray of strings rather than a list.
	feature_names = (
	    vectorizer.get_feature_names_out()
	    if hasattr(vectorizer, 'get_feature_names_out')
	    else vectorizer.get_feature_names()
	)
	# Dense copy of the count matrix, kept in a name so it can be inspected.
	counts = X.toarray()

	## TfidfTransformer
	from sklearn.feature_extraction.text import TfidfTransformer

	# NOTE: the transformer is only constructed here; this script never fits it
	# or applies it to X.
	transformer = TfidfTransformer(smooth_idf=False)

	## TfidfVectorizer combines all the options of CountVectorizer and
	## TfidfTransformer in a single model.
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Rebinds `vectorizer`: from here on it is the TfidfVectorizer, not the
	# CountVectorizer created above.
	vectorizer = TfidfVectorizer()
	tfidf = vectorizer.fit_transform(corpus)
No results found