Timer unit: 1e-06 s

File: sklearn/feature_extraction/text.py
Function: fit_transform at line 290
Total time: 16.3795 s

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   290                                           def fit_transform(self, raw_documents, y=None):
   291                                               """Learn the vocabulary dictionary and return the count vectors
   292
   293                                               This is more efficient than calling fit followed by transform.
   294
   295                                               Parameters
   296                                               ----------
   297                                               raw_documents: iterable
   298                                                   an iterable which yields either str, unicode or file objects
   299
   300                                               Returns
   301                                               -------
   302                                               vectors: array, [n_samples, n_features]
   303                                               """
   304         1            9      9.0      0.0      if not self.fit_vocabulary:
   305                                                   return self.transform(raw_documents)
   306
   307                                               # result of document conversion to term count dicts
   308         1            8      8.0      0.0      term_counts_per_doc = []
   309         1           32     32.0      0.0      term_counts = Counter()
   310
   311                                               # term counts across entire corpus (count each term maximum once per
   312                                               # document)
   313         1           21     21.0      0.0      document_counts = Counter()
   314
   315         1            6      6.0      0.0      max_df = self.max_df
   316         1            7      7.0      0.0      max_features = self.max_features
   317
   318                                               # TODO: parallelize the following loop with joblib?
   319                                               # (see XXX up ahead)
   320       501         1849      3.7      0.0      for doc in raw_documents:
   321       500       391586    783.2      2.4          term_count_current = Counter(self.analyzer.analyze(doc))
   322       500     15772081  31544.2     96.3          term_counts += term_count_current
   323
   324       500         3185      6.4      0.0          if max_df < 1.0:
   325                                                           document_counts.update(term_count_current)
   326
   327       500         2347      4.7      0.0          term_counts_per_doc.append(term_count_current)
   328
   329         1            4      4.0      0.0      n_doc = len(term_counts_per_doc)
   330
   331                                               # filter out stop words: terms that occur in almost all documents
   332         1            2      2.0      0.0      if max_df < 1.0:
   333                                                   max_document_count = max_df * n_doc
   334                                                   stop_words = set(t for t, dc in document_counts.iteritems()
   335                                                                    if dc > max_document_count)
   336                                               else:
   337         1            5      5.0      0.0          stop_words = set()
   338
   339                                               # list the terms that should be part of the vocabulary
   340         1            3      3.0      0.0      if max_features is None:
   341         1         4668   4668.0      0.0          terms = set(term_counts) - stop_words
   342                                               else:
   343                                                   # extract the most frequent terms for the vocabulary
   344                                                   terms = set()
   345                                                   for t, tc in term_counts.most_common():
   346                                                       if t not in stop_words:
   347                                                           terms.add(t)
   348                                                       if len(terms) >= max_features:
   349                                                           break
   350
   351                                               # convert to a document-token matrix
   352         1        13323  13323.0      0.1      self.vocabulary = dict(((t, i) for i, t in enumerate(terms)))
   353
   354                                               # the term_counts and document_counts might be useful statistics, are
   355                                               # we really sure want we want to drop them? They take some memory but
   356                                               # can be useful for corpus introspection
   357
   358         1       190330 190330.0      1.2      return self._term_count_dicts_to_matrix(term_counts_per_doc)
I used http://packages.python.org/line_profiler/ (which I wrapped as an IPython plugin as explained here:
http://scikit-learn.sourceforge.net/dev/developers/performance.html#profiling-python-code).
The command I used must have been something like:
from sklearn.datasets import load_20newsgroups
from sklearn.feature_extraction import text
twenty_newsgroups = load_20newsgroups()
%lprun -f text.CountVectorizer.fit_transform text.Vectorizer().fit(twenty_newsgroups.data)
But with your changeset applied, running text.Vectorizer().fit(twenty_newsgroups.data[:1000]) would take more than 10 seconds, so the issue was very visible (on my machine at least). If you cannot reproduce it on yours I will investigate.
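For a rough reproduction without line_profiler, a plain timing run along the following lines should show the same order of magnitude. This is only a sketch reusing the loader and Vectorizer names from the commands above, which belong to the scikit-learn of that era; adjust the imports to whatever your installed version provides.

import time

from sklearn.datasets import load_20newsgroups
from sklearn.feature_extraction import text

twenty_newsgroups = load_20newsgroups()

start = time.time()
# with the += based changeset this was reported to take more than 10 seconds
text.Vectorizer().fit(twenty_newsgroups.data[:1000])
print("fit took %.1f s" % (time.time() - start))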
On Python 2.6.6 as shipped by Debian 6.0.2 for x86-64: 54.2% in line 321 (term_count_current = Counter(self.analyzer.analyze(doc))), 9.9% in the following line (where you get 96.3%), and 34.1% in the last line, i.e. in _term_count_dicts_to_matrix.
That's 2.6, so using my version of Counter. I'll see if I can find a Python 2.7 somewhere around here.
I've found the cause. I must have been profiling on Python 2.6. The problem is that collections.Counter doesn't actually implement __iadd__ (unlike my version), so the += falls back to __add__, which copies the entire left-hand Counter, followed by a rebinding assignment:
>>> from collections import Counter
>>> a = Counter([1,2,3])
>>> b = a
>>> a += Counter([1,2,3])
>>> a is b
False
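The identity check above shows that += rebinds a to a brand new Counter. To see what that costs inside a loop, here is a small self-contained micro-benchmark sketch (not from the original thread; standard library only, exact numbers will vary by machine). The += variant re-copies the ever-growing accumulator on every iteration, which is exactly what the 96.3% line in the profile is paying for, while update() only touches the current document.

import time
from collections import Counter

def accumulate_add(docs):
    # slow pattern: __add__ builds a brand new Counter from both operands
    # on every iteration, so the cost grows with the accumulated vocabulary
    total = Counter()
    for doc in docs:
        total += Counter(doc)
    return total

def accumulate_update(docs):
    # fast pattern: update() mutates the accumulator in place,
    # cost proportional to the size of the current document only
    total = Counter()
    for doc in docs:
        total.update(Counter(doc))
    return total

# 500 fake documents of 200 tokens each, drawn from a growing vocabulary
docs = [["tok-%d-%d" % (i, j) for j in range(200)] for i in range(500)]

for fn in (accumulate_add, accumulate_update):
    start = time.time()
    fn(docs)
    print("%s: %.2f s" % (fn.__name__, time.time() - start))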
I should have been using the update method instead. Will push a patch.
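Presumably the patch boils down to a one-line change on line 322 of the profile above (a sketch of the intent, not the actual commit):

# before: __add__ copies the whole accumulated term_counts for every document
term_counts += term_count_current
# after: in-place update, proportional to the size of the current document
term_counts.update(term_count_current)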
Interesting. Do you think it's intentional, or is it a bug in the Counter implementation in Python 2.7? If this is a missing optimization in the stdlib, it would be worth contributing your __iadd__ implementation to the Python stdlib.
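For reference, such an __iadd__ could be as small as the subclass sketch below (purely illustrative; not the version shipped with scikit-learn nor whatever might be proposed on python-dev). One subtlety: the stdlib's Counter.__add__ also discards zero and negative counts, which a strictly equivalent in-place version would have to replicate; for pure term counting that makes no difference.

from collections import Counter

class InPlaceCounter(Counter):
    """Counter whose += updates in place instead of copying the left operand."""

    def __iadd__(self, other):
        self.update(other)
        return self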
I don't know, but it's still the case in the most recent Python 3. I already sent an email to python-dev. I can forward the answer to you if you're interested.
Ok great. Don't worry about forwarding, I will read the archives directly.
BTW, I forgot to say, thank you very much for fixing this :)
Can you post the command used to produce this, so I can try it myself?