ogrisel · October 1, 2011 13:37
diff --git a/output.txt b/output.txt
 %lprun -f datasets.svmlight_format._load_svmlight_file _ = datasets.load_svmlight_file('competition_data/public_train_data.svmlight.dat')

 Timer unit: 1e-06 s

 File: /home/ogrisel/coding/scikit-learn/sklearn/datasets/svmlight_format.py
 Function: _load_svmlight_file at line 21
 Total time: 58.3922 s

 Line #      Hits         Time  Per Hit   % Time  Line Contents
 ==============================================================
    21                                           def _load_svmlight_file(f, buffer_mb, n_features):
    22         1            7      7.0      0.0      data = []
    23         1            5      5.0      0.0      indptr = []
    24         1            4      4.0      0.0      indices = []
    25         1            5      5.0      0.0      labels = []
    26                                           
    27     50001       230613      4.6      0.4      for line in f:
    28     50000       141327      2.8      0.2          line = line.strip()
    29                                           
    30     50000       168408      3.4      0.3          hash_position = line.find('#')
    31     50000       104337      2.1      0.2          if hash_position == 0:
    32                                                       continue
    33     50000       100519      2.0      0.2          elif hash_position > 0:
    34                                                       line = line[:hash_position].strip()
    35                                           
    36     50000       675187     13.5      1.2          line_parts = line.split()
    37     50000       245655      4.9      0.4          y, features = line_parts[0], line_parts[1:]
    38                                           
    39     50000       147872      3.0      0.3          labels.append(float(y))
    40     50000       118925      2.4      0.2          indptr.append(len(data))
    41                                           
    42   5730286     11460625      2.0     19.6          for feat in features:
    43   5680286     14263551      2.5     24.4              idx, value = feat.split(":")
    44   5680286     15822776      2.8     27.1              indices.append(int(idx))
    45   5680286     14151570      2.5     24.2              data.append(float(value))
    46                                           
    47         1            3      3.0      0.0      indptr.append(len(data))
    48         1        17321  17321.0      0.0      indptr = np.array(indptr, dtype=np.int)
    49                                           
    50         1            5      5.0      0.0      if n_features is not None:
    51                                                   shape = (indptr.shape[0] - 1, n_features)
    52                                               else:
    53         1            2      2.0      0.0          shape = None    # inferred
    54                                           
    55         1       297895 297895.0      0.5      X = sp.csr_matrix((np.array(data, dtype=np.double),
    56         1       392693 392693.0      0.7                         np.array(indices, dtype=np.int),
    57         1        45218  45218.0      0.1                         indptr), shape)
    58                                           
    59         1         7638   7638.0      0.0      return X, np.array(labels, dtype=np.double)
	%lprun -f datasets.svmlight_format._load_svmlight_file _ = datasets.load_svmlight_file('competition_data/public_train_data.svmlight.dat')

	Timer unit: 1e-06 s

	File: /home/ogrisel/coding/scikit-learn/sklearn/datasets/svmlight_format.py
	Function: _load_svmlight_file at line 21
	Total time: 58.3922 s

	Line #      Hits         Time  Per Hit   % Time  Line Contents
	==============================================================
	    21                                           def _load_svmlight_file(f, buffer_mb, n_features):
	    22         1            7      7.0      0.0      data = []
	    23         1            5      5.0      0.0      indptr = []
	    24         1            4      4.0      0.0      indices = []
	    25         1            5      5.0      0.0      labels = []
	    26
	    27     50001       230613      4.6      0.4      for line in f:
	    28     50000       141327      2.8      0.2          line = line.strip()
	    29
	    30     50000       168408      3.4      0.3          hash_position = line.find('#')
	    31     50000       104337      2.1      0.2          if hash_position == 0:
	    32                                                       continue
	    33     50000       100519      2.0      0.2          elif hash_position > 0:
	    34                                                       line = line[:hash_position].strip()
	    35
	    36     50000       675187     13.5      1.2          line_parts = line.split()
	    37     50000       245655      4.9      0.4          y, features = line_parts[0], line_parts[1:]
	    38
	    39     50000       147872      3.0      0.3          labels.append(float(y))
	    40     50000       118925      2.4      0.2          indptr.append(len(data))
	    41
	    42   5730286     11460625      2.0     19.6          for feat in features:
	    43   5680286     14263551      2.5     24.4              idx, value = feat.split(":")
	    44   5680286     15822776      2.8     27.1              indices.append(int(idx))
	    45   5680286     14151570      2.5     24.2              data.append(float(value))
	    46
	    47         1            3      3.0      0.0      indptr.append(len(data))
	    48         1        17321  17321.0      0.0      indptr = np.array(indptr, dtype=np.int)
	    49
	    50         1            5      5.0      0.0      if n_features is not None:
	    51                                                   shape = (indptr.shape[0] - 1, n_features)
	    52                                               else:
	    53         1            2      2.0      0.0          shape = None    # inferred
	    54
	    55         1       297895 297895.0      0.5      X = sp.csr_matrix((np.array(data, dtype=np.double),
	    56         1       392693 392693.0      0.7                         np.array(indices, dtype=np.int),
	    57         1        45218  45218.0      0.1                         indptr), shape)
	    58
	    59         1         7638   7638.0      0.0      return X, np.array(labels, dtype=np.double)