Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Last active August 29, 2015 14:07
Show Gist options
  • Save chyikwei/e92c01bdb04450aa5b94 to your computer and use it in GitHub Desktop.
Save chyikwei/e92c01bdb04450aa5b94 to your computer and use it in GitHub Desktop.
LDA cython profiling
File: lda.py
Function: _dirichlet_expectation at line 24
Total time: 8.96912 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
24 @profile
25 def _dirichlet_expectation(alpha):
26 """
27 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
28 """
29 379947 411076 1.1 4.6 if (len(alpha.shape) == 1):
30 379940 8545062 22.5 95.3 return(psi(alpha) - psi(np.sum(alpha)))
31 7 12980 1854.3 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis])
File: lda.py
Function: _update_gamma at line 33
Total time: 37.1273 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
33 @profile
34 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
35 meanchangethresh, cal_delta):
36 """
37 E-step: update latent variable gamma
38 """
39
40 2 8 4.0 0.0 n_docs, n_vocabs = X.shape
41 2 4 2.0 0.0 n_topics = expElogbeta.shape[0]
42
43 # gamma is non-normalized topic distribution
44 2 5032 2516.0 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
45 2 5883 2941.5 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
46 # diff on component (only calculated when cal_delta is True)
47 2 70 35.0 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
48
49 2 3 1.5 0.0 X_data = X.data
50 2 2 1.0 0.0 X_indices = X.indices
51 2 2 1.0 0.0 X_indptr = X.indptr
52
53 8002 12721 1.6 0.0 for d in xrange(n_docs):
54 8000 25173 3.1 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
55 8000 19870 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
56 8000 30900 3.9 0.1 gammad = gamma[d, :]
57 8000 26641 3.3 0.1 expElogthetad = expElogtheta[d, :]
58 8000 104626 13.1 0.3 expElogbetad = expElogbeta[:, ids]
59 # The optimal phi_{dwk} is proportional to
60 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
61 8000 79777 10.0 0.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
62
63 # Iterate between gamma and phi until convergence
64 381325 467124 1.2 1.3 for it in xrange(0, max_iters):
65 379940 565605 1.5 1.5 lastgamma = gammad
66 # We represent phi implicitly to save memory and time.
67 # Substituting the value of the optimal phi back into
68 # the update for gamma gives this update. Cf. Lee&Seung 2001.
69 379940 428819 1.1 1.2 gammad = alpha + expElogthetad * \
70 379940 5605904 14.8 15.1 np.dot(cnts / phinorm, expElogbetad.T)
71 379940 12712990 33.5 34.2 expElogthetad = np.exp(_dirichlet_expectation(gammad))
72 379940 3375137 8.9 9.1 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
73
74 379940 12524287 33.0 33.7 meanchange = np.mean(abs(gammad - lastgamma))
75 379940 620657 1.6 1.7 if (meanchange < meanchangethresh):
76 6615 8065 1.2 0.0 break
77 8000 50140 6.3 0.1 gamma[d, :] = gammad
78 # Contribution of document d to the expected sufficient
79 # statistics for the M step.
80 8000 9904 1.2 0.0 if cal_delta:
81 8000 447906 56.0 1.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
82
83 2 3 1.5 0.0 return (gamma, delta_component)
File: lda.py
Function: _dirichlet_expectation at line 26
Total time: 8.93651 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
26 @profile
27 def _dirichlet_expectation(alpha):
28 """
29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
30 """
31 379947 391754 1.0 4.4 if (len(alpha.shape) == 1):
32 379940 8532031 22.5 95.5 return(psi(alpha) - psi(np.sum(alpha)))
33 7 12729 1818.4 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis])
File: lda.py
Function: _update_gamma at line 35
Total time: 25.8925 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
35 @profile
36 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
37 meanchangethresh, cal_delta):
38 """
39 E-step: update latent variable gamma
40 """
41
42 2 8 4.0 0.0 n_docs, n_vocabs = X.shape
43 2 5 2.5 0.0 n_topics = expElogbeta.shape[0]
44
45 # gamma is non-normalized topic distribution
46 2 4931 2465.5 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
47 2 5778 2889.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
48 # diff on component (only calculated when cal_delta is True)
49 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
50
51 2 4 2.0 0.0 X_data = X.data
52 2 3 1.5 0.0 X_indices = X.indices
53 2 2 1.0 0.0 X_indptr = X.indptr
54
55 8002 12479 1.6 0.0 for d in xrange(n_docs):
56 8000 26147 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
57 8000 21494 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
58 8000 37107 4.6 0.1 gammad = gamma[d, :]
59 8000 29111 3.6 0.1 expElogthetad = expElogtheta[d, :]
60 8000 99660 12.5 0.4 expElogbetad = expElogbeta[:, ids]
61 # The optimal phi_{dwk} is proportional to
62 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
63 8000 79255 9.9 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
64
65 # Iterate between gamma and phi until convergence
66 381325 473084 1.2 1.8 for it in xrange(0, max_iters):
67 379940 771424 2.0 3.0 lastgamma = gammad
68 # We represent phi implicitly to save memory and time.
69 # Substituting the value of the optimal phi back into
70 # the update for gamma gives this update. Cf. Lee&Seung 2001.
71 379940 453302 1.2 1.8 gammad = alpha + expElogthetad * \
72 379940 5402250 14.2 20.9 np.dot(cnts / phinorm, expElogbetad.T)
73 379940 12609292 33.2 48.7 expElogthetad = np.exp(_dirichlet_expectation(gammad))
74 379940 3407782 9.0 13.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
75
76 379940 1417461 3.7 5.5 meanchange = mean_change(lastgamma, gammad)
77 379940 532688 1.4 2.1 if (meanchange < meanchangethresh):
78 6615 8396 1.3 0.0 break
79 8000 50124 6.3 0.2 gamma[d, :] = gammad
80 # Contribution of document d to the expected sufficient
81 # statistics for the M step.
82 8000 10207 1.3 0.0 if cal_delta:
83 8000 440468 55.1 1.7 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
84
85 2 3 1.5 0.0 return (gamma, delta_component)
File: lda.py
Function: _dirichlet_expectation at line 26
Total time: 3.92028 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
26 @profile
27 def _dirichlet_expectation(alpha):
28 """
29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha.
30 """
31 379947 391707 1.0 10.0 if (len(alpha.shape) == 1):
32 379940 3197582 8.4 81.6 ret = _dirichlet_expectation_1d(alpha)
33 else:
34 7 14893 2127.6 0.4 ret = _dirichlet_expectation_2d(alpha)
35 379947 316096 0.8 8.1 return ret
File: lda.py
Function: _update_gamma at line 38
Total time: 21.0102 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
38 @profile
39 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
40 meanchangethresh, cal_delta):
41 """
42 E-step: update latent variable gamma
43 """
44
45 2 8 4.0 0.0 n_docs, n_vocabs = X.shape
46 2 4 2.0 0.0 n_topics = expElogbeta.shape[0]
47
48 # gamma is non-normalized topic distribution
49 2 4959 2479.5 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
50 2 5692 2846.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
51 # diff on component (only calculated when cal_delta is True)
52 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
53
54 2 4 2.0 0.0 X_data = X.data
55 2 3 1.5 0.0 X_indices = X.indices
56 2 2 1.0 0.0 X_indptr = X.indptr
57
58 8002 12836 1.6 0.1 for d in xrange(n_docs):
59 8000 26909 3.4 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
60 8000 21216 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
61 8000 36913 4.6 0.2 gammad = gamma[d, :]
62 8000 28754 3.6 0.1 expElogthetad = expElogtheta[d, :]
63 8000 106489 13.3 0.5 expElogbetad = expElogbeta[:, ids]
64 # The optimal phi_{dwk} is proportional to
65 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
66 8000 80909 10.1 0.4 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
67
68 # Iterate between gamma and phi until convergence
69 381325 466912 1.2 2.2 for it in xrange(0, max_iters):
70 379940 785902 2.1 3.7 lastgamma = gammad
71 # We represent phi implicitly to save memory and time.
72 # Substituting the value of the optimal phi back into
73 # the update for gamma gives this update. Cf. Lee&Seung 2001.
74 379940 455282 1.2 2.2 gammad = alpha + expElogthetad * \
75 379940 5387921 14.2 25.6 np.dot(cnts / phinorm, expElogbetad.T)
76 379940 7855665 20.7 37.4 expElogthetad = np.exp(_dirichlet_expectation(gammad))
77 379940 3367420 8.9 16.0 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
78
79 379940 1301467 3.4 6.2 meanchange = mean_change(lastgamma, gammad)
80 379940 542443 1.4 2.6 if (meanchange < meanchangethresh):
81 6615 8323 1.3 0.0 break
82 8000 50913 6.4 0.2 gamma[d, :] = gammad
83 # Contribution of document d to the expected sufficient
84 # statistics for the M step.
85 8000 10525 1.3 0.1 if cal_delta:
86 8000 452723 56.6 2.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
87
88 2 3 1.5 0.0 return (gamma, delta_component)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment