Last active
August 29, 2015 14:07
-
-
Save chyikwei/e92c01bdb04450aa5b94 to your computer and use it in GitHub Desktop.
LDA cython profiling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _dirichlet_expectation at line 24 | |
Total time: 8.96912 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
24 @profile | |
25 def _dirichlet_expectation(alpha): | |
26 """ | |
27 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha. | |
28 """ | |
29 379947 411076 1.1 4.6 if (len(alpha.shape) == 1): | |
30 379940 8545062 22.5 95.3 return(psi(alpha) - psi(np.sum(alpha))) | |
31 7 12980 1854.3 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]) | |
File: lda.py | |
Function: _update_gamma at line 33 | |
Total time: 37.1273 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
33 @profile | |
34 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, | |
35 meanchangethresh, cal_delta): | |
36 """ | |
37 E-step: update latent variable gamma | |
38 """ | |
39 | |
40 2 8 4.0 0.0 n_docs, n_vocabs = X.shape | |
41 2 4 2.0 0.0 n_topics = expElogbeta.shape[0] | |
42 | |
43 # gamma is non-normalized topic distribution | |
44 2 5032 2516.0 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics)) | |
45 2 5883 2941.5 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) | |
46 # diff on component (only calculated when cal_delta is True) | |
47 2 70 35.0 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None | |
48 | |
49 2 3 1.5 0.0 X_data = X.data | |
50 2 2 1.0 0.0 X_indices = X.indices | |
51 2 2 1.0 0.0 X_indptr = X.indptr | |
52 | |
53 8002 12721 1.6 0.0 for d in xrange(n_docs): | |
54 8000 25173 3.1 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] | |
55 8000 19870 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] | |
56 8000 30900 3.9 0.1 gammad = gamma[d, :] | |
57 8000 26641 3.3 0.1 expElogthetad = expElogtheta[d, :] | |
58 8000 104626 13.1 0.3 expElogbetad = expElogbeta[:, ids] | |
59 # The optimal phi_{dwk} is proportional to | |
60 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. | |
61 8000 79777 10.0 0.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
62 | |
63 # Iterate between gamma and phi until convergence | |
64 381325 467124 1.2 1.3 for it in xrange(0, max_iters): | |
65 379940 565605 1.5 1.5 lastgamma = gammad | |
66 # We represent phi implicitly to save memory and time. | |
67 # Substituting the value of the optimal phi back into | |
68 # the update for gamma gives this update. Cf. Lee&Seung 2001. | |
69 379940 428819 1.1 1.2 gammad = alpha + expElogthetad * \ | |
70 379940 5605904 14.8 15.1 np.dot(cnts / phinorm, expElogbetad.T) | |
71 379940 12712990 33.5 34.2 expElogthetad = np.exp(_dirichlet_expectation(gammad)) | |
72 379940 3375137 8.9 9.1 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
73 | |
74 379940 12524287 33.0 33.7 meanchange = np.mean(abs(gammad - lastgamma)) | |
75 379940 620657 1.6 1.7 if (meanchange < meanchangethresh): | |
76 6615 8065 1.2 0.0 break | |
77 8000 50140 6.3 0.1 gamma[d, :] = gammad | |
78 # Contribution of document d to the expected sufficient | |
79 # statistics for the M step. | |
80 8000 9904 1.2 0.0 if cal_delta: | |
81 8000 447906 56.0 1.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) | |
82 | |
83 2 3 1.5 0.0 return (gamma, delta_component) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _dirichlet_expectation at line 26 | |
Total time: 8.93651 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
26 @profile | |
27 def _dirichlet_expectation(alpha): | |
28 """ | |
29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha. | |
30 """ | |
31 379947 391754 1.0 4.4 if (len(alpha.shape) == 1): | |
32 379940 8532031 22.5 95.5 return(psi(alpha) - psi(np.sum(alpha))) | |
33 7 12729 1818.4 0.1 return(psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]) | |
File: lda.py | |
Function: _update_gamma at line 35 | |
Total time: 25.8925 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
35 @profile | |
36 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, | |
37 meanchangethresh, cal_delta): | |
38 """ | |
39 E-step: update latent variable gamma | |
40 """ | |
41 | |
42 2 8 4.0 0.0 n_docs, n_vocabs = X.shape | |
43 2 5 2.5 0.0 n_topics = expElogbeta.shape[0] | |
44 | |
45 # gamma is non-normalized topic distribution | |
46 2 4931 2465.5 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics)) | |
47 2 5778 2889.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) | |
48 # diff on component (only calculated when cal_delta is True) | |
49 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None | |
50 | |
51 2 4 2.0 0.0 X_data = X.data | |
52 2 3 1.5 0.0 X_indices = X.indices | |
53 2 2 1.0 0.0 X_indptr = X.indptr | |
54 | |
55 8002 12479 1.6 0.0 for d in xrange(n_docs): | |
56 8000 26147 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] | |
57 8000 21494 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] | |
58 8000 37107 4.6 0.1 gammad = gamma[d, :] | |
59 8000 29111 3.6 0.1 expElogthetad = expElogtheta[d, :] | |
60 8000 99660 12.5 0.4 expElogbetad = expElogbeta[:, ids] | |
61 # The optimal phi_{dwk} is proportional to | |
62 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. | |
63 8000 79255 9.9 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
64 | |
65 # Iterate between gamma and phi until convergence | |
66 381325 473084 1.2 1.8 for it in xrange(0, max_iters): | |
67 379940 771424 2.0 3.0 lastgamma = gammad | |
68 # We represent phi implicitly to save memory and time. | |
69 # Substituting the value of the optimal phi back into | |
70 # the update for gamma gives this update. Cf. Lee&Seung 2001. | |
71 379940 453302 1.2 1.8 gammad = alpha + expElogthetad * \ | |
72 379940 5402250 14.2 20.9 np.dot(cnts / phinorm, expElogbetad.T) | |
73 379940 12609292 33.2 48.7 expElogthetad = np.exp(_dirichlet_expectation(gammad)) | |
74 379940 3407782 9.0 13.2 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
75 | |
76 379940 1417461 3.7 5.5 meanchange = mean_change(lastgamma, gammad) | |
77 379940 532688 1.4 2.1 if (meanchange < meanchangethresh): | |
78 6615 8396 1.3 0.0 break | |
79 8000 50124 6.3 0.2 gamma[d, :] = gammad | |
80 # Contribution of document d to the expected sufficient | |
81 # statistics for the M step. | |
82 8000 10207 1.3 0.0 if cal_delta: | |
83 8000 440468 55.1 1.7 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) | |
84 | |
85 2 3 1.5 0.0 return (gamma, delta_component) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _dirichlet_expectation at line 26 | |
Total time: 3.92028 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
26 @profile | |
27 def _dirichlet_expectation(alpha): | |
28 """ | |
29 For a vector theta ~ Dir(alpha), computes E[log(theta)] given alpha. | |
30 """ | |
31 379947 391707 1.0 10.0 if (len(alpha.shape) == 1): | |
32 379940 3197582 8.4 81.6 ret = _dirichlet_expectation_1d(alpha) | |
33 else: | |
34 7 14893 2127.6 0.4 ret = _dirichlet_expectation_2d(alpha) | |
35 379947 316096 0.8 8.1 return ret | |
File: lda.py | |
Function: _update_gamma at line 38 | |
Total time: 21.0102 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
38 @profile | |
39 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, | |
40 meanchangethresh, cal_delta): | |
41 """ | |
42 E-step: update latent variable gamma | |
43 """ | |
44 | |
45 2 8 4.0 0.0 n_docs, n_vocabs = X.shape | |
46 2 4 2.0 0.0 n_topics = expElogbeta.shape[0] | |
47 | |
48 # gamma is non-normalized topic distribution | |
49 2 4959 2479.5 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics)) | |
50 2 5692 2846.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) | |
51 # diff on component (only calculated when cal_delta is True) | |
52 2 23 11.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None | |
53 | |
54 2 4 2.0 0.0 X_data = X.data | |
55 2 3 1.5 0.0 X_indices = X.indices | |
56 2 2 1.0 0.0 X_indptr = X.indptr | |
57 | |
58 8002 12836 1.6 0.1 for d in xrange(n_docs): | |
59 8000 26909 3.4 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] | |
60 8000 21216 2.7 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] | |
61 8000 36913 4.6 0.2 gammad = gamma[d, :] | |
62 8000 28754 3.6 0.1 expElogthetad = expElogtheta[d, :] | |
63 8000 106489 13.3 0.5 expElogbetad = expElogbeta[:, ids] | |
64 # The optimal phi_{dwk} is proportional to | |
65 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. | |
66 8000 80909 10.1 0.4 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
67 | |
68 # Iterate between gamma and phi until convergence | |
69 381325 466912 1.2 2.2 for it in xrange(0, max_iters): | |
70 379940 785902 2.1 3.7 lastgamma = gammad | |
71 # We represent phi implicitly to save memory and time. | |
72 # Substituting the value of the optimal phi back into | |
73 # the update for gamma gives this update. Cf. Lee&Seung 2001. | |
74 379940 455282 1.2 2.2 gammad = alpha + expElogthetad * \ | |
75 379940 5387921 14.2 25.6 np.dot(cnts / phinorm, expElogbetad.T) | |
76 379940 7855665 20.7 37.4 expElogthetad = np.exp(_dirichlet_expectation(gammad)) | |
77 379940 3367420 8.9 16.0 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
78 | |
79 379940 1301467 3.4 6.2 meanchange = mean_change(lastgamma, gammad) | |
80 379940 542443 1.4 2.6 if (meanchange < meanchangethresh): | |
81 6615 8323 1.3 0.0 break | |
82 8000 50913 6.4 0.2 gamma[d, :] = gammad | |
83 # Contribution of document d to the expected sufficient | |
84 # statistics for the M step. | |
85 8000 10525 1.3 0.1 if cal_delta: | |
86 8000 452723 56.6 2.2 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) | |
87 | |
88 2 3 1.5 0.0 return (gamma, delta_component) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment