Created
August 7, 2016 17:12
-
-
Save timm/f316c8d842b567cd8357b9870f3be7f0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| #<< | |
| ################################################### | |
| CHUNK: near-linear time recursive clustering. | |
| Copyright (c) 2014, Tim Menzies, [email protected] | |
| All rights reserved. | |
| For details on this code, see chapter 12 of | |
| "Sharing Data and Models in Software Engineering" | |
| https://goo.gl/iPuA0f. | |
| Redistribution and use in source and binary forms, | |
| with or without modification, are permitted provided | |
| that the following conditions are met: | |
| 1. Redistributions of source code must retain the | |
| above copyright notice, this list of conditions and | |
| the following disclaimer. | |
| 2. Redistributions in binary form must reproduce the | |
| above copyright notice, this list of conditions and | |
| the following disclaimer in the documentation and/or | |
| other materials provided with the distribution. | |
| 3. Neither the name of the copyright holder nor the | |
| names of its contributors may be used to endorse or | |
| promote products derived from this software without | |
| specific prior written permission. | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS | |
| AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED | |
| WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT | |
| SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | |
| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | |
| NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR | |
| TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN | |
| ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | |
| ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| ################################################### | |
| #>> | |
| ## License | |
| CHUNK is released under the BSD 3-clause license | |
| that allows proprietary use and allows the software | |
| released under the license to be incorporated into | |
| proprietary products. Works based on CHUNK may be | |
| released under a proprietary license as closed | |
| source software. | |
| ## Installing CHUNK | |
| Download _chunk.py_ from http://unbox.org/open/tags/sharing/chunk/1.0/chunk.py | |
| ## Testing Your Installation | |
| Test the code using | |
| python chunk.py | |
| That call will automatically run a function | |
| that recursively clusters _nasa93_. | |
| That data from 93 NASA software projects | |
| from the years 1971 to 1987: | |
| def nasa93(): | |
| return [ | |
| [1980,4,2,4,6,6,2,4,4,4,4,3,4,4,4,3,7.5,72] | |
| ,[1980,3,2,4,3,3,2,2,4,5,5,3,4,3,3,3,20,72] | |
| ,[1984,3,2,4,3,3,2,2,4,5,4,3,4,3,3,3,6,24] | |
| ,[1980,3,2,4,3,3,2,2,4,5,5,3,4,3,3,3,100,360] | |
| ,[1985,3,2,4,3,3,2,2,4,5,3,3,2,3,3,3,11.3,36] | |
| ,[1980,3,2,4,3,3,4,2,4,4,4,2,1,3,3,3,100,215] | |
| # lines deleted from view | |
| ] | |
| In this data, each row is one project and the first | |
| column is the year of the project. Also, the second | |
| last column is the known size of the project | |
| (measured in thousands of lines of code). Finally, | |
| the last column is the known effort for that project | |
| (respectively). | |
| As to the other columns, these are the standard | |
| variables from the COCOMO model such as _rely_ | |
| (required reliability), _data_ (database size), | |
| _cplx_ (product complexity) and many others | |
| (e.g. _virt, turn, acap, aexp, | |
| pcap,vexp,vecp,modp,too,sced_). | |
| """ | |
| #<< | |
| def nasa93(): | |
| """year,rely,data,cplx,time,stor,virt,turn,acap, | |
| aexp, pcap,vexp,vecp,modp,too,sced,kloc,effort""" | |
| return [ | |
| [1980,4,2,4,6,6,2,4,4,4,4,3,4,4,4,3,7.5,72] | |
| ,[1980,3,2,4,3,3,2,2,4,5,5,3,4,3,3,3,20,72] | |
| ,[1984,3,2,4,3,3,2,2,4,5,4,3,4,3,3,3,6,24] | |
| ,[1980,3,2,4,3,3,2,2,4,5,5,3,4,3,3,3,100,360] | |
| ,[1985,3,2,4,3,3,2,2,4,5,3,3,2,3,3,3,11.3,36] | |
| ,[1980,3,2,4,3,3,4,2,4,4,4,2,1,3,3,3,100,215] | |
| ,[1983,3,2,4,3,3,2,2,4,5,4,3,4,3,3,3,20,48] | |
| ,[1982,3,2,4,3,3,2,2,4,3,3,3,1,3,3,3,100,360] | |
| ,[1980,3,2,4,3,6,2,2,4,5,5,3,4,3,3,3,150, 324] | |
| ,[1984,3,2,4,3,3,2,2,4,4,4,3,4,3,3,3,31.5,60] | |
| ,[1983,3,2,4,3,3,2,2,4,5,4,3,4,3,3,3,15, 48] | |
| ,[1984,3,2,4,3,6,2,2,4,4,3,3,4,3,3,3,32.5,60] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,25.9,117.6] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,24.6,117.6] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,7.7,31.2] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,8.2,36] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,9.7,25.2] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,2.2,8.4] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,3.5,10.8] | |
| ,[1982,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,66.6,352.8] | |
| ,[1985,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,19.7,60] | |
| ,[1985,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,66.6,300] | |
| ,[1985,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,29.5,120] | |
| ,[1986,4,3,3,4,3,3,3,3,4,4,3,3,3,3,3,15,90] | |
| ,[1986,4,3,4,3,3,3,3,3,4,4,3,3,3,3,3,38,210] | |
| ,[1986,3,3,3,3,3,3,3,3,4,4,3,3,3,3,3,10,48] | |
| ,[1982,3,5,4,5,5,2,4,5,4,3,2,4,5,5,2,15.4,70] | |
| ,[1982,3,5,4,5,5,2,4,5,4,3,2,4,5,5,2,48.5,239] | |
| ,[1982,3,5,4,5,5,2,4,5,4,3,2,4,5,5,2,16.3,82] | |
| ,[1982,3,5,4,5,5,2,4,5,4,3,2,4,5,5,2,12.8,62] | |
| ,[1982,3,5,4,5,5,2,4,5,4,3,2,4,5,5,2,32.6,170] | |
| ,[1982,3,5,4,5,5,2,4,5,4,3,2,4,5,5,2,35.5,192] | |
| ,[1985,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,5.5,18] | |
| ,[1987,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,10.4,50] | |
| ,[1987,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,14,60] | |
| ,[1986,4,3,4,3,3,3,3,3,3,3,3,3,3,3,3,6.5,42] | |
| ,[1986,3,3,4,3,3,3,3,3,3,3,3,3,3,3,3,13,60] | |
| ,[1986,3,3,4,3,3,3,3,3,3,4,3,4,4,4,3,90,444] | |
| ,[1986,3,3,4,3,3,3,3,3,3,3,3,3,3,3,3,8,42] | |
| ,[1986,3,3,4,4,3,3,3,3,3,3,3,3,3,3,3,16,114] | |
| ,[1980,3,4,4,5,4,2,4,4,3,4,2,4,4,3,2,177.9,1248] | |
| ,[1979,5,4,4,5,5,3,3,5,5,5,3,4,4,4,2,227,1181] | |
| ,[1977,3,4,5,3,3,2,3,4,3,5,2,3,4,3,2,70,278] | |
| ,[1979,4,2,4,3,3,2,2,3,3,3,3,4,4,3,2,0.9,8.4] | |
| ,[1980,4,5,5,6,6,4,4,3,3,3,2,2,3,3,4,32,1350] | |
| ,[1980,4,4,4,5,6,4,4,4,4,4,4,4,4,3,3,53,480] | |
| ,[1983,4,3,5,5,5,4,4,3,3,3,2,2,3,3,4,16.3,480] | |
| ,[1983,4,3,5,5,5,4,4,3,3,3,2,2,3,3,4,6.2,12] | |
| ,[1983,4,3,5,5,5,4,4,3,3,3,2,2,3,3,4,3,38] | |
| ,[1977,4,2,5,5,6,2,3,5,5,5,1,1,4,4,3,41,599] | |
| ,[1977,4,2,5,5,6,2,3,5,5,5,1,1,4,4,3,24,430] | |
| ,[1982,3,4,2,3,3,4,3,4,4,3,3,3,4,4,3,282.1,1368] | |
| ,[1982,4,4,2,3,3,3,4,4,4,3,3,3,4,3,3,284.7,973] | |
| ,[1982,4,4,3,3,3,2,2,3,4,4,3,4,3,3,3,79,400] | |
| ,[1977,2,3,3,3,3,2,2,4,4,5,3,4,2,2,4,423,2400] | |
| ,[1977,3,3,3,3,3,2,3,4,5,5,2,4,4,3,3,190,420] | |
| ,[1984,3,3,4,3,4,3,3,4,4,3,3,4,4,3,4,47.5,252] | |
| ,[1980,5,3,6,4,4,2,2,3,4,3,3,3,2,4,3,21,107] | |
| ,[1983,3,4,4,5,3,3,4,4,4,4,3,4,2,2,4,78,571.4] | |
| ,[1984,3,4,4,5,3,3,4,4,4,4,3,4,2,2,4,11.4,98.8] | |
| ,[1985,3,4,4,5,3,3,4,4,4,4,3,4,2,2,4,19.3,155] | |
| ,[1979,4,3,5,4,4,2,4,4,3,3,4,4,2,5,4,101,750] | |
| ,[1979,4,3,4,4,4,2,4,3,4,3,3,3,2,5,3,219,2120] | |
| ,[1979,4,3,4,4,4,2,4,3,4,3,3,3,2,5,3,50,370] | |
| ,[1976,4,3,6,4,4,2,2,4,3,3,4,4,4,4,3,70,458] | |
| ,[1979,4,3,6,4,4,2,2,4,3,3,4,4,4,4,3,271,2460] | |
| ,[1971,3,3,3,3,3,2,2,4,4,4,3,4,3,2,3,90,162] | |
| ,[1980,3,3,3,3,3,2,2,4,4,4,3,4,3,2,3,40,150] | |
| ,[1979,4,3,4,4,3,2,2,4,4,4,3,4,3,3,3,137,636] | |
| ,[1977,4,3,4,4,3,4,2,4,4,4,3,4,3,1,3,150,882] | |
| ,[1976,5,3,4,4,3,2,2,4,4,4,3,4,3,3,3,339,444] | |
| ,[1983,2,4,2,3,3,4,2,4,4,4,3,4,3,2,3,240,192] | |
| ,[1978,4,3,4,3,5,2,3,4,4,4,4,4,2,2,2,144,576] | |
| ,[1979,3,2,3,3,5,2,3,4,4,4,4,4,2,2,2,151,432] | |
| ,[1979,3,2,4,3,5,2,3,4,4,4,4,4,2,2,2,34,72] | |
| ,[1979,3,3,4,3,5,2,3,4,4,4,4,4,2,2,2,98,300] | |
| ,[1979,3,3,4,3,5,2,3,4,4,4,4,4,2,2,2,85,300] | |
| ,[1982,3,2,3,3,5,2,3,4,4,4,4,4,2,2,2,20,240] | |
| ,[1978,3,2,3,3,5,2,3,4,4,4,4,4,2,2,2,111,600] | |
| ,[1978,4,5,4,3,5,2,3,4,4,4,4,4,2,2,2,162,756] | |
| ,[1978,4,4,5,3,5,2,3,4,4,4,4,4,2,2,2,352,1200] | |
| ,[1979,4,3,5,3,5,2,3,4,4,4,4,4,2,2,2,165,97] | |
| ,[1984,4,3,5,4,4,2,5,4,3,3,4,4,4,5,4,60,409] | |
| ,[1984,4,3,5,4,4,2,5,4,3,3,4,4,4,5,4,100,703] | |
| ,[1977,5,4,5,6,6,3,3,4,4,4,4,4,4,3,4,165,4178.2] | |
| ,[1977,5,4,5,6,6,3,3,4,4,4,4,4,4,3,4,65,1772.5] | |
| ,[1977,5,4,5,6,6,3,2,4,4,4,4,4,4,3,4,70,1645.9] | |
| ,[1977,5,4,6,6,6,3,3,4,4,4,4,4,4,3,4,50,1924.5] | |
| ,[1982,5,2,5,5,6,2,2,4,2,3,1,2,2,4,4,7.25,648] | |
| ,[1980,5,4,5,6,6,3,3,4,4,4,4,4,4,3,4,233,8211] | |
| ,[1975,4,2,4,3,3,2,2,3,3,4,3,3,4,1,3,302,2400] | |
| ,[1974,5,2,6,6,5,2,2,4,5,4,1,4,1,1,4,980,4560] | |
| ,[1975,3,2,4,3,3,2,2,5,3,5,4,4,3,2,3,350,720] | |
| ] | |
| #>> | |
| """ | |
| When _chunk.py_ is loaded into Python, it prints the following tree. | |
| Note that the 93 projects | |
| in _nasa93_ where first sub-divided into two groups of 47 items. | |
| This division repeated to find 16 leaf clusters of size five of six. | |
| 93 | |
| ... # <== [some lines omitted] | |
| |.. 47 | |
| |.. |.. 23 | |
| |.. |.. |.. 11 | |
| |.. |.. |.. |.. 5. | |
| |.. |.. |.. |.. 6. #<=== clusterX | |
| |.. |.. |.. 12 | |
| |.. |.. |.. |.. 6. | |
| |.. |.. |.. |.. 6. | |
| |.. |.. 24 | |
| |.. |.. |.. 12 | |
| |.. |.. |.. |.. 6. | |
| |.. |.. |.. |.. 6. | |
| |.. |.. |.. 12 | |
| |.. |.. |.. |.. 6. | |
| |.. |.. |.. |.. 6. #<=== clusterY | |
| Just to show that the clustering works, we note that some of these | |
| clusters are very uniform. For example, _clusterY_ contains six | |
| projects, all from 1979, and all with mostly | |
| repeated values. In the following, the columns marked | |
| _rdctsvTaApvlmTS_, these hold values for | |
| _rely,data,cplx,time, stor,virt,turn,acap,aexp, | |
| pcap,vexp,lexp,modp, tool,sced_ (respectively). Also, | |
| if a value is the same as the one above, this is shown with a | |
| "ditto" mark, _"."_. | |
| year rdctsvTaApvlmTS ksloc effort | |
| ==== =============== ===== ===== | |
| project 88: 1979 424332233334432 9.7 = 25.2 | |
| project 89: . ............... 8.2 = 36 | |
| project 90: . ............... 7.7 = 31.2 | |
| project 91: . ............... 3.5 = 10.8 | |
| project 92: . ............... 2.2 = 8.4 | |
| project 93: . ............... 0.9 = 8.4 | |
| Other clusters are more diverse. For example, _clusterX_ | |
| contains data from nearly ten years of software development. | |
| As might be expected, those projects are not all similar so | |
| there | |
| are fewer ditto marks: | |
| year rdctsvTaApvlmTS ksloc effort | |
| ==== =============== ===== ===== | |
| project 52: 1977 345332343523432 70 = 278 | |
| project 53: 1979 5.4553.55.34.4. 227 = 1181 | |
| project 54: . 43.4424343.3253 50 = 370 | |
| project 55: 1980 34.5...43424432 177.9 = 1248 | |
| project 56: 1984 .2.36.2.433.3.3 32.5 = 60 | |
| project 57: 1986 .3.433333..3... 16 = 114 | |
| ## Applying CHUNK to Other Models | |
| CHUNK recursively divides data in the format of | |
| the _nasa93()_ function show above. | |
| So, to apply CHUNK to different data, write | |
| another function like _nasa93_. | |
| Also, CHUNK needs to know some meta-knowledge about the data. | |
| For example, column of data, | |
| CHUNK needs to knows the low and high value: | |
| """ | |
| def lohi(m,x): | |
| if m == nasa93: | |
| if x==16: return 0,1000 # range of KLOC | |
| elif x==0 : return 1971,1987 # years | |
| else : return 1,6 # 1..6 = vlow,low,nom, | |
| # hi,vhi,xhi | |
| else: | |
| raise Exception('[%s] unknown' % m.__name__) | |
| """ | |
| Sometimes, there is knowledge that some variables | |
| are more important than others. In the case of | |
| _nasa93_, we have no such knowledge so everything | |
| has the same weight. | |
| """ | |
| def weight(m,x): | |
| if m == nasa93: | |
| return 1 | |
| else: | |
| raise Exception('[%s] unknown' % m.__name__) | |
| """ | |
| CHUNK groups together projects with similar | |
| decisions. To do that, CHUNK isolates those | |
| decisions (which it calls _dec_) from the objectives | |
| (which it calls _obj_). | |
| """ | |
| def project2Slots(project = nasa93()): | |
| "Returns 'Slots'- a struct with named fields." | |
| return [Slots( dec = col[:-1], | |
| obj = [col[-1] | |
| ]) for col in project] | |
| """ | |
| In summary, to apply CHUNK to other data, | |
| write a new model function (like _nasa93_) then | |
| modify _lohi_ and _weight_ to add in the meta-knowledge. | |
| Optionally, you may also need to modify _project2Slots_ to convert your | |
| model's data format into the _Slots_ needed by CHUNK. | |
| # Inside CHUNK | |
| ## Roadmap to Functions. | |
| As shown below, _chunk_ is the main function that | |
| recurively divides the data. | |
| In that division, the _dist_ function reports the | |
| distance between data items (and that function is | |
| helped by _normalize_ and _squaredDifferences_). | |
| To divide the data, we use _fastdiv_ which in turn | |
| uses _twoDistantPoints_ to separate the data. Note | |
| that the main _chunk_ function recurses on the | |
| divisions generated by _fastdiv_. That recursion is | |
| controlled by the parameters initialized in | |
| _settings_. | |
| After that, this code is all low-level Python tricks | |
| such as _align_, which pretty prints columns of | |
| data. Two other important tricks are the _leafs_ | |
| iterator (that walks over the tree of clusters found | |
| by _chunk_) and _Slots_ which is a generic Python | |
| struct that supports reading and writing to names | |
| fields. | |
| Finally, the function _\_nasa93_ shows an example of | |
| how to chunk the _nasa93_ data, shown above. | |
| All these functions are detailed, below. | |
| ## Distance Calculations | |
| CHUNK finds groups of similar things. But what does | |
| "similar" mean? | |
| To answer that questions, CHUNK uses the following | |
| _dist_ function. This function excepts two examples | |
| _i,j_ from some model _m_. It then decides _how_ to | |
| compare them (the default is to use decisions _dec_ | |
| collected together by _project2Slots_ - see above). | |
| Next, _dist_ sums the squares of the differences | |
| between each part of the example. Finally, _dist_ | |
| returns the square root of that sum, normalized to | |
| the range zero to one. | |
| """ | |
| def dist(m,i,j, how=lambda x: x.dec): | |
| "Euclidean distance 0 <= d <= 1 between decisions" | |
| d1,d2 = how(i), how(j) | |
| deltas, n = 0, 0 | |
| for d,x in enumerate(d1): | |
| y = d2[d] | |
| v1 = normalize(m, d, x) | |
| v2 = normalize(m, d, y) | |
| w = weight(m,d) | |
| deltas,n = squaredDifference(m,v1,v2,w,deltas,n) | |
| return deltas**0.5 / (n+0.0001)**0.5 | |
| """ | |
| (Aside: the last line adds in 0.0001 to avoid any | |
| divide-by-zero errors.) | |
| ### Normalize | |
| When computing distances, it is standard practice to | |
| _normalize_ all numeric values to the range zero to one, | |
| min to max. To do that, CHUNK uses the _lohi_ | |
| function described above. | |
| """ | |
| def normalize(m,x,value) : | |
| if not The.normalize : return value | |
| if value == The.missing : return value | |
| if isinstance(value,str) : return value | |
| lo, hi = lohi(m,x) | |
| return (value - lo) / (hi - lo + 0.0001) | |
| """ | |
| As seen above, _normalize_ has some special cases. | |
| Firstly, if some global flag has disabled | |
| normalization, we just return the unaltered value. | |
| Similarly, if we are trying to normalize some | |
| missing value or some non-numeric value, we also | |
| return the unaltered value. | |
| ### SquaredDifference | |
| The _dist_ function needs to compute the square of | |
| the differences between two values _v1,v2_ from some | |
| model _m_. It is assumed that each value is | |
| weighted; i.e. it can add up to some amount _most_ | |
| to the distance measure. This is done using the | |
| _squaredDifference_ function- which uses some | |
| distance heuristic first proposed by David Aha | |
| \cite{aha91}. For example, if in doubt, assume the | |
| maximum distance. Such doubts arise if (e.g.) we are | |
| comparing missing values. | |
| """ | |
| def squaredDifference(m,v1,v2,most,sum=0,n=0): | |
| def furthestFromV1() : | |
| return 0 if v1 > 0.5 else 1 | |
| if not v1 == v2 == The.missing: | |
| if v1 == The.missing: | |
| v1,v2 = v2,v1 # at the very least, v1 is known | |
| if isinstance(v1,str) and isinstance(v2,str): | |
| if v2 == The.missing or v1 != v2 : | |
| inc = 1 | |
| else: | |
| if v2 == The.missing: v2 = furthestFromV1() | |
| inc = (v1 - v2)**2 | |
| return (sum + most*inc, # sum of incs, so far | |
| n + most) # sum of max incs, so far | |
| """ | |
| Note that the _squaredDifference_ function knows how | |
| to handle numerics as well as non-numerics | |
| differently to numerics. Two non-numerics have zero | |
| distance if they are the same (and distance equal to | |
| max, otherwise). | |
| ## Dividing the Data | |
| ### FastDiv | |
| With the above machinery, we can very quickly | |
| recursively divide some training data in half using | |
| _fastdiv_. This function finds the distance _c_ | |
| between two distance items _west,east_. All data has | |
| some distance _a,b_ to _west,east_. Using _a,b,c_, | |
| the _fastdiv_ function uses the cosine rule to sort | |
| the data along where it falls on a line running from | |
| _west_ to _east_. This function then returns the | |
| data, divided on the median value. | |
| """ | |
| def fastdiv(m,data,details, how): | |
| "Divide data at median of two distant items." | |
| west, east = twoDistantPoints(m,data,how) | |
| c = dist(m, west, east, how) | |
| for i in data: | |
| a = dist(m,i, west, how) | |
| b = dist(m,i, east, how) | |
| i.x = (a*a + c*c - b*b)/(2*c) # cosine rule | |
| data = sorted(data,key=lambda i: i.x) | |
| n = len(data)/2 | |
| details.also(west=west, east=east, c=c, cut=data[n].x) | |
| return data[:n], data[n:] | |
| """ | |
| ### TwoDistantPoints | |
| CHUNK is fast since it uses the linear-time | |
| "FastMap" heuristic \cite{fal95} to find the | |
| _twoDistantPoints_. This heuristic starts by | |
| picking any item at random. Next, it finds the | |
| furthest item from the first pick. Finally, it finds | |
| the furthest item from the second item. Note that | |
| this is fast since this | |
| requires only one scans of the data for each pick. | |
| While the two found items may not be the most | |
| distant points, they are far enough away to guide | |
| data division. | |
| """ | |
| def twoDistantPoints(m,data,how): | |
| def furthest(i): | |
| out,d= i,0 | |
| for j in data: | |
| tmp = dist(m,i,j,how) | |
| if tmp > d: out,d = j,tmp | |
| return out | |
| one = any(data) # 1) pick any thing | |
| west = furthest(one) # 2) far from thing | |
| east = furthest(west) # 3) far from west | |
| return west,east | |
| """ | |
| While _twoDistantPoints_ looks | |
| simple, it actually offers a profound summation of | |
| important aspects of a data set. According to John | |
| Platt, this FastMap heuristic belongs to a class of | |
| algorithms that find approximations to the | |
| eigenvectors of a data set \cite{platt05}. Spectral | |
| learners \cite{kamvar03} use these eigenvectors to | |
| reason along the most important dimensions in a data | |
| set. | |
| ### Settings | |
| By applying _fastdiv_ recursively, we can build a | |
| binary tree whose leaves contain similar | |
| examples. That tree generation is controlled by the | |
| following _settings_. By default, we will stop when | |
| any leaf has less than _minSize=10_ or if we have | |
| recursed more that _depthMax=10_ items. Also, just | |
| to make sure we get at least a few branches in the | |
| tree, we will recurse at least _minSize=2_ times. | |
| Further, when we recurse, if _verbose=True_, we will | |
| trace the traversely by printing one _b4_ string for | |
| each level of the recursion. Finally, when computing | |
| distances, this code uses _how=x.dec_; i.e. the | |
| decisions of each item in the data. | |
| """ | |
| def settings(**has): | |
| "Return control settings for recursive descent." | |
| return Slots(minSize = 10, # min leaf size | |
| depthMin= 2, # no pruning till depthMin | |
| depthMax= 10, # max tree depth | |
| b4 = '|.. ', # indent string | |
| verbose = False, # show trace info? | |
| how= lambda x:x.dec # how to measure distance | |
| ).override(has) | |
| """ | |
| ### Chunk (main function) | |
| Finally, we arrive at the main _chunk_ function. | |
| This function uses the above _settings_ to build a | |
| tree that recursively divides the data. The _chunk_ | |
| function holds those _settings_ in its _slots_ | |
| variable (which is set on the first line of the | |
| function, if it is not already known). Local | |
| functions within _chunk_ use these _slots_ to | |
| control how the tree is built. | |
| """ | |
| def chunk(m,data,slots=None, lvl=0,up=None): | |
| "Return a tree of split data." | |
| slots = slots or settings() | |
| def tooFew() : | |
| return len(data) < slots.minSize | |
| def tooDeep(): | |
| return lvl > slots.depthMax | |
| def show(suffix): | |
| if slots.verbose: | |
| print slots.b4*lvl + str(len(data)) + suffix | |
| tree= Slots(_up=up,value=None,_left=None,_right=None) | |
| if tooDeep() or tooFew(): | |
| show(".") | |
| tree.value = data | |
| else: | |
| show("") | |
| wests,easts = fastdiv(m, data, tree, slots.how) | |
| if not worse(wests, easts, tree) : | |
| tree._left = chunk(m, wests, slots, lvl+1, tree) | |
| if not worse(easts, wests, tree) : | |
| tree._right = chunk(m, easts, slots, lvl+1, tree) | |
| return tree | |
| def worse(down1,down2,here): return False | |
| """ | |
| Note some subtleties in the above code. Firstly, | |
| since we use _fastdiv_, our _chunk_ function is a | |
| very fast method to divide data. | |
| Secondly, the function _worse_ is a hook for any | |
| clever pruning you might want to add to this process | |
| (this _chunk_ function only recurses on subtrees | |
| that are not _worse_). While we do not use _worse_ | |
| here, this function could be used to prune sub-trees | |
| that fail some test; e.g. that do not reduce the | |
| variance of the current tree. | |
| Thirdly, _chunk_ returns a tree of _Slots_ where | |
| each node contains pointers to its _\_left_ and | |
| _\_right_ kids as well as a pointer _\_up_ to the | |
| parent node (which, for the root node, points to | |
| _None_). Note that all leaves of this tree have | |
| empty child pointers. Such childless leaves hold | |
| the items that fall into that leaf in the _value_ | |
| field. | |
| ## Support Utilities | |
| ### Some Standard Tricks | |
| """ | |
| import sys,math,random | |
| sys.dont_write_bytecode = True # disable writing .pyc files | |
| seed = random.seed # convenient shorthand | |
| any = random.choice # another convenient shorthand | |
| def say(x): | |
| "Output a string, no trailing new line." | |
| sys.stdout.write(x) | |
| def showd(d): | |
| """Catch key values to string, sorted on keys. | |
| Ignore hard to read items (marked with '_').""" | |
| return ' '.join([':%s %s' % (k,v) | |
| for k,v in | |
| sorted(d.items()) | |
| if not "_" in k]) | |
| """ | |
| _Slots_ is based on a Peter Norvig trick from | |
| http://norvig.com/python-iaq.html. When all you want | |
| to do is create an object that holds data in several | |
| fields, the following will do. For an example of | |
| using _Slots_, see _settings_ (above). | |
| """ | |
| class Slots(): | |
| "Place to read/write named slots." | |
| id = -1 | |
| def __init__(i,**d) : | |
| i.id = Slots.id = Slots.id + 1 | |
| i.override(d) | |
| def override(i,d): i.__dict__.update(d); return i | |
| def also(i, **d) : i.override(d) | |
| def __eq__(i,j) : return i.id == j.id | |
| def __ne__(i,j) : return i.id != j.id | |
| def __repr__(i) : return '{' + showd(i.__dict__) + '}' | |
| """ | |
| Note that _Slots_ can pretty print themselves using | |
| the _showd_ function (shown above). Also, since our | |
| _Slots_ have a unique _id_, then we can quickly test | |
| for equality and inequality. | |
| ### Tree Iterators | |
| To simplify the processing of trees, we define some | |
| iterators to return all _nodes_ or just the _leafs_ | |
| of the tree. | |
| """ | |
| def nodes(t,lvl=0): | |
| "Iterator. Return all nodes." | |
| if t: | |
| yield lvl,t | |
| for t1 in [t._left,t._right]: | |
| for lvl1,leaf in nodes(t1,lvl+1): | |
| yield lvl1,leaf | |
| def leafs(t): | |
| "Iterator: returns all leaf nodes." | |
| for lvl,node in nodes(t): | |
| if not node._left and not node._right: | |
| yield lvl,node | |
| """ | |
| ### Pretty Printing | |
| The _ditto_ function marks repeated entries in a column | |
| with a "_._". | |
| """ | |
| def ditto(lst,old,mark="."): | |
| """Show 'mark' if an item of lst is same as old. | |
| As a side-effect, update cache of 'old' values.""" | |
| out = [] | |
| for i,now in enumerate(lst): | |
| before = old.get(i,None) # get old it if exists | |
| out += [mark if before == now else now] | |
| old[i] = now # next time, 'now' is the 'old' value | |
| return out # the lst with ditto marks inserted | |
| """ | |
| Once we "ditto" a list of lists, we have to lay it | |
| out and pretty print it for the users. | |
| """ | |
| def align(lsts): | |
| "Print, filled to max width of each column." | |
| widths = {} | |
| for lst in lsts: # pass1- find column max widths | |
| for n,x in enumerate(lst): | |
| w = len('%s' % x) | |
| widths[n] = max(widths.get(n,0),w) | |
| for lst in lsts: # pass2- print to max width | |
| for n,x in enumerate(lst): | |
| say(('%s' % x).rjust(widths[n],' ')) | |
| print "" | |
| print "" | |
| """ | |
| # Putting it all Together | |
| The following code is an example of how to use | |
| _chunk_. | |
| ## _nasa93 | |
| The following code initializes some globals then | |
| builds a tree of clusters from _nasa93_ using | |
| _chunk_. Then, using the _leafs_ iterator, this | |
| code traverses the leaf clusters of the tree to | |
| pretty prints the clusters in aligned columns (using | |
| ditto marks). | |
| """ | |
| The = Slots(normalize=True,missing="?") | |
| def _chunkDemo(model=nasa93): | |
| seed(1) | |
| data = project2Slots( model() ) | |
| options= settings(verbose = True, | |
| minSize = len(data)**0.5) | |
| tree = chunk(model, data ,options) | |
| eg,cid = 0,0 | |
| for lvl,leaf in leafs(tree): | |
| context = leaf.value | |
| cid += 1 | |
| print "----| cluster",cid,"|","-"*35 | |
| lines = [] | |
| dittos = {} | |
| for row in sorted(context,key=lambda x:x.dec[0]): | |
| eg += 1 | |
| pre = ["project ", eg,": "] | |
| params = ditto(row.dec,dittos) | |
| params[0] = str(params[0]) + " " | |
| params[-1] = " " + str(params[-1]) | |
| lines += [pre + params + [" = "] + row.obj] | |
| align(lines) | |
| _chunkDemo() | |
| """ | |
| Note that we override the default _settings_ to | |
| offer control values that make more sense for small | |
| data sets like _nasa93_ (see the _minSize_ setting). | |
| To change the default load behavior, change the last line | |
| of _chnunk.py_ (which is currently _\_chunkDemo()_). | |
| """ |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: we've getting some results on recursively clustering binary attributes suggesting that another polar co-ordinate system might be better for that class of problem. We are preparing a paper on that: ping me if interested.