Created
July 26, 2018 19:38
-
-
Save joinr/775aa35476f278a22183f832a318e021 to your computer and use it in GitHub Desktop.
error in incanter's delegation to core.matrix for building datasets from xls due to sparse data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns incdebug.core | |
(:require [incanter [excel :as xls] [core :as i]] | |
[clojure.core.matrix.impl.dataset :as ds] | |
[clojure.core.matrix.protocols :as mp])) | |
;;this is what incanter is doing....we'll tease it apart | |
;;to debug. | |
#_(defn- read-sheet [rows-it header-keywords] | |
(let [colnames (incanter.excel.cells/read-line-values (first rows-it)) | |
rows (->> (rest rows-it) | |
(map incanter.excel.cells/read-line-values) | |
(filter (complement empty?)))] | |
(i/dataset | |
(if header-keywords | |
(map keyword colnames) | |
colnames) | |
rows))) | |
(def path "c:\\users\\tom\\Documents\\UK2010.xls") | |
;;getting a hold of the row objects. | |
(def rows-it | |
(with-open [in-fs (clojure.java.io/input-stream path)] | |
(vec (iterator-seq (. (incanter.excel.workbook/get-workbook-sheet | |
(incanter.excel.workbook/create-workbook-object | |
path nil in-fs) 0) iterator))))) | |
;;col-names, first line of rows... | |
(def cname (incanter.excel.cells/read-line-values (first rows))) | |
;;rows, cell-line-values after the header row... | |
(def rows (->> (rest rows-it) | |
(map incanter.excel.cells/read-line-values) | |
(filter (complement empty?)))) | |
;;THis is the source for core.matrix.impl.dataset/dataset-from-rows | |
;;Note the call to mp/get-columns, that will end up being the culprit! | |
#_(defn dataset-from-rows | |
"Creates a dataset from the given rows of data, using the specified column names. | |
rows should be a sequence of data rows, where each row should be a sequence or | |
vector that has one element for each column." | |
[col-names rows] | |
(let [^IPersistentVector col-names (vec col-names) | |
cc (count col-names) | |
rc (long (mp/dimension-count rows 0)) | |
^IPersistentVector cols (if (empty? rows) | |
(vec (repeat cc [])) | |
(vec (mp/get-columns rows)))] | |
(when (not= cc (count cols)) | |
(error "Mismatched number of columns, have: " cc " column names")) | |
(DataSet. col-names cols [rc cc]))) | |
;;This is what mp/get-columns is doing, per impl.persistent_vector | |
;;for each column j in the second dimension (column) of m, | |
;;for each row i of m, | |
;;we build up a vector of column entries by traversing | |
;;the rows in m and extracting the 1-dimensional | |
;;element at j. mp/get-1d for vectors is just nth. | |
;;So, effectively build a column j from a sequence of | |
;;row vectors by mapping (nth % j) over all the row vectors. | |
;;THis is great, so long as the row vectors have entries for | |
;;EVERY value of j indicated by the first row vector..... | |
;;If we have sparse columns, they aren't populated / normalized | |
;;with nil, so we end up with vectors that may fail when | |
;;nth is called with an expected column index! | |
(defn get-columns-1 [m] | |
(vec (for [j (range (mp/dimension-count m 1))] | |
(mapv #(mp/get-1d % j) m)))) | |
;;expect to have 144 entries per row. | |
;;651 rows not including header. | |
;;column 0 has entries for every row. | |
;;incdebug.core> (mapv #(mp/get-1d % 0) rows) | |
[1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 | |
16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 24.0 25.0 26.0 27.0 28.0 29.0 | |
30.0 31.0 32.0 33.0 34.0 35.0 36.0 37.0 38.0 39.0 40.0 41.0 42.0 43.0 | |
44.0 45.0 46.0 47.0 48.0 49.0 50.0 51.0 52.0 53.0 54.0 55.0 56.0 57.0 | |
58.0 59.0 60.0 61.0 62.0 63.0 64.0 65.0 66.0 67.0 68.0 69.0 70.0 71.0 | |
72.0 73.0 74.0 75.0 76.0 77.0 78.0 79.0 80.0 81.0 82.0 83.0 84.0 85.0 | |
86.0 87.0 88.0 89.0 90.0 91.0 92.0 93.0 94.0 95.0 96.0 97.0 98.0 99.0 | |
100.0 101.0 102.0 103.0 104.0 105.0 106.0 107.0 108.0 109.0 110.0 | |
111.0 112.0 113.0 114.0 115.0 116.0 117.0 118.0 119.0 120.0 121.0 | |
122.0 123.0 124.0 125.0 126.0 127.0 128.0 129.0 130.0 131.0 132.0 | |
133.0 134.0 135.0 136.0 137.0 138.0 139.0 140.0 141.0 142.0 143.0 | |
144.0 145.0 146.0 147.0 148.0 149.0 150.0 151.0 152.0 153.0 154.0 | |
155.0 156.0 157.0 158.0 159.0 160.0 161.0 162.0 163.0 164.0 165.0 | |
166.0 167.0 168.0 169.0 170.0 171.0 172.0 173.0 174.0 175.0 176.0 | |
177.0 178.0 179.0 180.0 181.0 182.0 183.0 184.0 185.0 186.0 187.0 | |
188.0 189.0 190.0 191.0 192.0 193.0 194.0 195.0 196.0 197.0 198.0 | |
199.0 200.0 201.0 202.0 203.0 204.0 205.0 206.0 207.0 208.0 209.0 | |
210.0 211.0 212.0 213.0 214.0 215.0 216.0 217.0 218.0 219.0 220.0 | |
221.0 222.0 223.0 224.0 225.0 226.0 227.0 228.0 229.0 230.0 231.0 | |
232.0 233.0 234.0 235.0 236.0 237.0 238.0 239.0 240.0 241.0 242.0 | |
243.0 244.0 245.0 246.0 247.0 248.0 249.0 250.0 251.0 252.0 253.0 | |
254.0 255.0 256.0 257.0 258.0 259.0 260.0 261.0 262.0 263.0 264.0 | |
265.0 266.0 267.0 268.0 269.0 270.0 271.0 272.0 273.0 274.0 275.0 | |
276.0 277.0 278.0 279.0 280.0 281.0 282.0 283.0 284.0 285.0 286.0 | |
287.0 288.0 289.0 290.0 291.0 292.0 293.0 294.0 295.0 296.0 297.0 | |
298.0 299.0 300.0 301.0 302.0 303.0 304.0 305.0 306.0 307.0 308.0 | |
309.0 310.0 311.0 312.0 313.0 314.0 315.0 316.0 317.0 318.0 319.0 | |
320.0 321.0 322.0 323.0 324.0 325.0 326.0 327.0 328.0 329.0 330.0 | |
331.0 332.0 333.0 334.0 335.0 336.0 337.0 338.0 339.0 340.0 341.0 | |
342.0 343.0 344.0 345.0 346.0 347.0 348.0 349.0 350.0 351.0 352.0 | |
353.0 354.0 355.0 356.0 357.0 358.0 359.0 360.0 361.0 362.0 363.0 | |
364.0 365.0 366.0 367.0 368.0 369.0 370.0 371.0 372.0 373.0 374.0 | |
375.0 376.0 377.0 378.0 379.0 380.0 381.0 382.0 383.0 384.0 385.0 | |
386.0 387.0 388.0 389.0 390.0 391.0 392.0 393.0 394.0 395.0 396.0 | |
397.0 398.0 399.0 400.0 401.0 402.0 403.0 404.0 405.0 406.0 407.0 | |
408.0 409.0 410.0 411.0 412.0 413.0 414.0 415.0 416.0 417.0 418.0 | |
419.0 420.0 421.0 422.0 423.0 424.0 425.0 426.0 427.0 428.0 429.0 | |
430.0 431.0 432.0 433.0 434.0 435.0 436.0 437.0 438.0 439.0 440.0 | |
441.0 442.0 443.0 444.0 445.0 446.0 447.0 448.0 449.0 450.0 451.0 | |
452.0 453.0 454.0 455.0 456.0 457.0 458.0 459.0 460.0 461.0 462.0 | |
463.0 464.0 465.0 466.0 467.0 468.0 469.0 470.0 471.0 472.0 473.0 | |
474.0 475.0 476.0 477.0 478.0 479.0 480.0 481.0 482.0 483.0 484.0 | |
485.0 486.0 487.0 488.0 489.0 490.0 491.0 492.0 493.0 494.0 495.0 | |
496.0 497.0 498.0 499.0 500.0 501.0 502.0 503.0 504.0 505.0 506.0 | |
507.0 508.0 509.0 510.0 511.0 512.0 513.0 514.0 515.0 516.0 517.0 | |
518.0 519.0 520.0 521.0 522.0 523.0 524.0 525.0 526.0 527.0 528.0 | |
529.0 530.0 531.0 532.0 533.0 534.0 535.0 536.0 537.0 538.0 539.0 | |
540.0 541.0 542.0 543.0 544.0 545.0 546.0 547.0 548.0 549.0 550.0 | |
551.0 552.0 553.0 554.0 555.0 556.0 557.0 558.0 559.0 560.0 561.0 | |
562.0 563.0 564.0 565.0 566.0 567.0 568.0 569.0 570.0 571.0 572.0 | |
573.0 574.0 575.0 576.0 577.0 578.0 579.0 580.0 581.0 582.0 583.0 | |
584.0 585.0 586.0 587.0 588.0 589.0 590.0 591.0 592.0 593.0 594.0 | |
595.0 596.0 597.0 598.0 599.0 600.0 601.0 602.0 603.0 604.0 605.0 | |
606.0 607.0 608.0 609.0 610.0 611.0 612.0 613.0 614.0 615.0 616.0 | |
617.0 618.0 619.0 620.0 621.0 622.0 623.0 624.0 625.0 626.0 627.0 | |
628.0 629.0 630.0 631.0 632.0 633.0 634.0 635.0 636.0 637.0 638.0 | |
639.0 640.0 641.0 642.0 643.0 644.0 645.0 646.0 647.0 648.0 649.0 | |
650.0 2.9687604E7] | |
;;Column 1 fails to parse..... | |
;;incdebug.core> (mapv #(mp/get-1d % 1) rows) | |
IndexOutOfBoundsException clojure.lang.RT.nthFrom (RT.java:921) | |
;;incdebug.core> (mapv #(mp/get-1d % 1) rows) | |
;;so, if we collect the 1th entry of every row, we can find out which | |
;;row doesn't have an entry... | |
#_(->> (for [i (range (count rows))] [ i (count (mp/get-1d rows i))]) | |
(reduce (fn [acc [row count]] | |
(let [res (get acc count #{})] | |
(->> (conj res row) | |
(assoc acc count)))) | |
{})) | |
;;We see that row 650 only has ONE entry (the summary field mentioned before). | |
;;This is turning up as the entry for the first column, which parsed. | |
;;Every other column index beyond 0 will throw an index out of bound | |
;;exception on this row. | |
;;This violates the assumptions that core.matrix is making about the shape | |
;;of the data flowing to get-columns.... | |
#_{144 #{0 558 453 584 487 637 519 357 275 530 389 586 410 433 521 451 | |
291 443 249 638 299 121 287 65 70 218 648 62 74 475 497 580 164 282 | |
273 186 430 641 529 370 233 298 188 240 110 130 620 311 128 399 377 | |
468 259 210 229 153 621 213 343 472 7 59 473 86 613 491 154 20 224 355 | |
592 610 571 466 72 454 463 58 205 555 552 60 459 175 322 510 27 352 | |
493 416 1 631 69 101 24 547 102 385 135 397 490 354 360 55 568 269 448 | |
527 206 165 387 85 615 225 297 39 274 88 217 46 508 149 415 239 478 | |
157 345 300 4 550 204 470 646 77 106 197 405 518 232 260 267 119 319 | |
534 222 603 293 95 450 329 144 504 505 176 471 349 512 192 54 92 221 | |
141 502 464 307 290 627 517 361 264 137 356 327 234 104 353 15 48 242 | |
50 557 251 394 116 585 583 75 437 516 159 99 540 645 479 630 281 402 | |
429 309 458 21 388 495 626 31 113 32 407 398 136 139 506 396 460 483 | |
589 581 174 578 331 363 284 208 305 182 256 514 619 485 214 193 635 | |
442 561 607 241 314 226 235 420 418 262 263 304 401 40 129 600 467 445 | |
317 294 91 364 515 412 553 341 117 523 172 601 108 156 358 308 649 531 | |
223 419 365 181 417 278 56 33 13 22 618 380 257 338 500 168 496 347 | |
501 596 90 237 292 109 216 191 498 375 525 367 143 178 640 247 328 391 | |
167 36 41 474 187 551 528 599 376 195 316 428 303 368 560 565 310 366 | |
118 522 150 313 384 567 238 196 162 393 184 219 461 89 100 426 604 477 | |
541 351 243 131 629 122 43 231 61 598 413 29 151 369 348 575 44 258 | |
250 539 301 424 93 6 573 408 563 616 111 28 456 374 608 548 538 411 | |
134 64 623 465 334 323 189 280 198 155 295 248 587 285 507 227 476 494 | |
220 103 611 170 51 25 261 201 590 489 166 447 34 252 325 594 436 535 | |
146 228 306 125 276 340 148 482 622 588 17 312 606 3 520 286 279 536 | |
12 440 332 330 382 152 544 642 435 342 2 66 484 439 236 556 373 142 | |
359 371 444 570 107 532 23 230 625 47 526 180 537 158 350 35 644 127 | |
383 533 302 564 566 82 76 492 215 97 277 19 335 597 57 609 202 68 452 | |
200 11 115 339 431 462 337 255 503 546 9 632 457 427 145 5 624 244 289 | |
112 414 179 344 481 245 378 266 324 446 524 254 404 617 283 513 572 83 | |
634 138 346 14 455 265 449 333 639 569 326 647 45 53 559 78 562 542 | |
315 480 132 26 123 203 392 577 140 321 441 268 16 320 133 288 381 605 | |
163 81 120 643 79 211 38 173 126 421 593 636 98 422 423 614 582 554 | |
409 574 595 124 171 87 169 160 30 400 509 207 434 194 511 73 486 336 | |
96 10 272 499 488 386 270 576 543 271 18 395 403 469 105 185 52 545 | |
633 114 253 612 628 209 147 425 67 296 602 318 161 372 406 438 71 579 | |
42 80 199 591 37 183 432 379 63 212 94 362 8 246 190 549 177 49 390 | |
84}, | |
1 #{650}} | |
;;Note: per the c.m.m documentation for dataset-from-rows.... | |
;;"rows should be a sequence of data rows, where each row should be a sequence or | |
;;vector that has one element for each column." | |
;;So, core.matrix documents this. The data being fed to | |
;;core.matrix.impl.dataset/dataset-from-rows is violating the | |
;;assumptions! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment