We profile k-mer pairwise comparison for k=[1,12] using the kMer analysis toolkit and programming library for k-mer profiles.
See the IPython notebook for more information.
| .ipynb_checkpoints | |
| *.fa | |
| [abcde].k1 | |
| [abcde].k2 | |
| [abcde].k3 | |
| [abcde].k4 | |
| [abcde].k5 | |
| [abcde].k6 | |
| [abcde].k7 | |
| [abcde].k8 | |
| [abcde].k9 | |
| [abcde].k10 | |
| [abcde].k11 | |
| [abcde].k12 | |
| [abcde].k13 | |
| [abcde].k14 | |
| [abcde].k15 |
| 0.542464971542 26120 |
| 0.663958072662 42492 |
| 0.972708940506 91608 |
| 2.24748587608 288264 |
| 7.51165699959 1074688 |
| 27.2333760262 4220276 |
| 108.990545034 16803304 |
| 0.541092157364 26120 |
| 0.547954082489 26120 |
| 0.578961849213 26120 |
| 0.578741788864 26144 |
| 0.586236000061 26196 |
| 0.567816019058 26360 |
| 0.570757865906 27144 |
| 0.593155145645 30204 |
| 5.48644304276 26120 |
| 5.7480931282 42528 |
| 6.13077187538 91608 |
| 7.46066999435 288280 |
| 13.2450871468 1074684 |
| 34.4764480591 4220288 |
| 116.609779835 16803324 |
| 5.44484806061 26128 |
| 5.53405499458 26124 |
| 5.82787203789 26128 |
| 5.84015107155 26152 |
| 5.82632112503 26292 |
| 5.79324197769 26752 |
| 5.74274420738 27332 |
| 5.71768307686 30264 |
| 54.6297440529 26116 |
| 57.8999540806 43588 |
| 58.0587861538 91868 |
| 60.5846529007 288448 |
| 72.2292790413 1074680 |
| 93.4170598984 4220460 |
| 174.824126005 16803412 |
| 54.5379779339 26120 |
| 54.4764060974 26120 |
| 57.6308250427 26124 |
| 58.1498789787 26144 |
| 57.4757890701 26264 |
| 57.5336329937 26756 |
| 57.6689498425 28424 |
| 58.0697989464 32832 |
| 0.00376987457275 27948 |
| 0.248427867889 121492 |
| 1.09074687958 397148 |
| 4.51135492325 1499740 |
| 17.9062130451 5669460 |
| 61.405629158 20096264 |
| 199.224109173 63755916 |
| 0.00374603271484 27944 |
| 0.00378704071045 27944 |
| 0.0042028427124 28036 |
| 0.00458002090454 28204 |
| 0.00510907173157 28544 |
| 0.00836801528931 29532 |
| 0.0185878276825 34008 |
| 0.0714981555939 52820 |
| 0.00386595726013 27948 |
| 0.253719091415 122176 |
| 1.1847550869 397036 |
| 4.87614607811 1499924 |
| 19.3254239559 5670424 |
| 68.5020990372 20095424 |
| 216.661634207 63754052 |
| 0.00380110740662 27936 |
| 0.00382900238037 27936 |
| 0.00426483154297 28044 |
| 0.00455498695374 28208 |
| 0.00516986846924 28472 |
| 0.00827503204346 29584 |
| 0.0180509090424 33936 |
| 0.0715341567993 52776 |
| 0.00382304191589 27940 |
| 11.709302187 122136 |
| 46.8763818741 392168 |
| 191.472046852 1367092 |
| 694.941875935 4249648 |
| 1946.50338984 13000560 |
| 5011.18195319 42178040 |
| 0.00398802757263 27940 |
| 0.00459003448486 27940 |
| 0.00705599784851 27952 |
| 0.0167629718781 28212 |
| 0.0539407730103 28568 |
| 0.206032991409 29584 |
| 0.806213140488 33940 |
| 3.21424412727 52784 |
| 0.0038058757782 27944 |
| 11.3999941349 121544 |
| 44.8743479252 392128 |
| 209.745757103 1367460 |
| 775.451279879 4249036 |
| 2038.04450011 13000368 |
| 4487.34653401 39471116 |
| 0.00396203994751 27940 |
| 0.00455594062805 27944 |
| 0.00734496116638 28048 |
| 0.0167670249939 28208 |
| 0.0530319213867 28560 |
| 0.208234071732 29528 |
| 0.775527000427 34012 |
| 3.10026812553 52828 |
| #!/usr/bin/env python | |
| import resource | |
| import sys | |
| import timeit | |
| from k_mer import klib | |
def index(filename, k):
    """
    Build a k-mer profile from a FASTA file.

    Nothing is returned; the function exists so that the work of building
    the profile can be timed.

    :arg filename: Path of the FASTA file to read.
    :arg k: Value of k passed on to `klib.Profile.from_fasta`.
    """
    fasta_handle = open(filename)
    with fasta_handle:
        _profile = klib.Profile.from_fasta(fasta_handle, k)
def profile_index(filename, k):
    """
    Measure the time needed by `index` and the memory used by this process.

    :arg filename: Path of the FASTA file handed to `index`.
    :arg k: Value of k handed to `index`.

    :return: Tuple `(usage_time, usage_memory)`: the fastest of three
      single-execution timings as reported by `timeit`, and the `ru_maxrss`
      field of `resource.getrusage` for this process.
    """
    # Hand timeit a callable rather than a formatted source string. This
    # avoids round-tripping the arguments through repr() and no longer
    # requires this file to be the __main__ module.
    repeats = timeit.repeat(lambda: index(filename, k), repeat=3, number=1)
    usage_time = min(repeats)
    # Maximum resident set size so far; measured once, after all repeats.
    # The unit is platform dependent (see the `resource` documentation).
    usage_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return usage_time, usage_memory
def main(argv):
    """
    Command line entry point: profile indexing of one sample for one k.

    Writes the measured time and memory usage, separated by a space, as a
    single line to standard output.

    :arg argv: Complete argument vector: program name, SAMPLE and K.

    Exits with status 1 after writing a usage line if the number of
    arguments is wrong.
    """
    if len(argv) != 3:
        # Diagnostics go to stderr so a caller that redirects stdout into a
        # results file does not get the usage line mixed into its data.
        sys.stderr.write('Usage: %s SAMPLE K\n' % argv[0])
        sys.exit(1)

    usage_time, usage_memory = profile_index(argv[1], int(argv[2]))
    # Produces the same "<time> <memory>" line as the Python 2 print
    # statement did, but is valid syntax on Python 3 as well.
    sys.stdout.write('%s %s\n' % (usage_time, usage_memory))


if __name__ == '__main__':
    main(sys.argv)
#!/bin/bash
# Run ./profile_count.py for k=1..15 on each of a.fa, b.fa and c.fa and store
# the output of every run in <sample>.count.k<k>.

for fasta in a.fa b.fa c.fa; do
    # Copy it to local storage; stop if no temporary file could be created,
    # otherwise every following command would run with an empty path.
    local_fasta=$(mktemp) || exit 1
    cp "$fasta" "$local_fasta"

    # Read it once, so we have no cache effects
    ./profile_count.py "$local_fasta" 4 > /dev/null

    for k in {1..15}; do
        ./profile_count.py "$local_fasta" "$k" > "${fasta%.fa}.count.k${k}"
    done

    rm "$local_fasta"
done
| #!/usr/bin/env python | |
| import resource | |
| import sys | |
| import timeit | |
| from k_mer import ProfileFileType, kdistlib, klib | |
def distance(filename_a, filename_b, **diff_args):
    """
    Load two k-mer profiles and calculate the distance between them.

    The calculated distance is not returned; the function exists so that
    the work can be timed.

    :arg filename_a: Path of the first profile file.
    :arg filename_b: Path of the second profile file.
    :arg diff_args: Keyword arguments passed on to
      `kdistlib.ProfileDistance`.

    :raise ValueError: If the `length` attributes of the two profiles
      differ.
    """
    # NOTE(review): everything is done while both files are still open;
    # confirm whether the profiles still need their handle after from_file.
    with ProfileFileType()(filename_a) as handle_a:
        with ProfileFileType()(filename_b) as handle_b:
            left = klib.Profile.from_file(handle_a)
            right = klib.Profile.from_file(handle_b)

            if left.length != right.length:
                raise ValueError('trying to compare profiles for different k')

            calculator = kdistlib.ProfileDistance(**diff_args)
            calculator.distance(left, right)
def profile_distance(filename_a, filename_b, **diff_args):
    """
    Measure the time needed by `distance` and the memory used by this
    process.

    :arg filename_a: Path of the first profile file handed to `distance`.
    :arg filename_b: Path of the second profile file handed to `distance`.
    :arg diff_args: Keyword arguments handed to `distance` unchanged.

    :return: Tuple `(usage_time, usage_memory)`: the fastest of three
      single-execution timings as reported by `timeit`, and the `ru_maxrss`
      field of `resource.getrusage` for this process.
    """
    # Hand timeit a callable rather than a formatted source string. This
    # avoids round-tripping the arguments through repr() and no longer
    # requires this file to be the __main__ module.
    repeats = timeit.repeat(
        lambda: distance(filename_a, filename_b, **diff_args),
        repeat=3, number=1)
    usage_time = min(repeats)
    # Maximum resident set size so far; measured once, after all repeats.
    # The unit is platform dependent (see the `resource` documentation).
    usage_memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    return usage_time, usage_memory
# Keyword arguments passed to profile_distance for every accepted MODE.
# Keeping them in one table means the validity check and the dispatch cannot
# drift apart.
_MODE_ARGS = {
    'default': {},
    'scale': {'do_scale': True},
    'smooth': {'do_smooth': True, 'threshold': 10},
    'scale.smooth': {'do_scale': True, 'do_smooth': True, 'threshold': 10},
}


def main(argv):
    """
    Command line entry point: profile one pairwise distance calculation.

    Writes the measured time and memory usage, separated by a space, as a
    single line to standard output.

    :arg argv: Complete argument vector: program name, PROFILE_LEFT,
      PROFILE_RIGHT and MODE, where MODE is one of `default`, `scale`,
      `smooth` or `scale.smooth`.

    Exits with status 1 after writing a message if the number of arguments
    is wrong or MODE is not recognised.
    """
    # Diagnostics go to stderr so a caller that redirects stdout into a
    # results file does not get them mixed into its data.
    if len(argv) != 4:
        sys.stderr.write('Usage: %s PROFILE_LEFT PROFILE_RIGHT MODE\n' % argv[0])
        sys.exit(1)

    if argv[3] not in _MODE_ARGS:
        sys.stderr.write('Not a valid mode: %s\n' % argv[3])
        sys.exit(1)

    usage_time, usage_memory = profile_distance(argv[1], argv[2],
                                                **_MODE_ARGS[argv[3]])
    # Produces the same "<time> <memory>" line as the Python 2 print
    # statement did, but is valid syntax on Python 3 as well.
    sys.stdout.write('%s %s\n' % (usage_time, usage_memory))


if __name__ == '__main__':
    main(sys.argv)
#!/bin/bash
# For k=1..15: create k-mer profiles of d.fa and e.fa with `kMer count` and
# run ./profile_distance.py on them in each of the four modes. The output of
# every run is stored in distance.<mode>.k<k>.

# Copy fasta files to local storage to speed things up. Stop if no temporary
# file could be created, otherwise the following commands would run with an
# empty path.
local_fasta_d=$(mktemp) || exit 1
local_fasta_e=$(mktemp) || exit 1
cp d.fa "$local_fasta_d"
cp e.fa "$local_fasta_e"

for k in {1..15}; do
    # Create profiles
    kMer count -k "$k" "d.k${k}" "$local_fasta_d"
    kMer count -k "$k" "e.k${k}" "$local_fasta_e"

    # Copy profiles to local storage
    local_d=$(mktemp) || exit 1
    local_e=$(mktemp) || exit 1
    cp "d.k${k}" "$local_d"
    cp "e.k${k}" "$local_e"

    # Read them once, so we have no cache effects
    kMer info "$local_d" > /dev/null
    kMer info "$local_e" > /dev/null

    ./profile_distance.py "$local_d" "$local_e" default > "distance.default.k${k}"
    ./profile_distance.py "$local_d" "$local_e" scale > "distance.scale.k${k}"
    ./profile_distance.py "$local_d" "$local_e" smooth > "distance.smooth.k${k}"
    ./profile_distance.py "$local_d" "$local_e" scale.smooth > "distance.scale.smooth.k${k}"

    rm "$local_d" "$local_e"
done

rm "$local_fasta_d" "$local_fasta_e"