Created
May 11, 2020 11:07
-
-
Save pansapiens/0b909afaae0f26610281f315053bd55d to your computer and use it in GitHub Desktop.
k-mer counts as input feature vectors in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# k-mer count tables as input vectors\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from typing import Sequence\n", | |
"import collections\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Example k-mer length; the kmer_count calls below pass k explicitly.\n", | |
"k = 3\n", | |
"# Toy input sequence used to demonstrate k-mer counting.\n", | |
"seq = \"AAABBBXXXYXXCCCXXXAAAXXXBCDEFXXYXXY\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({'XXX': 3, 'AAA': 2, 'XXY': 2, 'XYX': 2, 'YXX': 2, 'AAB': 1, 'ABB': 1, 'BBB': 1, 'BBX': 1, 'BXX': 1, 'XXC': 1, 'XCC': 1, 'CCC': 1, 'CCX': 1, 'CXX': 1, 'XXA': 1, 'XAA': 1, 'AAX': 1, 'AXX': 1, 'XXB': 1, 'XBC': 1, 'BCD': 1, 'CDE': 1, 'DEF': 1, 'EFX': 1, 'FXX': 1})\n", | |
"Counter({'XXYX': 2, 'XYXX': 2, 'AAAB': 1, 'AABB': 1, 'ABBB': 1, 'BBBX': 1, 'BBXX': 1, 'BXXX': 1, 'XXXY': 1, 'YXXC': 1, 'XXCC': 1, 'XCCC': 1, 'CCCX': 1, 'CCXX': 1, 'CXXX': 1, 'XXXA': 1, 'XXAA': 1, 'XAAA': 1, 'AAAX': 1, 'AAXX': 1, 'AXXX': 1, 'XXXB': 1, 'XXBC': 1, 'XBCD': 1, 'BCDE': 1, 'CDEF': 1, 'DEFX': 1, 'EFXX': 1, 'FXXY': 1})\n", | |
"Counter({'XXYXX': 2, 'AAABB': 1, 'AABBB': 1, 'ABBBX': 1, 'BBBXX': 1, 'BBXXX': 1, 'BXXXY': 1, 'XXXYX': 1, 'XYXXC': 1, 'YXXCC': 1, 'XXCCC': 1, 'XCCCX': 1, 'CCCXX': 1, 'CCXXX': 1, 'CXXXA': 1, 'XXXAA': 1, 'XXAAA': 1, 'XAAAX': 1, 'AAAXX': 1, 'AAXXX': 1, 'AXXXB': 1, 'XXXBC': 1, 'XXBCD': 1, 'XBCDE': 1, 'BCDEF': 1, 'CDEFX': 1, 'DEFXX': 1, 'EFXXY': 1, 'FXXYX': 1})\n" | |
] | |
} | |
], | |
"source": [ | |
"def kmer_count(seq: Sequence, k=3) -> collections.Counter:\n", | |
" kmer_counts = collections.Counter()\n", | |
" for i in range(len(seq) - k):\n", | |
" kmer_counts.update([seq[i: i+k]])\n", | |
" \n", | |
" return kmer_counts\n", | |
"\n", | |
"print(kmer_count(seq, k=3))\n", | |
"print(kmer_count(seq, k=4))\n", | |
"print(kmer_count(seq, k=5))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Sort alphabetically by kmer\n", | |
"ktuple = sorted(list(kmer_count(seq, k=3).items()), key = lambda x: x[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>AAA</th>\n", | |
" <th>AAB</th>\n", | |
" <th>AAX</th>\n", | |
" <th>ABB</th>\n", | |
" <th>AXX</th>\n", | |
" <th>BBB</th>\n", | |
" <th>BBX</th>\n", | |
" <th>BCD</th>\n", | |
" <th>BXX</th>\n", | |
" <th>CCC</th>\n", | |
" <th>...</th>\n", | |
" <th>XAA</th>\n", | |
" <th>XBC</th>\n", | |
" <th>XCC</th>\n", | |
" <th>XXA</th>\n", | |
" <th>XXB</th>\n", | |
" <th>XXC</th>\n", | |
" <th>XXX</th>\n", | |
" <th>XXY</th>\n", | |
" <th>XYX</th>\n", | |
" <th>YXX</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>my_seq_id1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>...</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1 rows × 26 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" AAA AAB AAX ABB AXX BBB BBX BCD BXX CCC ... XAA XBC \\\n", | |
"my_seq_id1 2 1 1 1 1 1 1 1 1 1 ... 1 1 \n", | |
"\n", | |
" XCC XXA XXB XXC XXX XXY XYX YXX \n", | |
"my_seq_id1 1 1 1 1 3 2 2 2 \n", | |
"\n", | |
"[1 rows x 26 columns]" | |
] | |
}, | |
"execution_count": 51, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Convert into a DataFrame\n", | |
"\n", | |
"kmerseqs = []\n", | |
"counts = []\n", | |
"for ks, c in ktuple:\n", | |
" kmerseqs.append(ks)\n", | |
" counts.append(c)\n", | |
" \n", | |
"pd.DataFrame([counts], columns=kmerseqs, index=['my_seq_id1'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment