Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Created May 29, 2024 17:42
Show Gist options
  • Save PatWalters/547adc9884eefbb5ca35393a4fdbee47 to your computer and use it in GitHub Desktop.
Save PatWalters/547adc9884eefbb5ca35393a4fdbee47 to your computer and use it in GitHub Desktop.
A simple example showing how to generate random hexapeptides
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 145,
"id": "83eb7100",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from io import StringIO\n",
"import mols2grid\n",
"from rdkit import Chem\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 183,
"id": "92d438c5",
"metadata": {},
"outputs": [],
"source": [
"aa_table = \"\"\"Name,Abbreviation,Code,SMILES\n",
"Alanine,Ala,A,C[C@H](N)C(=O)O\n",
"Arginine,Arg,R,N=C(N)NCCC[C@H](N)C(=O)O\n",
"Asparagine,Asn,N,NC(=O)C[C@H](N)C(=O)O\n",
"Aspartic acid,Asp,D,N[C@@H](CC(=O)O)C(=O)O\n",
"Cysteine,Cys,C,N[C@@H](CS)C(=O)O\n",
"Glutamine,Gln,Q,NC(=O)CC[C@H](N)C(=O)O\n",
"Glutamic acid,Glu,E,N[C@@H](CCC(=O)O)C(=O)O\n",
"Glycine,Gly,G,NCC(=O)O\n",
"Histidine,His,H,N[C@@H](Cc1c[nH]cn1)C(=O)O\n",
"Isoleucine,Ile,I,CC[C@H](C)[C@H](N)C(=O)O\n",
"Leucine,Leu,L,CC(C)C[C@H](N)C(=O)O\n",
"Lysine,Lys,K,NCCCC[C@H](N)C(=O)O\n",
"Methionine,Met,M,CSCC[C@H](N)C(=O)O\n",
"Phenylalanine,Phe,F,N[C@@H](Cc1ccccc1)C(=O)O\n",
"Proline,Pro,P,O=C(O)[C@@H]1CCCN1\n",
"Serine,Ser,S,N[C@@H](CO)C(=O)O\n",
"Threonine,Thr,T,C[C@@H](O)[C@H](N)C(=O)O\n",
"Tryptophan,Trp,W,N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O\n",
"Tyrosine,Tyr,Y,N[C@@H](Cc1ccc(O)cc1)C(=O)O\n",
"Valine,Val,V,CC(C)[C@H](N)C(=O)O\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 184,
"id": "c172deb3",
"metadata": {},
"outputs": [],
"source": [
"# Parse the inline CSV text into a DataFrame; the first row supplies the column names.\n",
"aa_df = pd.read_csv(StringIO(aa_table), sep=\",\", header=0)"
]
},
{
"cell_type": "code",
"execution_count": 185,
"id": "5cb2f47b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Name</th>\n",
" <th>Abbreviation</th>\n",
" <th>Code</th>\n",
" <th>SMILES</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Alanine</td>\n",
" <td>Ala</td>\n",
" <td>A</td>\n",
" <td>C[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Arginine</td>\n",
" <td>Arg</td>\n",
" <td>R</td>\n",
" <td>N=C(N)NCCC[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Asparagine</td>\n",
" <td>Asn</td>\n",
" <td>N</td>\n",
" <td>NC(=O)C[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Aspartic acid</td>\n",
" <td>Asp</td>\n",
" <td>D</td>\n",
" <td>N[C@@H](CC(=O)O)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Cysteine</td>\n",
" <td>Cys</td>\n",
" <td>C</td>\n",
" <td>N[C@@H](CS)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Glutamine</td>\n",
" <td>Gln</td>\n",
" <td>Q</td>\n",
" <td>NC(=O)CC[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Glutamic acid</td>\n",
" <td>Glu</td>\n",
" <td>E</td>\n",
" <td>N[C@@H](CCC(=O)O)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Glycine</td>\n",
" <td>Gly</td>\n",
" <td>G</td>\n",
" <td>NCC(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Histidine</td>\n",
" <td>His</td>\n",
" <td>H</td>\n",
" <td>N[C@@H](Cc1c[nH]cn1)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Isoleucine</td>\n",
" <td>Ile</td>\n",
" <td>I</td>\n",
" <td>CC[C@H](C)[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Leucine</td>\n",
" <td>Leu</td>\n",
" <td>L</td>\n",
" <td>CC(C)C[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Lysine</td>\n",
" <td>Lys</td>\n",
" <td>K</td>\n",
" <td>NCCCC[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Methionine</td>\n",
" <td>Met</td>\n",
" <td>M</td>\n",
" <td>CSCC[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Phenylalanine</td>\n",
" <td>Phe</td>\n",
" <td>F</td>\n",
" <td>N[C@@H](Cc1ccccc1)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Proline</td>\n",
" <td>Pro</td>\n",
" <td>P</td>\n",
" <td>O=C(O)[C@@H]1CCCN1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Serine</td>\n",
" <td>Ser</td>\n",
" <td>S</td>\n",
" <td>N[C@@H](CO)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Threonine</td>\n",
" <td>Thr</td>\n",
" <td>T</td>\n",
" <td>C[C@@H](O)[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Tryptophan</td>\n",
" <td>Trp</td>\n",
" <td>W</td>\n",
" <td>N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Tyrosine</td>\n",
" <td>Tyr</td>\n",
" <td>Y</td>\n",
" <td>N[C@@H](Cc1ccc(O)cc1)C(=O)O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Valine</td>\n",
" <td>Val</td>\n",
" <td>V</td>\n",
" <td>CC(C)[C@H](N)C(=O)O</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Name Abbreviation Code SMILES\n",
"0 Alanine Ala A C[C@H](N)C(=O)O\n",
"1 Arginine Arg R N=C(N)NCCC[C@H](N)C(=O)O\n",
"2 Asparagine Asn N NC(=O)C[C@H](N)C(=O)O\n",
"3 Aspartic acid Asp D N[C@@H](CC(=O)O)C(=O)O\n",
"4 Cysteine Cys C N[C@@H](CS)C(=O)O\n",
"5 Glutamine Gln Q NC(=O)CC[C@H](N)C(=O)O\n",
"6 Glutamic acid Glu E N[C@@H](CCC(=O)O)C(=O)O\n",
"7 Glycine Gly G NCC(=O)O\n",
"8 Histidine His H N[C@@H](Cc1c[nH]cn1)C(=O)O\n",
"9 Isoleucine Ile I CC[C@H](C)[C@H](N)C(=O)O\n",
"10 Leucine Leu L CC(C)C[C@H](N)C(=O)O\n",
"11 Lysine Lys K NCCCC[C@H](N)C(=O)O\n",
"12 Methionine Met M CSCC[C@H](N)C(=O)O\n",
"13 Phenylalanine Phe F N[C@@H](Cc1ccccc1)C(=O)O\n",
"14 Proline Pro P O=C(O)[C@@H]1CCCN1\n",
"15 Serine Ser S N[C@@H](CO)C(=O)O\n",
"16 Threonine Thr T C[C@@H](O)[C@H](N)C(=O)O\n",
"17 Tryptophan Trp W N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O\n",
"18 Tyrosine Tyr Y N[C@@H](Cc1ccc(O)cc1)C(=O)O\n",
"19 Valine Val V CC(C)[C@H](N)C(=O)O"
]
},
"execution_count": 185,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"aa_df"
]
},
{
"cell_type": "code",
"execution_count": 193,
"id": "b9ab6306",
"metadata": {},
"outputs": [],
"source": [
"# Hexapeptide backbone template: six NC(X)C(=O) units followed by the terminal O.\n",
"backbone = \"NC(X)C(=O)\" * 6 + \"O\""
]
},
{
"cell_type": "code",
"execution_count": 194,
"id": "821c45b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len([x for x in backbone if x == \"X\"])"
]
},
{
"cell_type": "code",
"execution_count": 195,
"id": "8b8adefa",
"metadata": {},
"outputs": [],
"source": [
"peptide = Chem.MolFromFASTA(\"\".join(aa_df.Code.sample(6).values))"
]
},
{
"cell_type": "code",
"execution_count": 196,
"id": "82d2ac6d",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<rdkit.Chem.rdchem.Mol at 0x131a7ca50>"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"peptide"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "04ac974c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CC[C@H](C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@@H](N)CCSC)C(=O)N[C@H](C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](C)C(=O)O)[C@@H](C)O\n",
"CC(C)[C@H](NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](CS)NC(=O)[C@H](C)N)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O\n",
"C[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CCC(=O)O)C(=O)NCC(=O)N1CCC[C@H]1C(=O)O\n",
"CC[C@H](C)[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O\n",
"C[C@@H](O)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@@H](N)CC(N)=O)C(=O)N[C@@H](CS)C(=O)N[C@@H](CC(=O)O)C(=O)O\n",
"CC[C@H](C)[C@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CO)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)N[C@@H](C)C(=O)O\n",
"CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)CNC(=O)[C@H](CC(=O)O)NC(=O)[C@@H](N)CCCNC(=N)N)[C@@H](C)O)C(C)C)C(=O)O\n",
"CSCC[C@H](NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](N)CO)C(=O)O\n",
"CC(C)[C@H](NC(=O)[C@H](CO)NC(=O)[C@@H](N)CCCCN)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CC(=O)O)C(=O)O\n",
"C[C@@H](O)[C@H](N)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@@H](CCCCN)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](CCC(N)=O)C(=O)O\n"
]
}
],
"source": [
"# Print the SMILES of ten random hexapeptides.\n",
"# Residues are drawn WITH replacement so a peptide may repeat an amino acid;\n",
"# sampling without replacement can only yield six distinct residues.\n",
"for i in range(10):\n",
"    sequence = \"\".join(aa_df.Code.sample(6, replace=True))\n",
"    peptide = Chem.MolFromFASTA(sequence)\n",
"    print(Chem.MolToSmiles(peptide))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd4ce464",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment