Skip to content

Instantly share code, notes, and snippets.

@olgabot
Created October 18, 2017 20:34
Show Gist options
  • Save olgabot/7433f8ad938ba70f6c9fbd36ec4d0c66 to your computer and use it in GitHub Desktop.
Save olgabot/7433f8ad938ba70f6c9fbd36ec4d0c66 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Icon? i5_05-i7_06.csv i5_10-i7_11.csv i5_15-i7_16.csv i5_20-i7_01.csv\r\n",
"i5_01-i7_02.csv i5_06-i7_07.csv i5_11-i7_12.csv i5_16-i7_17.csv\r\n",
"i5_02-i7_03.csv i5_07-i7_08.csv i5_12-i7_13.csv i5_17-i7_18.csv\r\n",
"i5_03-i7_04.csv i5_08-i7_09.csv i5_13-i7_14.csv i5_18-i7_19.csv\r\n",
"i5_04-i7_05.csv i5_09-i7_10.csv i5_14-i7_15.csv i5_19-i7_20.csv\r\n"
]
}
],
"source": [
"input_folder = '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates'\n",
"! ls $input_folder"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_01-i7_02.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_02-i7_03.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_03-i7_04.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_04-i7_05.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_05-i7_06.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_06-i7_07.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_07-i7_08.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_08-i7_09.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_09-i7_10.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_10-i7_11.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_11-i7_12.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_12-i7_13.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_13-i7_14.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_14-i7_15.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_15-i7_16.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_16-i7_17.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_17-i7_18.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_18-i7_19.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_19-i7_20.csv',\n",
" '/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/i5_20-i7_01.csv']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"csvs = glob.glob(os.path.join(input_folder, 'i5*csv'))\n",
"csvs"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"20\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(384, 5)\n",
"(7680, 5)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sample_ID</th>\n",
" <th>Sample_Name</th>\n",
" <th>index_name</th>\n",
" <th>index</th>\n",
" <th>index2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>M2-MAA000377-3_9_M-1</td>\n",
" <td>M2-MAA000377-3_9_M-1</td>\n",
" <td>CZB-NXT-i7-00385-CZB-NXT-i5-00001</td>\n",
" <td>CCACACAAGAGA</td>\n",
" <td>TCAATGACTAAA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>M10-MAA000377-3_9_M-1</td>\n",
" <td>M10-MAA000377-3_9_M-1</td>\n",
" <td>CZB-NXT-i7-00386-CZB-NXT-i5-00002</td>\n",
" <td>AGCCAATGTGGG</td>\n",
" <td>GGTGATAGACGC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>M15-MAA000377-3_9_M-1</td>\n",
" <td>M15-MAA000377-3_9_M-1</td>\n",
" <td>CZB-NXT-i7-00387-CZB-NXT-i5-00003</td>\n",
" <td>GTCAGATACCAC</td>\n",
" <td>CTCTTGTATCTT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>M16-MAA000377-3_9_M-1</td>\n",
" <td>M16-MAA000377-3_9_M-1</td>\n",
" <td>CZB-NXT-i7-00388-CZB-NXT-i5-00004</td>\n",
" <td>TTAGAGGGATGG</td>\n",
" <td>CAAATAACAGCA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>N4-MAA000377-3_9_M-1</td>\n",
" <td>N4-MAA000377-3_9_M-1</td>\n",
" <td>CZB-NXT-i7-00389-CZB-NXT-i5-00005</td>\n",
" <td>ATAAGCCTTCTG</td>\n",
" <td>ATGTGTATCCTC</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Sample_ID Sample_Name \\\n",
"0 M2-MAA000377-3_9_M-1 M2-MAA000377-3_9_M-1 \n",
"1 M10-MAA000377-3_9_M-1 M10-MAA000377-3_9_M-1 \n",
"2 M15-MAA000377-3_9_M-1 M15-MAA000377-3_9_M-1 \n",
"3 M16-MAA000377-3_9_M-1 M16-MAA000377-3_9_M-1 \n",
"4 N4-MAA000377-3_9_M-1 N4-MAA000377-3_9_M-1 \n",
"\n",
" index_name index index2 \n",
"0 CZB-NXT-i7-00385-CZB-NXT-i5-00001 CCACACAAGAGA TCAATGACTAAA \n",
"1 CZB-NXT-i7-00386-CZB-NXT-i5-00002 AGCCAATGTGGG GGTGATAGACGC \n",
"2 CZB-NXT-i7-00387-CZB-NXT-i5-00003 GTCAGATACCAC CTCTTGTATCTT \n",
"3 CZB-NXT-i7-00388-CZB-NXT-i5-00004 TTAGAGGGATGG CAAATAACAGCA \n",
"4 CZB-NXT-i7-00389-CZB-NXT-i5-00005 ATAAGCCTTCTG ATGTGTATCCTC "
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(len(csvs))\n",
"\n",
"dfs = []\n",
"\n",
"for csv in sorted(csvs):\n",
"# ! tail $csv\n",
" df = pd.read_excel(csv, skiprows=1)\n",
" print(df.shape)\n",
"# print(df.head())\n",
" dfs.append(df)\n",
"\n",
"combined = pd.concat(dfs)\n",
"print(combined.shape)\n",
"combined.head()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/Users/olgabot/googledrive/MACA/MACA_Optimization_Experiments/NonCombo_IndexPrimers/sample_sheets_novaseq2/index_templates/combined.csv\n"
]
}
],
"source": [
"csv = os.path.join(input_folder, 'combined.csv')\n",
"print(csv)\n",
"\n",
"# index=False to not write the row numbers on the left\n",
"combined.to_csv(csv, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add the `[Data],,,,` header and combine into a separate file"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"! echo '[Data],,,,' | cat - $input_folder/combined.csv > $input_folder/combined_with_header.csv"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Data],,,,\r\n",
"Sample_ID,Sample_Name,index_name,index,index2\r\n",
"M2-MAA000377-3_9_M-1,M2-MAA000377-3_9_M-1,CZB-NXT-i7-00385-CZB-NXT-i5-00001,CCACACAAGAGA,TCAATGACTAAA\r\n",
"M10-MAA000377-3_9_M-1,M10-MAA000377-3_9_M-1,CZB-NXT-i7-00386-CZB-NXT-i5-00002,AGCCAATGTGGG,GGTGATAGACGC\r\n",
"M15-MAA000377-3_9_M-1,M15-MAA000377-3_9_M-1,CZB-NXT-i7-00387-CZB-NXT-i5-00003,GTCAGATACCAC,CTCTTGTATCTT\r\n",
"M16-MAA000377-3_9_M-1,M16-MAA000377-3_9_M-1,CZB-NXT-i7-00388-CZB-NXT-i5-00004,TTAGAGGGATGG,CAAATAACAGCA\r\n",
"N4-MAA000377-3_9_M-1,N4-MAA000377-3_9_M-1,CZB-NXT-i7-00389-CZB-NXT-i5-00005,ATAAGCCTTCTG,ATGTGTATCCTC\r\n",
"N7-MAA000377-3_9_M-1,N7-MAA000377-3_9_M-1,CZB-NXT-i7-00390-CZB-NXT-i5-00006,GACCTCACGCCT,TTCCGAGTCATA\r\n",
"N11-MAA000377-3_9_M-1,N11-MAA000377-3_9_M-1,CZB-NXT-i7-00391-CZB-NXT-i5-00007,TGCTGGTGGCTA,TAGAGCGGGCGG\r\n",
"N15-MAA000377-3_9_M-1,N15-MAA000377-3_9_M-1,CZB-NXT-i7-00392-CZB-NXT-i5-00008,GAACACCGAAGA,GGAACCGCAGCG\r\n"
]
}
],
"source": [
"! head $input_folder/combined_with_header.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment