{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Data preparation\n", | |
"\n", | |
"\n", | |
"## File list\n", | |
"The directory structure of the actual annotations is quite odd.\n", | |
"Below some path magics to extract the filenames for both: images and xml annotations." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pathlib import Path, PosixPath" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import itertools" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# set the path to where the annotations are" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"annotations = '/home/epinux/annotate2/'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"jpeg_files = [str(i) for i in itertools.chain.from_iterable(\n", | |
" [list(i.glob('*.jpg')) for i in itertools.chain.from_iterable(\n", | |
" [sorted(i.glob('*')) for i in sorted(Path(annotations).glob('*'))])])]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"len(jpeg_files)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"xml_files = [str(i) for i in itertools.chain.from_iterable(\n", | |
" [list(i.glob('*.xml')) for i in itertools.chain.from_iterable(\n", | |
" [sorted(i.glob('*')) for i in sorted(Path(annotations).glob('*'))])])]" | |
] | |
}, | |
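{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"As a possible simplification (a sketch, not part of the original workflow): assuming every image and `xml` file of interest lives somewhere below the annotation root, `Path.rglob` collects the same lists with less nesting." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# hypothetical equivalents of the nested comprehensions above,\n", | |
"# assuming all relevant *.jpg / *.xml files sit below the annotation root\n", | |
"jpeg_files_alt = sorted(str(p) for p in Path(annotations).rglob('*.jpg'))\n", | |
"xml_files_alt = sorted(str(p) for p in Path(annotations).rglob('*.xml'))\n", | |
"len(jpeg_files_alt), len(xml_files_alt)" | |
] | |
}, | |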
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Subsampling\n", | |
"\n", | |
"Reduce the `xml` annotation files with a reduce factor $R_f$\n", | |
"Starting value:\n", | |
"\n", | |
"$$\n", | |
"R_f=0.125 \\quad \\text{which will reduce the amount of files to 12.5% of the total}\n", | |
"$$" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"reduce_factor = 0.125\n", | |
"np.random.seed(0)\n", | |
"msk = np.random.rand(len(xml_files), ) < reduce_factor\n", | |
"sample = list(np.array(xml_files)[msk])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"len(sample)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"sample[:5]" | |
] | |
}, | |
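{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note that the random mask above only keeps *approximately* a fraction $R_f$ of the files. A minimal sketch of an exact-size subsample (not used in the rest of the notebook):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# exact-size alternative: draw round(R_f * N) files without replacement\n", | |
"n_keep = int(round(reduce_factor * len(xml_files)))\n", | |
"exact_sample = list(np.random.choice(xml_files, size=n_keep, replace=False))\n", | |
"len(exact_sample)" | |
] | |
}, | |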
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Annotation Parsing\n", | |
"\n", | |
"Parsing each `xml` file and store the resutls as `pandas.Dataframe`\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import glob\n", | |
"import pandas as pd\n", | |
"import xml.etree.ElementTree as ET" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def xml_to_csv(xml_files):\n", | |
" xml_list = []\n", | |
" for xml_file in xml_files:\n", | |
" tree = ET.parse(xml_file)\n", | |
" root = tree.getroot()\n", | |
" for member in root.findall('object'):\n", | |
" value = (root.find('filename').text,\n", | |
" int(root.find('size')[0].text),\n", | |
" int(root.find('size')[1].text),\n", | |
" member[0].text,\n", | |
" int(member[4][0].text),\n", | |
" int(member[4][1].text),\n", | |
" int(member[4][2].text),\n", | |
" int(member[4][3].text)\n", | |
" )\n", | |
" xml_list.append(value)\n", | |
" column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']\n", | |
" xml_df = pd.DataFrame(xml_list, columns=column_name)\n", | |
" return xml_df" | |
] | |
}, | |
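{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The positional indexing in `xml_to_csv` (`root.find('size')[0]`, `member[4][0]`, ...) assumes a fixed element order in the annotation files. A hedged sketch of a tag-name-based variant, assuming Pascal-VOC-style tag names (`name`, `bndbox`, `xmin`, ...); it is not used below:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# tag-name-based variant of xml_to_csv (same output columns),\n", | |
"# assuming VOC-style tag names rather than a fixed element order\n", | |
"def xml_to_csv_by_name(xml_files):\n", | |
"    rows = []\n", | |
"    for xml_file in xml_files:\n", | |
"        root = ET.parse(xml_file).getroot()\n", | |
"        size = root.find('size')\n", | |
"        for obj in root.findall('object'):\n", | |
"            bbox = obj.find('bndbox')\n", | |
"            rows.append((root.find('filename').text,\n", | |
"                         int(size.find('width').text),\n", | |
"                         int(size.find('height').text),\n", | |
"                         obj.find('name').text,\n", | |
"                         int(bbox.find('xmin').text),\n", | |
"                         int(bbox.find('ymin').text),\n", | |
"                         int(bbox.find('xmax').text),\n", | |
"                         int(bbox.find('ymax').text)))\n", | |
"    return pd.DataFrame(rows, columns=['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'])" | |
] | |
}, | |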
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"records = xml_to_csv(sample)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Get an idea of which labels are in all the annotations" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"list(records['class'].unique())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"records.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"records.describe()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Simple statistic description of the sample" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"plt.style.use('ggplot')\n", | |
"\n", | |
"plt.figure(figsize=(20,10))\n", | |
"records['class'].value_counts().plot(kind='bar')\n", | |
"plt.tight_layout()\n", | |
"plt.show()\n", | |
"plt.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# import holoviews as hv\n", | |
"# hv.extension('bokeh')\n", | |
"# bars = hv.Bars(records['class'].value_counts(), hv.Dimension('index'), 'class').options(width=900, height=500, xrotation= 38)\n", | |
"# bars" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Extract only the `sand dollars` annotations" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"records = records[records['class']==\"sand dollar\"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Train & Test\n", | |
"\n", | |
"Split the sampling in training ($70\\%$) and testing ($30\\%$) dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"msk = np.random.rand(len(records)) < 0.7\n", | |
"train = records[msk]\n", | |
"test = records[~msk]" | |
] | |
}, | |
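{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The random mask gives an approximate 70/30 split. A commented-out sketch of an exact split, assuming `scikit-learn` is available (not used below):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# exact 70/30 split, assuming scikit-learn is installed\n", | |
"# from sklearn.model_selection import train_test_split\n", | |
"# train, test = train_test_split(records, test_size=0.3, random_state=0)" | |
] | |
}, | |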
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"train.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"test.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"test.to_csv('test.csv', index=False)\n", | |
"train.to_csv('train.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## TensorFlow records\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# for converting the csv/pandas dataframe into TFRecord format\n", | |
"# https://stackoverflow.com/questions/41402332/tensorflow-create-a-tfrecords-file-from-csv\n", | |
"\n", | |
"import pandas as pd\n", | |
"import tensorflow as tf\n", | |
"import numpy as np\n", | |
"\n", | |
"import warnings\n", | |
"warnings.filterwarnings('ignore')\n", | |
"\n", | |
"# The function takes in the all of the features of a single annotation instance as a list, and then also the label as its own variable\n", | |
"# it creates a TFRecord (see below cell for how the format looks, similar to XML)\n", | |
"\n", | |
"def create_tf_example(features, label):\n", | |
"\n", | |
" tf_example = tf.train.Example(features=tf.train.Features(feature={\n", | |
" 'filename': tf.train.Feature(bytes_list=tf.train.BytesList(value=[features[0].encode('utf-8')])),\n", | |
" 'width':tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[1])])),\n", | |
" 'height':tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[2])])),\n", | |
" 'class':tf.train.Feature(bytes_list=tf.train.BytesList(value=[label.encode('utf-8')])),\n", | |
" 'xmin':tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[3])])),\n", | |
" 'ymin':tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[4])])),\n", | |
" 'xmax':tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[5])])),\n", | |
" 'ymax':tf.train.Feature(int64_list=tf.train.Int64List(value=[int(features[6])])),\n", | |
" }))\n", | |
" return tf_example\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Loops through all of the rows in the pandas dataframe and individually converts each annotation instance into the TFRecord format\n", | |
"# note that this loop breaks after a single annotation just to provide an example. Remove the break when actually using!\n", | |
"# Saves/writes the output in root folder\n", | |
"with tf.python_io.TFRecordWriter(\"dataset.tfrecords\") as writer:\n", | |
" for index, row in train.iterrows():\n", | |
" features = np.array(list(row[0:3].values) + list(row[4:].values)) \n", | |
" # All of the features in the row, minus the class label\n", | |
" label = row[3] \n", | |
" # just the class label\n", | |
" example = create_tf_example(features, label) \n", | |
" # creates a TFRecord\n", | |
" writer.write(example.SerializeToString())\n", | |
" # break for example, remove otherwise to run entire set\n", | |
" \n", | |
"writer.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"example" | |
] | |
}, | |
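{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"A minimal sanity-check sketch, assuming the same TF1 `tf.python_io` API used above: read the first serialized record back from `dataset.tfrecords` and decode it." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# read the first record back from disk and parse it into a tf.train.Example\n", | |
"record_iter = tf.python_io.tf_record_iterator('dataset.tfrecords')\n", | |
"first_example = tf.train.Example()\n", | |
"first_example.ParseFromString(next(record_iter))\n", | |
"print(first_example)" | |
] | |
}, | |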
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!gist test.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!gist train.csv" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!gist 01_data_preparation.ipynb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |