Last active
April 26, 2020 08:54
-
-
Save DataTurks/0cc13f59a423a9d4f039225bf4fc8e03 to your computer and use it in GitHub Desktop.
Convert Dataturks image bounding box JSON to Pascal VOC format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import sys | |
import os | |
import json | |
import logging | |
import requests | |
from PIL import Image | |
################### INSTALLATION NOTE ####################### | |
############################################################## | |
## pip install requests | |
## pip install pillow | |
############################################################### | |
############################################################### | |
#enable info logging. | |
logging.getLogger().setLevel(logging.INFO) | |
def maybe_download(image_url, image_dir, timeout=60):
    """Download the image unless it already exists, and return its local path.

    Args:
        image_url: URL of the image. The text after the last "/" is used as
            the local file name.
        image_dir: Directory where the image is looked up and stored.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the whole conversion forever.

    Returns:
        Path of the image file inside ``image_dir``.

    Raises:
        ValueError: If the server answers with a status code other than 200.
        Exception: Any error raised while requesting or writing the image is
            logged and then re-raised unchanged.
    """
    fileName = image_url.split("/")[-1]
    filePath = os.path.join(image_dir, fileName)
    # Reuse a previously downloaded copy instead of hitting the network again.
    if os.path.exists(filePath):
        return filePath

    try:
        response = requests.get(image_url, timeout=timeout)
        if response.status_code != 200:
            raise ValueError("Not a 200 response")
        with open(filePath, 'wb') as f:
            f.write(response.content)
        return filePath
    except Exception as e:
        logging.exception("Failed to download image at " + image_url + " \n" + str(e) + "\nignoring....")
        # Bare raise keeps the original traceback for the caller.
        raise
def get_xml_for_bbx(bbx_label, bbx_data, width, height):
    """Build the Pascal VOC ``<object>`` XML fragment for one bounding box.

    Args:
        bbx_label: Label text written into the ``<name>`` element.
        bbx_data: Annotation dict whose 'points' entry holds the box
            coordinates. Either four ``[x, y]`` corner points, or two
            ``{'x': ..., 'y': ...}`` points (left-top first, right-bottom
            second). Each coordinate is multiplied by the image width or
            height to obtain pixels.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        The ``<object>`` element as a string, with integer pixel coordinates.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    points = bbx_data['points']
    if len(points) == 4:
        # Regular BBX has 4 points of the rectangle.
        xs = [point[0] for point in points]
        ys = [point[1] for point in points]
        # Truncate to int so both branches emit whole pixels.
        xmin = int(width * min(xs))
        ymin = int(height * min(ys))
        xmax = int(width * max(xs))
        ymax = int(height * max(ys))
    else:
        # OCR BBX format has 'x','y' in one point.
        # The left top and right bottom are stored as point '0' and point '1'.
        xmin = int(points[0]['x'] * width)
        ymin = int(points[0]['y'] * height)
        xmax = int(points[1]['x'] * width)
        ymax = int(points[1]['y'] * height)

    parts = [
        "<object>\n",
        # Escape the label so characters such as '&' or '<' keep the XML valid.
        "\t<name>" + escape(str(bbx_label)) + "</name>\n",
        "\t<pose>Unspecified</pose>\n",
        "\t<truncated>Unspecified</truncated>\n",
        "\t<difficult>Unspecified</difficult>\n",
        "\t<occluded>Unspecified</occluded>\n",
        "\t<bndbox>\n",
        "\t\t<xmin>" + str(xmin) + "</xmin>\n",
        "\t\t<xmax>" + str(xmax) + "</xmax>\n",
        "\t\t<ymin>" + str(ymin) + "</ymin>\n",
        "\t\t<ymax>" + str(ymax) + "</ymax>\n",
        "\t</bndbox>\n",
        "</object>\n",
    ]
    return "".join(parts)
def convert_to_PascalVOC(dataturks_labeled_item, image_dir, xml_out_dir):
    """Convert one dataturks labeled item to a Pascal VOC XML file.

    The image is downloaded into ``image_dir`` if it is not already there, and
    the XML is written to ``<xml_out_dir>/<image file name>.xml``. The size
    recorded in the XML is read from the image file itself.

    Args:
        dataturks_labeled_item: JSON string of one labeled image from dataturks.
        image_dir: Path to directory to downloaded images (or a directory
            already having the images downloaded).
        xml_out_dir: Path to the dir where the xml needs to be written.

    Returns:
        True if the XML file was written. False if the item has no annotation
        (a skipped item) or if any error occurred while processing it.

    Raises:
        None. Every exception is logged and reported as a False return.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    try:
        data = json.loads(dataturks_labeled_item)
        annotations = data.get('annotation')
        # Covers both an empty list and a missing/null annotation.
        if not annotations:
            logging.info("Ignoring Skipped Item")
            return False

        image_url = data['content']
        filePath = maybe_download(image_url, image_dir)

        with Image.open(filePath) as img:
            width, height = img.size

        # os.path handles the platform separator; splitting on "/" left the
        # directory in the name on Windows and broke the XML output path.
        fileName = os.path.basename(filePath)
        # normpath drops a trailing separator, which would give an empty name.
        image_dir_folder_Name = os.path.basename(os.path.normpath(image_dir))

        parts = [
            "<annotation>\n<folder>" + escape(image_dir_folder_Name) + "</folder>\n",
            "<filename>" + escape(fileName) + "</filename>\n",
            "<path>" + escape(filePath) + "</path>\n",
            "<source>\n\t<database>Unknown</database>\n</source>\n",
            "<size>\n",
            "\t<width>" + str(width) + "</width>\n",
            "\t<height>" + str(height) + "</height>\n",
            "\t<depth>Unspecified</depth>\n",
            "</size>\n",
            "<segmented>Unspecified</segmented>\n",
        ]

        for bbx in annotations:
            if not bbx:
                continue
            # Pascal VOC only supports rectangles.
            if "shape" in bbx and bbx["shape"] != "rectangle":
                continue

            bbx_labels = bbx['label']
            # Handle both list of labels or a single label.
            if not isinstance(bbx_labels, list):
                bbx_labels = [bbx_labels]

            # One <object> element per label attached to the box.
            for bbx_label in bbx_labels:
                parts.append(get_xml_for_bbx(bbx_label, bbx, width, height))

        parts.append("</annotation>")

        # Output to a file.
        xmlFilePath = os.path.join(xml_out_dir, fileName + ".xml")
        with open(xmlFilePath, 'w') as f:
            f.write("".join(parts))
        return True
    except Exception as e:
        # Per-item boundary: log and report failure so the batch continues.
        logging.exception("Unable to process item %s\nerror = %s", dataturks_labeled_item, e)
        return False
def main():
    """Convert every item of the dataturks JSON file to a Pascal VOC XML file.

    Reads the module-level names ``dataturks_JSON_FilePath``,
    ``image_download_dir`` and ``pascal_voc_xml_dir``, which the
    ``__main__`` section assigns from the command line before calling this
    function. Each non-blank line of the JSON file is handed to
    ``convert_to_PascalVOC``; progress and a final summary are logged.

    Returns:
        None. If a path is invalid or the file has no items, an error is
        logged and the function returns early.
    """
    # Make sure everything is setup.
    # logging.error, not logging.exception: there is no active exception here,
    # so logging.exception would append a bogus "NoneType: None" traceback.
    if not os.path.isdir(image_download_dir):
        logging.error("Please specify a valid directory path to download images, " + image_download_dir + " doesn't exist")
        return
    if not os.path.isdir(pascal_voc_xml_dir):
        logging.error("Please specify a valid directory path to write Pascal VOC xml files, " + pascal_voc_xml_dir + " doesn't exist")
        return
    if not os.path.exists(dataturks_JSON_FilePath):
        logging.error(
            "Please specify a valid path to dataturks JSON output file, " + dataturks_JSON_FilePath + " doesn't exist")
        return

    with open(dataturks_JSON_FilePath, 'r') as f:
        # Skip blank lines so they are not reported as failed items.
        lines = [line for line in f if line.strip()]

    if not lines:
        logging.error(
            "Please specify a valid path to dataturks JSON output file, " + dataturks_JSON_FilePath + " is empty")
        return

    success = 0
    for count, line in enumerate(lines, start=1):
        if convert_to_PascalVOC(line, image_download_dir, pascal_voc_xml_dir):
            success += 1
        if count % 10 == 0:
            logging.info(str(count) + " items done ...")

    logging.info("Completed: " + str(success) + " items done, " + str(len(lines) - success) + " items ignored due to errors or for being skipped items.")
def create_arg_parser():
    """Create and return the ArgumentParser object for this script.

    The parser takes three required positional arguments, in this order:
    ``dataturks_JSON_FilePath``, ``image_download_dir`` and
    ``pascal_voc_xml_dir``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description='Converts Dataturks output JSON file for Image bounding box to Pascal VOC format.')
    parser.add_argument(
        'dataturks_JSON_FilePath',
        help='Path to the JSON file downloaded from Dataturks.')
    parser.add_argument(
        'image_download_dir',
        help='Path to the directory where images will be downloaded (if not already found in the directory).')
    parser.add_argument(
        'pascal_voc_xml_dir',
        help='Path to the directory where Pascal VOC XML files will be stored.')
    return parser
if __name__ == '__main__':
    # Parse the three positional paths given on the command line.
    parsed_args = create_arg_parser().parse_args(sys.argv[1:])

    # main() reads these module-level names, so assign them before calling it.
    dataturks_JSON_FilePath = parsed_args.dataturks_JSON_FilePath
    image_download_dir = parsed_args.image_download_dir
    pascal_voc_xml_dir = parsed_args.pascal_voc_xml_dir

    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I am getting an error at line 141, in the function convert_to_PascalVOC. The error is raised by the line `with open(xmlFilePath, 'w')` when I run the script from the command prompt. Please help.