georgy7 · November 27, 2015 02:30
diff --git a/neuraltalk_data_from_exif_keywords.py b/neuraltalk_data_from_exif_keywords.py
 # pip install exifread

 import exifread
 import array
 import string
 import glob
 import json

 # Key used to look up the keywords field in the tag dict returned by exifread.
 KEYWORDS = 'Image XPKeywords'
 # Name of the JSON file that the collected captions are written to.
 OUTPUT = 'neuraltalk.json'

 def filter_str(str):
     """Return `str` reduced to printable characters and trimmed.

     Every character that is not in ``string.printable`` is dropped, and so
     are tab, line feed and carriage return. Whitespace remaining at either
     end of the result is stripped.
     """
     # NOTE: the parameter keeps its original name, which shadows the builtin
     # `str` inside this function only.
     dropped = '\t\n\r'
     kept = [ch for ch in str if ch in string.printable and ch not in dropped]
     return ''.join(kept).strip()

 def tags(filename):
     """Read the semicolon-separated keywords stored in an image's EXIF data.

     Args:
         filename: path of the image file to inspect.

     Returns:
         A list of cleaned, non-empty keyword strings. When the keywords
         cannot be read for any reason, a notice is printed and an empty
         list is returned.
     """
     try:
         # The context manager closes the file even if EXIF parsing raises.
         with open(filename, 'rb') as image:
             exif = exifread.process_file(image)
             # bytes(bytearray(...)) replaces array.tostring(), which no
             # longer exists on Python 3.9+ and made every file look
             # keyword-less there.
             raw = bytes(bytearray(exif[KEYWORDS].values))
         keywords = [filter_str(part) for part in raw.decode('utf-16').split(';')]
         # Skip entries that are empty after cleaning (e.g. a trailing ';').
         return [word for word in keywords if word]
     except Exception:
         # Deliberately best-effort: any failure is reported and the file
         # simply contributes no captions.
         print(filename, 'has no keywords.')
         return []

 def save(arr):
     """Write `arr` to the OUTPUT file as indented JSON and announce completion.

     Args:
         arr: JSON-serialisable data; dictionary keys are written sorted.
     """
     # `with` closes the file even when serialisation raises, so a failure
     # never leaves a dangling handle.
     with open(OUTPUT, 'w') as out:
         json.dump(arr, out, sort_keys=True, indent=2, separators=(',', ': '))
     print('Finished.')

 def main():
     """Collect keyword captions for the JPEG files in the current directory.

     Files matching the .jpg / .jpeg extensions (any letter case) are passed
     to `tags`; each file that yields at least one keyword becomes one entry,
     and the list of entries is handed to `save`.
     """
     paths = glob.glob('*.[jJ][pP][gG]')
     paths.extend(glob.glob('*.[jJ][pP][eE][gG]'))

     entries = []
     for path in paths:
         captions = tags(path)
         if not captions:
             # Nothing to train on for this image.
             continue
         entries.append({'file_path': path, 'captions': captions})
     save(entries)

 # Run the conversion only when this file is executed as a script.
 if __name__ == "__main__":
   main()
diff --git a/neuraltalk_train.sh b/neuraltalk_train.sh
 # Location of the neuraltalk2 checkout and names of the generated artifacts.
 NEUROTALK=../neuraltalk2
 JSON_OUTPUT=neuraltalk_output.json
 H5_OUTPUT=neuraltalk.h5
 CHECKPOINTS_OUTPUT=./checkpoints

 # Making the paths absolute, because the training step below runs from
 # inside $NEUROTALK. All expansions are quoted so that paths containing
 # spaces or glob characters are passed through intact.
 JSON_OUTPUT=$(readlink -f "$JSON_OUTPUT")
 H5_OUTPUT=$(readlink -f "$H5_OUTPUT")
 CHECKPOINTS_OUTPUT=$(readlink -f "$CHECKPOINTS_OUTPUT")
 START_FOLDER=$(readlink -f .)
 CNN_MODEL=$(readlink -f "$NEUROTALK/model/VGG_ILSVRC_16_layers.caffemodel")
 CNN_PROTO=$(readlink -f "$NEUROTALK/model/VGG_ILSVRC_16_layers_deploy.prototxt")

 # Regenerate both preprocessing outputs whenever either one is missing.
 if [ ! -f "$H5_OUTPUT" ] || [ ! -f "$JSON_OUTPUT" ]; then
   rm -f "$H5_OUTPUT"
   rm -f "$JSON_OUTPUT"
   python "$NEUROTALK/prepro.py" \
       --input_json neuraltalk.json \
       --num_val 5000 --num_test 5000 \
       --images_root . \
       --word_count_threshold 5 \
       --output_json "$JSON_OUTPUT" \
       --output_h5 "$H5_OUTPUT"
 fi

 mkdir -p "$CHECKPOINTS_OUTPUT"
 # train.lua is invoked by relative name, so it has to run from $NEUROTALK;
 # `&&` keeps it from being started in the wrong directory if the cd fails.
 cd "$NEUROTALK" && th train.lua -input_h5 "$H5_OUTPUT" -input_json "$JSON_OUTPUT" -checkpoint_path "$CHECKPOINTS_OUTPUT" \
     -gpuid -1 \
     -cnn_model "$CNN_MODEL" \
     -cnn_proto "$CNN_PROTO"
 cd "$START_FOLDER"
	# pip install exifread

	import exifread
	import array
	import string
	import glob
	import json

	# Key used to look up the keywords field in the tag dict returned by exifread.
	KEYWORDS = 'Image XPKeywords'
	# Name of the JSON file that the collected captions are written to.
	OUTPUT = 'neuraltalk.json'

	def filter_str(str):
	    """Return `str` reduced to printable characters and trimmed.

	    Every character that is not in ``string.printable`` is dropped, and so
	    are tab, line feed and carriage return. Whitespace remaining at either
	    end of the result is stripped.
	    """
	    # The function body had lost its indentation; it is restored here.
	    # NOTE: the parameter keeps its original name, which shadows the
	    # builtin `str` inside this function only.
	    dropped = '\t\n\r'
	    kept = [ch for ch in str if ch in string.printable and ch not in dropped]
	    return ''.join(kept).strip()

	def tags(filename):
	    """Read the semicolon-separated keywords stored in an image's EXIF data.

	    Args:
	        filename: path of the image file to inspect.

	    Returns:
	        A list of cleaned, non-empty keyword strings. When the keywords
	        cannot be read for any reason, a notice is printed and an empty
	        list is returned.
	    """
	    try:
	        # The context manager closes the file even if EXIF parsing raises.
	        with open(filename, 'rb') as image:
	            exif = exifread.process_file(image)
	            # bytes(bytearray(...)) replaces array.tostring(), which no
	            # longer exists on Python 3.9+ and made every file look
	            # keyword-less there.
	            raw = bytes(bytearray(exif[KEYWORDS].values))
	        keywords = [filter_str(part) for part in raw.decode('utf-16').split(';')]
	        # Skip entries that are empty after cleaning (e.g. a trailing ';').
	        return [word for word in keywords if word]
	    except Exception:
	        # Deliberately best-effort: any failure is reported and the file
	        # simply contributes no captions.
	        print(filename, 'has no keywords.')
	        return []

	def save(arr):
	    """Write `arr` to the OUTPUT file as indented JSON and announce completion.

	    Args:
	        arr: JSON-serialisable data; dictionary keys are written sorted.
	    """
	    # `with` closes the file even when serialisation raises, so a failure
	    # never leaves a dangling handle.
	    with open(OUTPUT, 'w') as out:
	        json.dump(arr, out, sort_keys=True, indent=2, separators=(',', ': '))
	    print('Finished.')

	def main():
	    """Collect keyword captions for the JPEG files in the current directory.

	    Files matching the .jpg / .jpeg extensions (any letter case) are passed
	    to `tags`; each file that yields at least one keyword becomes one entry,
	    and the list of entries is handed to `save`.
	    """
	    # The leading '*' is required: without it the patterns only match files
	    # whose entire name is ".jpg" / ".jpeg".
	    files = glob.glob('*.[jJ][pP][gG]') + glob.glob('*.[jJ][pP][eE][gG]')
	    result = []
	    for filename in files:
	        image_tags = tags(filename)
	        if image_tags:
	            result.append({'file_path': filename, 'captions': image_tags})
	    save(result)

	# Run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
	    main()
	# Location of the neuraltalk2 checkout and names of the generated artifacts.
	NEUROTALK=../neuraltalk2
	JSON_OUTPUT=neuraltalk_output.json
	H5_OUTPUT=neuraltalk.h5
	CHECKPOINTS_OUTPUT=./checkpoints

	# Making the paths absolute, because the training step below runs from
	# inside $NEUROTALK. All expansions are quoted so that paths containing
	# spaces or glob characters are passed through intact.
	JSON_OUTPUT=$(readlink -f "$JSON_OUTPUT")
	H5_OUTPUT=$(readlink -f "$H5_OUTPUT")
	CHECKPOINTS_OUTPUT=$(readlink -f "$CHECKPOINTS_OUTPUT")
	START_FOLDER=$(readlink -f .)
	CNN_MODEL=$(readlink -f "$NEUROTALK/model/VGG_ILSVRC_16_layers.caffemodel")
	CNN_PROTO=$(readlink -f "$NEUROTALK/model/VGG_ILSVRC_16_layers_deploy.prototxt")

	# Regenerate both preprocessing outputs whenever either one is missing.
	# The operator between the two tests is a plain `||`; the backslash-escaped
	# form would be handed to `[` as literal arguments.
	if [ ! -f "$H5_OUTPUT" ] || [ ! -f "$JSON_OUTPUT" ]; then
	    rm -f "$H5_OUTPUT"
	    rm -f "$JSON_OUTPUT"
	    python "$NEUROTALK/prepro.py" \
	        --input_json neuraltalk.json \
	        --num_val 5000 --num_test 5000 \
	        --images_root . \
	        --word_count_threshold 5 \
	        --output_json "$JSON_OUTPUT" \
	        --output_h5 "$H5_OUTPUT"
	fi

	mkdir -p "$CHECKPOINTS_OUTPUT"
	# train.lua is invoked by relative name, so it has to run from $NEUROTALK;
	# `&&` keeps it from being started in the wrong directory if the cd fails.
	cd "$NEUROTALK" && th train.lua -input_h5 "$H5_OUTPUT" -input_json "$JSON_OUTPUT" -checkpoint_path "$CHECKPOINTS_OUTPUT" \
	    -gpuid -1 \
	    -cnn_model "$CNN_MODEL" \
	    -cnn_proto "$CNN_PROTO"
	cd "$START_FOLDER"