pgolding · May 2, 2018 20:39
diff --git a/get-check-metadata-Copy1.ipynb b/get-check-metadata-Copy1.ipynb
diff --git a/get-check-metadata.ipynb b/get-check-metadata.ipynb
 {
 "cells": [
  {
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "cell_type": "markdown",
   "source": "# Check Image Metadata (e.g. from EXIF)\n\nThis script pulls the EXIF data from the JPG images in a folder and writes it to a CSV file. For now, we skip PDF/PNG images, although they also contain metadata.\n\nRequires:\n\n>`$ pip install exifread`\n\nSet `mount` to point to the mount point.\n\nSet `path_to_checks` to point at the sub-folder with the checks.\n\nThe final path is the join of these two, i.e. `mount/path_to_checks`.\n\nBy default we keep only the most useful EXIF fields (`cols`). To grab all EXIF data, set `limit_to_cols=False` when calling the `stream_exif` routine."
  },
  {
   "metadata": {
    "trusted": true,
    "deletable": true,
    "collapsed": false,
    "editable": true
   },
   "cell_type": "code",
   "source": "import os\nimport json\nimport sys\nimport exifread\nimport pandas\n\n# Control the checkpoint size - number of images to process between CSV saves\ncheck_point = 100\n\n# EXIF tags that we want to extract and use as cols in our csv output\ncols = [\"EXIF ISOSpeedRatings\",\"EXIF Flash\",\"Image YResolution\",\"EXIF FNumber\",\"Image Orientation\",\"Image Model\",\n        \"Image XResolution\",\"EXIF BrightnessValue\",\"EXIF ExifImageWidth\",\"EXIF ExifImageLength\",\n        \"EXIF LensModel\", \"EXIF SubjectArea\", \"Image Make\"]\n\n# Placeholder frame; save() builds its own frame from the streamed records\ndf = pandas.DataFrame()\n\ndef save(data,csv_file):\n    \"\"\"Write the EXIF records collected so far to csv_file, indexed by file name.\n\n    data     -- non-empty list of dicts, each with a 'file' key plus EXIF tag/value pairs\n    csv_file -- output path; the file is rewritten in full on every call\n    \"\"\"\n    frame = pandas.DataFrame(data).set_index('file')\n    frame.to_csv(csv_file)\n\n# Set limit_to_cols to False if you want to collect ALL the EXIF fields (will slow things down)\ndef stream_exif(path,limit_to_cols=True,csv_file='Exif_data_checks.csv'):\n    \"\"\"Read the EXIF tags of every .jpg/.jpeg image in path and save them to csv_file.\n\n    path          -- folder to scan (sub-folders are not searched)\n    limit_to_cols -- keep only the tags listed in cols when True, every tag when False\n    csv_file      -- CSV output path, rewritten after every check_point images and at the end\n\n    Errors are reported with print() rather than raised.\n    \"\"\"\n    print(\"Looking for jpg images in {}\".format(path))\n    exif_stream = []\n    try:\n        extensions = (\".jpg\", \".jpeg\")\n        # Match on the real extension, case-insensitively, so IMG.JPG and scan.jpeg are included\n        filelist = sorted(f for f in os.listdir(path) if os.path.splitext(f)[1].lower() in extensions)\n        if not filelist:\n            # Report and stop; a bare raise here has no active exception to re-raise\n            print(\"Unable to find any jpg images - check folder or path is correct\")\n            return\n        print(\"Found {} jpg images\".format(len(filelist)))\n        file_count = 0\n        for file in filelist:\n            # filelist is already filtered, so every entry is read (no second, narrower .jpg test)\n            with open(os.path.join(path,file), 'rb') as f:\n                tags = exifread.process_file(f, details=False)\n            if limit_to_cols:\n                vals = {tag: field for (tag, field) in tags.items() if tag in cols}\n            else:\n                vals = dict(tags)\n            vals[\"file\"] = file\n            exif_stream.append(vals)\n            file_count += 1\n            if file_count % check_point == 0:\n                # Checkpoint: persist everything gathered so far\n                save(exif_stream,csv_file)\n        # Final write picks up the images processed since the last checkpoint\n        save(exif_stream,csv_file)\n        print(\"Checks processed: {}\".format(file_count))\n    except OSError as err:\n        print(\"OS error: {0}\".format(err))\n    except KeyError as err:\n        print(\"KeyError: {} - check that the folder is correct\".format(err))\n    except Exception as err:\n        print(err)\n\n# change this to the folder mount point\nmount = os.getcwd()\n# change this to the checks folder\npath_to_checks = 'sample_checks_prosper'\nfolder = os.path.join(mount,path_to_checks)\n# iterate over the files and stream the exif data\nstream_exif(folder,csv_file='all_checks.csv')\n",
   "execution_count": null,
   "outputs": []
  },
  {
   "metadata": {
    "trusted": true,
    "deletable": true,
    "collapsed": true,
    "editable": true
   },
   "cell_type": "code",
   "source": "",
   "execution_count": null,
   "outputs": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.4",
   "file_extension": ".py",
   "mimetype": "text/x-python"
  },
  "gist_id": "4d05e400d0c62f22e6f8de3488aa1a2f",
  "kernelspec": {
   "name": "conda-env-ocr-py",
   "display_name": "Python [conda env:ocr]",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"cell_type": "markdown",
	"source": "# Check Image Metadata (e.g. from EXIF)\n\nThis script pulls the EXIF data from the JPG images in a folder and writes it to a CSV file. For now, we skip PDF/PNG images, although they also contain metadata.\n\nRequires:\n\n>`$ pip install exifread`\n\nSet `mount` to point to the mount point.\n\nSet `path_to_checks` to point at the sub-folder with the checks.\n\nThe final path is the join of these two, i.e. `mount/path_to_checks`.\n\nBy default we keep only the most useful EXIF fields (`cols`). To grab all EXIF data, set `limit_to_cols=False` when calling the `stream_exif` routine."
	},
	{
	"metadata": {
	"trusted": true,
	"deletable": true,
	"collapsed": false,
	"editable": true
	},
	"cell_type": "code",
	"source": "import os\nimport json\nimport sys\nimport exifread\nimport pandas\n\n# Control the checkpoint size - number of images to process between CSV saves\ncheck_point = 100\n\n# EXIF tags that we want to extract and use as cols in our csv output\ncols = [\"EXIF ISOSpeedRatings\",\"EXIF Flash\",\"Image YResolution\",\"EXIF FNumber\",\"Image Orientation\",\"Image Model\",\n        \"Image XResolution\",\"EXIF BrightnessValue\",\"EXIF ExifImageWidth\",\"EXIF ExifImageLength\",\n        \"EXIF LensModel\", \"EXIF SubjectArea\", \"Image Make\"]\n\n# Placeholder frame; save() builds its own frame from the streamed records\ndf = pandas.DataFrame()\n\ndef save(data,csv_file):\n    \"\"\"Write the EXIF records collected so far to csv_file, indexed by file name.\n\n    data     -- non-empty list of dicts, each with a 'file' key plus EXIF tag/value pairs\n    csv_file -- output path; the file is rewritten in full on every call\n    \"\"\"\n    frame = pandas.DataFrame(data).set_index('file')\n    frame.to_csv(csv_file)\n\n# Set limit_to_cols to False if you want to collect ALL the EXIF fields (will slow things down)\ndef stream_exif(path,limit_to_cols=True,csv_file='Exif_data_checks.csv'):\n    \"\"\"Read the EXIF tags of every .jpg/.jpeg image in path and save them to csv_file.\n\n    path          -- folder to scan (sub-folders are not searched)\n    limit_to_cols -- keep only the tags listed in cols when True, every tag when False\n    csv_file      -- CSV output path, rewritten after every check_point images and at the end\n\n    Errors are reported with print() rather than raised.\n    \"\"\"\n    print(\"Looking for jpg images in {}\".format(path))\n    exif_stream = []\n    try:\n        extensions = (\".jpg\", \".jpeg\")\n        # Match on the real extension, case-insensitively, so IMG.JPG and scan.jpeg are included\n        filelist = sorted(f for f in os.listdir(path) if os.path.splitext(f)[1].lower() in extensions)\n        if not filelist:\n            # Report and stop; a bare raise here has no active exception to re-raise\n            print(\"Unable to find any jpg images - check folder or path is correct\")\n            return\n        print(\"Found {} jpg images\".format(len(filelist)))\n        file_count = 0\n        for file in filelist:\n            # filelist is already filtered, so every entry is read (no second, narrower .jpg test)\n            with open(os.path.join(path,file), 'rb') as f:\n                tags = exifread.process_file(f, details=False)\n            if limit_to_cols:\n                vals = {tag: field for (tag, field) in tags.items() if tag in cols}\n            else:\n                vals = dict(tags)\n            vals[\"file\"] = file\n            exif_stream.append(vals)\n            file_count += 1\n            if file_count % check_point == 0:\n                # Checkpoint: persist everything gathered so far\n                save(exif_stream,csv_file)\n        # Final write picks up the images processed since the last checkpoint\n        save(exif_stream,csv_file)\n        print(\"Checks processed: {}\".format(file_count))\n    except OSError as err:\n        print(\"OS error: {0}\".format(err))\n    except KeyError as err:\n        print(\"KeyError: {} - check that the folder is correct\".format(err))\n    except Exception as err:\n        print(err)\n\n# change this to the folder mount point\nmount = os.getcwd()\n# change this to the checks folder\npath_to_checks = 'sample_checks_prosper'\nfolder = os.path.join(mount,path_to_checks)\n# iterate over the files and stream the exif data\nstream_exif(folder,csv_file='all_checks.csv')\n",
	"execution_count": null,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true,
	"deletable": true,
	"collapsed": true,
	"editable": true
	},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"language_info": {
	"name": "python",
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.4",
	"file_extension": ".py",
	"mimetype": "text/x-python"
	},
	"gist_id": "4d05e400d0c62f22e6f8de3488aa1a2f",
	"kernelspec": {
	"name": "conda-env-ocr-py",
	"display_name": "Python [conda env:ocr]",
	"language": "python"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}