icook · April 6, 2016 14:21
diff --git a/hash_test.ipynb b/hash_test.ipynb

 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experiment in reverse video search\n",
    "\n",
    "Below I attempt to use minhash and opencv to build a usable minhash for finding duplicate or similar videos. The test modifies a video in several ways and measures how well each modified version still matches the original, and also measures how well the original matches against 6 different videos."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import cv2\n",
    "import sys\n",
    "import time\n",
    "from datasketch import MinHash\n",
    "from hashlib import sha1\n",
    "\n",
    "\n",
    "def minhash_of(filename):\n",
    "    \"\"\" Compute the minhash of a video file. This function computes a color\n",
    "    histogram for each frame of a video, then performs a simple bucketing\n",
    "    operation (making a slightly fuzzy frame hash). This value is fed into\n",
    "    minhash and then we compute the jaccard similarity of the videos. \"\"\"\n",
    "    cap = cv2.VideoCapture(filename)\n",
    "    m1 = MinHash()\n",
    "    total = int(cap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))  # Total frame count\n",
    "    \n",
    "    while 1:\n",
    "        ret, frame = cap.read()\n",
    "        if not ret:\n",
    "            break\n",
    "        hist = cv2.calcHist([frame],[0],None,[512],[0,512])\n",
    "        # Normalize. This is because the histogram's scale depends on the resolution of the image.\n",
    "        hist = np.true_divide(hist, hist.max())\n",
    "        # Bucket the number slightly to account for some variance.\n",
    "        # A larger divisor here will increase how many matches are made.\n",
    "        hist = np.sum(hist)\n",
    "        # add our frame to the minhash\n",
    "        m1.digest(sha1(str(round(hist))))\n",
    "        m1.digest(sha1(str(round(hist * 3))))\n",
    "        current = cap.get(cv2.cv.CV_CAP_PROP_POS_FRAMES)\n",
    "        sys.stdout.write(\"{} / {}\\r\".format(current, total))  # Print a progress bar\n",
    "\n",
    "    cap.release()\n",
    "    cv2.destroyAllWindows()\n",
    "    \n",
    "    return m1, total\n",
    "\n",
    "def compare(filename, mh, desc):\n",
    "    start = time.time()\n",
    "    other, total = minhash_of(filename)\n",
    "    print(\"{:.2f}\\t{}\\t({:.2f} fps)\".format(mh.jaccard(other), desc, total / (time.time() - start)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.89\tcompressed 144p\t(4776.13 fps)\n",
      "0.85\tvisible gamma change\t(993.65 fps)\n",
      "0.89\twatermark added\t(1001.44 fps)\n",
      "0.85\tsubclip 15% length\t(990.81 fps)\n",
      "0.27\tsubclip 0.5% length\t(698.35 fps)\n",
      "0.90\tfps 29 vs 23\t(994.44 fps)\n",
      "0.91\tfps 29 vs 10\t(906.13 fps)\n"
     ]
    }
   ],
   "source": [
    "vid1, _ = minhash_of('vid1_full.mp4') # 480p 29.97fps\n",
    "compare('vid1_compressed.mp4', vid1, \"compressed 144p\")\n",
    "compare('vid1_gamma_neg20.mp4', vid1, \"visible gamma change\")\n",
    "compare('vid1_watermark.mp4', vid1, \"watermark added\")\n",
    "compare('vid1_clip_30.mp4', vid1, \"subclip 15% length\")\n",
    "compare('vid1_clip_2.mp4', vid1, \"subclip 0.5% length\")\n",
    "compare('vid1_change_fps.mp4', vid1, \"fps 29 vs 23\") # from 29 to 23\n",
    "compare('vid1_change_fps_low.mp4', vid1, \"fps 29 vs 10\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.66\tdifferent video 2 compressed 144p\t(2611.61 fps)\n",
      "0.70\tdifferent video 2\t(1484.69 fps)\n",
      "0.05\tdifferent video 3\t(1464.13 fps)\n",
      "0.91\tdifferent video 4\t(1314.44 fps)\n",
      "0.89\tdifferent video 5\t(1602.97 fps)\n",
      "0.80\tdifferent video 6\t(1203.70 fps)\n"
     ]
    }
   ],
   "source": [
    "compare('vid2_compressed.mp4', vid1, \"different video 2 compressed 144p\")\n",
    "for i in range(2, 7):\n",
    "    compare('vid{}.mp4'.format(i), vid1, \"different video {}\".format(i))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As can be seen above, this simple technique did a reasonable job of finding duplicates after a range of changes. It handled subclips poorly, and had one or two false positives. By isolating more features and tuning the bucketing technique, better results could be obtained."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }

	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Experiment in reverse video search\n",
	"\n",
	"Below I attempt to use minhash and opencv to build a usable minhash for finding duplicate or similar videos. The test modifies a video in several ways and measures how well each modified version still matches the original, and also measures how well the original matches against 6 different videos."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 112,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import cv2\n",
	"import sys\n",
	"import time\n",
	"from datasketch import MinHash\n",
	"from hashlib import sha1\n",
	"\n",
	"\n",
	"def minhash_of(filename):\n",
	" \"\"\" Compute the minhash of a video file. This function computes a color\n",
	" histogram for each frame of a video, then performs a simple bucketing\n",
	" operation (making a slightly fuzzy frame hash). This value is fed into\n",
	" minhash and then we compute the jaccard similarity of the videos. \"\"\"\n",
	" cap = cv2.VideoCapture(filename)\n",
	" m1 = MinHash()\n",
	" total = int(cap.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT)) # Total frame count\n",
	" \n",
	" while 1:\n",
	" ret, frame = cap.read()\n",
	" if not ret:\n",
	" break\n",
	" hist = cv2.calcHist([frame],[0],None,[512],[0,512])\n",
	" # Normalize. This is because the histogram's scale depends on the resolution of the image.\n",
	" hist = np.true_divide(hist, hist.max())\n",
	" # Bucket the number slightly to account for some variance.\n",
	" # A larger divisor here will increase how many matches are made.\n",
	" hist = np.sum(hist)\n",
	" # add our frame to the minhash\n",
	" m1.digest(sha1(str(round(hist))))\n",
	" m1.digest(sha1(str(round(hist * 3))))\n",
	" current = cap.get(cv2.cv.CV_CAP_PROP_POS_FRAMES)\n",
	" sys.stdout.write(\"{} / {}\\r\".format(current, total)) # Print a progress bar\n",
	"\n",
	" cap.release()\n",
	" cv2.destroyAllWindows()\n",
	" \n",
	" return m1, total\n",
	"\n",
	"def compare(filename, mh, desc):\n",
	" start = time.time()\n",
	" other, total = minhash_of(filename)\n",
	" print(\"{:.2f}\\t{}\\t({:.2f} fps)\".format(mh.jaccard(other), desc, total / (time.time() - start)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 113,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.89\tcompressed 144p\t(4776.13 fps)\n",
	"0.85\tvisible gamma change\t(993.65 fps)\n",
	"0.89\twatermark added\t(1001.44 fps)\n",
	"0.85\tsubclip 15% length\t(990.81 fps)\n",
	"0.27\tsubclip 0.5% length\t(698.35 fps)\n",
	"0.90\tfps 29 vs 23\t(994.44 fps)\n",
	"0.91\tfps 29 vs 10\t(906.13 fps)\n"
	]
	}
	],
	"source": [
	"vid1, _ = minhash_of('vid1_full.mp4') # 480p 29.97fps\n",
	"compare('vid1_compressed.mp4', vid1, \"compressed 144p\")\n",
	"compare('vid1_gamma_neg20.mp4', vid1, \"visible gamma change\")\n",
	"compare('vid1_watermark.mp4', vid1, \"watermark added\")\n",
	"compare('vid1_clip_30.mp4', vid1, \"subclip 15% length\")\n",
	"compare('vid1_clip_2.mp4', vid1, \"subclip 0.5% length\")\n",
	"compare('vid1_change_fps.mp4', vid1, \"fps 29 vs 23\") # from 29 to 23\n",
	"compare('vid1_change_fps_low.mp4', vid1, \"fps 29 vs 10\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 114,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.66\tdifferent video 2 compressed 144p\t(2611.61 fps)\n",
	"0.70\tdifferent video 2\t(1484.69 fps)\n",
	"0.05\tdifferent video 3\t(1464.13 fps)\n",
	"0.91\tdifferent video 4\t(1314.44 fps)\n",
	"0.89\tdifferent video 5\t(1602.97 fps)\n",
	"0.80\tdifferent video 6\t(1203.70 fps)\n"
	]
	}
	],
	"source": [
	"compare('vid2_compressed.mp4', vid1, \"different video 2 compressed 144p\")\n",
	"for i in range(2, 7):\n",
	" compare('vid{}.mp4'.format(i), vid1, \"different video {}\".format(i))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"As can be seen above, this simple technique did a reasonable job of finding duplicates after a range of changes. It handled subclips poorly, and had one or two false positives. By isolating more features and tuning the bucketing technique, better results could be obtained."
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}