Skip to content

Instantly share code, notes, and snippets.

@Theldus
Last active August 14, 2021 05:18
Show Gist options
  • Save Theldus/c9c59a97aa50fe9d41433906316e6404 to your computer and use it in GitHub Desktop.
Save Theldus/c9c59a97aa50fe9d41433906316e6404 to your computer and use it in GitHub Desktop.
Simple perceptual hash (dHash) implementation in C (with stb_image)
#!/usr/bin/env sh
#
# Usage:
# ./index_files.sh files*
# ./index_files.sh < files.txt
# cat files.txt | ./index_files.sh..... (you get it)
#
percep "$@" | tee index_files.txt
cut -d' ' -f1 < index_files.txt > hash_files.txt
/*
* MIT License
*
* Copyright (c) 2021 Davidson Francis <[email protected]>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <getopt.h>
#ifdef __TINYC__
#define STBI_NO_SIMD
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb/stb_image.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb/stb_image_resize.h"
/*
* Simple perceptual hash (dHash) based on:
* http://www.hackerfactor.com/blog/?/archives/529-Kind-of-Like-That.html
*
* Build instructions:
* gcc percep.c -o percep -lm -Wall -Wextra -std=c99 -pedantic -O3
*
* Depedencies: stb_image.h and stb_image_resize.h
* (optionally stb_image_write.h):
*
* https://raw.githubusercontent.com/nothings/stb/master/stb_image.h
* https://raw.githubusercontent.com/nothings/stb/master/stb_image_resize.h
* https://raw.githubusercontent.com/nothings/stb/master/stb_image_write.h
*/
/* Enable if want to generate the 8x8 grayscale & the BxW version. */
#define DUMP_INTERMEDIATE_IMAGES 0
#if DUMP_INTERMEDIATE_IMAGES == 1
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb/stb_image_write.h"
#endif
/* Flags. */
#define OPT_COMPARE 1
#define OPT_ONLY_SAME 2
/* Pixel utilities. */
#define WIDTH 9
#define HEIGHT 8
#define CHANNELS 1
static int MAX_DISTANCE = 8; /* or 8/64, 1/8 pixels. */
#define M_intensity(i,j) \
( ((((j)*HEIGHT)+(i))*CHANNELS)+0 )
/**
* @brief Exhibits a formatted message on stderr and aborts
* the program.
*
* @param fmt Formatted message, just like printf.
*/
static inline void panic(const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
va_end(args);
exit(EXIT_FAILURE);
}
/**
* @brief Calculates the hamming distance between
* two 64-bit values.
*
* @param n1 First number.
* @param n2 Second number.
*
* @return Return the number of bits set.
*/
static int hamming(uint64_t n1, uint64_t n2)
{
int setbit;
uint64_t x;
x = n1 ^ n2;
setbit = 0;
while (x > 0)
{
setbit += (x & 1);
x >>= 1;
}
return (setbit);
}
/**
* Simple perceptual hash algorithm:
* - Resize to 8x8
* - Grayscale
* - Convert to BxW through average intensity and them create the hash.
*
* @param file Input image file to be read.
* @param err Error indicator, if any.
*
* @return Returns a 64-bit hash that corresponds to the input file.
*/
static uint64_t perceptual_hash(const char *file, int *err)
{
uint8_t out[WIDTH * HEIGHT * 1]; /* Our buffer is small enough to fit on
* stack... right? =). */
uint64_t hash;
uint8_t *img;
int ret;
int w,h,c;
*err = 1;
hash = 0;
/* Load image with 1-channel already. */
img = stbi_load(file, &w, &h, &c, 1);
if (!img)
goto out;
/* Resize to 8x8. */
ret = stbir_resize_uint8(img, w, h, 0,
out, WIDTH, HEIGHT, 0, 1);
if (!ret)
goto out;
#if DUMP_INTERMEDIATE_IMAGES == 1
stbi_write_jpg("img88gray.jpg", WIDTH, HEIGHT, 1, out, 100);
#endif
/* Compute hash. */
hash = 0;
for (int i = 0; i < HEIGHT; i++)
{
for (int j = 0; j < WIDTH - 1; j++)
{
int intensityL = out[M_intensity(i,j)];
int intensityR = out[M_intensity(i,j+1)];
int set = 0;
if (intensityL < intensityR)
set = 1;
hash = (hash << 1) | set;
}
}
*err = 0;
out:
stbi_image_free(img);
return (hash);
}
/**
* @brief For a given input, parse as an hex hash or invoke the
* perceptual hash algorithm.
*
* @param input Input hash or file.
* @param err Err output.
*/
static uint64_t get_hash(const char *input, int *err)
{
unsigned long long hash;
char *end;
int len;
*err = 0;
len = (int)strlen(input);
/* Check if file. */
if (len != 16)
goto maybe_file;
/* Maybe hash? */
errno = 0;
hash = strtoull(input, &end, 16);
/* input was not a number */
if (hash == 0 && end == input)
goto maybe_file;
/* the input value does not fit in unsigned long long */
else if (hash == ULLONG_MAX && errno)
goto maybe_file;
/* input began with a number but has junk left over at the end. */
else if (*end)
goto maybe_file;
return (hash);
maybe_file:
return (perceptual_hash(input, err));
}
/**
* Safe string-to-int routine that takes into account:
* - Overflow and Underflow
* - No undefined behaviour
*
* Taken from https://stackoverflow.com/a/12923949/3594716
* and slightly adapted: no error classification, because
* I dont need to know, error is error.
*
* @param out Pointer to integer.
* @param s String to be converted.
*
* @return Returns 0 if success and a negative number otherwise.
*/
static int str2int(int *out, char *s)
{
char *end;
if (s[0] == '\0' || isspace(s[0]))
return (-1);
errno = 0;
long l = strtol(s, &end, 10);
/* Both checks are needed because INT_MAX == LONG_MAX is possible. */
if (l > INT_MAX || (errno == ERANGE && l == LONG_MAX))
return (-1);
if (l < INT_MIN || (errno == ERANGE && l == LONG_MIN))
return (-1);
if (*end != '\0')
return (-1);
*out = l;
return (0);
}
/**
* @brief Usage options.
*
* @param prgnam Program name.
*/
static void usage(const char *prgnam)
{
fprintf(stderr,
"Usage: %s <image_or_hash> [image2_or_hash, imageN_or_hash...] [options]\n\n",
prgnam);
fprintf(stderr, "Options:\n");
fprintf(stderr, " -c Enable comparison mode (all files will be compared\n"
" against the first file)\n\n");
fprintf(stderr, " -s Same as -c, but only output files that are likely\n"
" to be the same (i.e, hamming_distance <= MAX_DISTANCE"
" (%d))\n\n", MAX_DISTANCE);
fprintf(stderr, " -d Max hamming distance ([0, 64]) (default: 8)\n\n");
fprintf(stderr, " -h This help\n\n");
fprintf(stderr, " Default mode: just output the hashes (like md5sum)\n"
" If no arguments, or when arg is '-', read from stdin.\n\n");
fprintf(stderr, "Note:\n");
fprintf(stderr,
"1) <image_or_hash> means that an input can be a regular file or a hash.\n"
" This serves as a way to facilitate comparisons where you already have\n"
" the hashes but not the file, or just want to speed up (significantly)\n"
" the process!\n\n"
" Also note that the identification is done automatically:\n"
" If the input has 16 digits long _and_ can be parsed as an hex number:\n"
" hash, otherwise: file.\n\n");
fprintf(stderr,
"2) If -s or -c is enabled, if all inputs were equal to the reference,\n"
" %s returns 0, otherwise, 1.\n", prgnam);
fprintf(stderr, "\nFiles supported: JPG, PNG, BMP, GIF, PPM\n");
exit(2);
}
/**
* @brief Parse argument list.
*
* @param argc Argument count.
* @param argv Argument list.
*
* @return Return the flags parsed or abort the program.
*/
static int parse_args(int argc, char **argv)
{
int flags; /* Parameters. */
int opt; /* Current arg. */
/* Arguments. */
flags = 0;
while ((opt = getopt(argc, argv, "hcsd:")) != -1)
{
switch (opt)
{
case 'h':
usage(argv[0]);
break;
case 'c':
if (flags & OPT_ONLY_SAME)
{
fprintf(stderr, "Options -c and -s are mutually exclusive!\n");
usage(argv[0]);
}
flags |= OPT_COMPARE;
break;
case 's':
if (flags & OPT_COMPARE)
{
fprintf(stderr, "Options -s and -c are mutually exclusive!\n");
usage(argv[0]);
}
flags |= OPT_ONLY_SAME;
break;
case 'd':
if (str2int(&MAX_DISTANCE, optarg) < 0
|| MAX_DISTANCE < 0 || MAX_DISTANCE > 64)
{
fprintf(stderr, "Invalid -d (%s), should be between 0"
" and 64 (inclusive)\n", optarg);
usage(argv[0]);
}
break;
default:
usage(argv[0]);
break;
}
}
return (flags);
}
/**
* @brief Read a next file, either from argv or stdin, and write to the
* buff. The buffer size is also updated to reflect the new buffer size.
*
* @param argc Argument count.
* @param argv Argument list.
* @param buff Buffer pointer (should be NULL at first invocation).
* @param size Buffer size (should be 0 in the firts invocation).
*
* @return Returns a pointer to the string buffer or NULL if no
* input is available.
*/
static char *next_file_or_hash(int argc, char **argv, char **buff,
size_t *size)
{
ssize_t ret; /* Getline return. */
size_t len; /* Input length. */
char *tmp; /* Temp value. */
((void)argc);
#define ST_ARGV 0
#define ST_STDIN 1
#define ST_INIT 2
static int state = ST_INIT;
if (state == ST_INIT)
{
if (argv[optind] != NULL && strcmp(argv[optind], "-"))
state = ST_ARGV;
else
{
state = ST_STDIN;
/* If our fist arg is '-', consume it. */
if (argv[optind] != NULL && !strcmp(argv[optind], "-"))
optind++;
}
}
read_argv:
if (state == ST_STDIN)
goto read_stdin;
/* If via arguments. */
if (!argv[optind])
return (NULL);
if (!strcmp(argv[optind], "-"))
{
state = ST_STDIN;
optind++;
goto read_stdin;
}
len = strlen(argv[optind]);
tmp = *buff;
if (*size < len + 1)
{
tmp = realloc(*buff, len + 1);
if (!tmp)
panic("Unable to allocate memory for next file\n");
*buff = tmp;
*size = len + 1;
}
strcpy(tmp, argv[optind]);
tmp[len] = '\0';
optind++;
return (*buff);
/* If stdin. */
read_stdin:
ret = getline(buff, size, stdin);
if (ret == -1)
{
state = ST_ARGV;
goto read_argv;
}
buff[0][ret - 1] = '\0';
return (*buff);
}
/* Main routine. */
int main(int argc, char **argv)
{
uint64_t hash_ref; /* Reference hash. */
uint64_t hash; /* Current hash. */
int num_equal; /* Number of files equal to the ref. */
size_t size; /* Current buffer size. */
char *buff; /* Current buffer. */
int flags; /* Parameters. */
int count; /* Input count (excluding ref file). */
int err; /* Err condition. */
int ret; /* Return value. */
int d; /* Current hamming distance. */
ret = 1;
buff = NULL;
size = 0;
count = 0;
num_equal = 0;
flags = parse_args(argc, argv);
/* Dump all hashes. */
buff = next_file_or_hash(argc, argv, &buff, &size);
hash_ref = get_hash(buff, &err);
if (err)
panic("Unable to handle image %s!\n", buff);
if (flags & (OPT_COMPARE|OPT_ONLY_SAME))
printf("%016" PRIx64 " %s (reference)\n", hash_ref, buff);
else
printf("%016" PRIx64 " %s\n", hash_ref, buff);
/* Loop through arguments and/or stdin. */
while (next_file_or_hash(argc, argv, &buff, &size) != NULL)
{
hash = get_hash(buff, &err);
if (err)
panic("Unable to handle image %s!\n", buff);
/* If comparison mode. */
if (flags & (OPT_COMPARE|OPT_ONLY_SAME))
{
d = hamming(hash, hash_ref);
if ((flags & OPT_COMPARE)
|| ((flags & OPT_ONLY_SAME) && d <= MAX_DISTANCE))
{
printf("%016" PRIx64 " %s, hamming distance: %d",
hash, buff, d);
}
if (d <= MAX_DISTANCE)
{
num_equal++;
printf(" (similar!, %d <= %d)\n", d, MAX_DISTANCE);
}
else if (flags & OPT_COMPARE)
puts("");
}
/* Just-hash mode. */
else
{
printf("%016" PRIx64 " %s\n", hash, buff);
}
count++;
}
if (flags & (OPT_COMPARE|OPT_ONLY_SAME))
printf("Veredict: %d/%d are equal to the reference image!\n",
num_equal, count);
/* Return 0 if all files are equal to the reference file. */
if (num_equal == count)
ret = 0;
free(buff);
return (ret);
}
#!/usr/bin/env sh
#
# Usage:
# ./search.sh image1
#
# Do search, get only hash and remove duplicates
percep -s "$@" - < hash_files.txt | cut -d' ' -f1 | sort -u > .tmp_search
# Iterate over .tmp_search looking for matches in index_files.txt
while IFS="" read -r p || [ -n "$p" ]
do
grep "$p" index_files.txt
done < .tmp_search
rm .tmp_search
@Theldus
Copy link
Author

Theldus commented Aug 14, 2021

Description & Usage

percep (for lack of a better name), calculates the dHash of a list of images, either via arguments or via stdin.

dHash belongs to a class of algorithms called 'perceptual hash', that is, algorithms that create a hash for a given content (be it image, sound, etc.) but that represent what is understandable to human beings: two images don't need to have all bits identical for one person to say they are equal. Perceptual hashing therefore allows small variations to continue to be accepted, even between different hashes, which makes them different from algorithms such as md5, sha256 and etc, which are quite unfeasible for comparing media files.

In addition to calculating the hash (md5sum style), percep also allows you to calculate the hamming distance of the first argument (reference to the others). The -c and -s modes allow to enable the calculation of hamming distance (-c) and filter only those below a threshold (-s).

The shorter the hamming distance, the closer the images are: a value of 0 indicates identical images, with identical hash, and '64' completely different images. The default threshold is 8, but it can be adjusted by the -d parameter.

Parameters

Usage: ./percep <image_or_hash> [image2_or_hash, imageN_or_hash...] [options]

Options:
  -c  Enable comparison mode (all files will be compared
      against the first file)

  -s  Same as -c, but only output files that are likely
      to be the same (i.e, hamming_distance <= MAX_DISTANCE (8))

  -d  Max hamming distance ([0, 64]) (default: 8)

  -h  This help

  Default mode: just output the hashes (like md5sum)
  If no arguments, or when arg is '-', read from stdin.

Note:
1) <image_or_hash> means that an input can be a regular file or a hash.
   This serves as a way to facilitate comparisons where you already have
   the hashes but not the file, or just want to speed up (significantly)
   the process!

   Also note that the identification is done automatically:
   If the input has 16 digits long _and_ can be parsed as an hex number:
   hash, otherwise: file.

2) If -s or -c is enabled, if all inputs were equal to the reference,
   ./percep returns 0, otherwise, 1.

Files supported: JPG, PNG, BMP, GIF, PPM

Examples of use:

$ ./percep img01.jpg 
580ec6e3b1d0e062  img01.jpg

$ ./percep img01.jpg img02.jpg 
580ec6e3b1d0e062  img01.jpg
588cc6e2b5d0e062  img02.jpg

$ ./percep img01.jpg img02.jpg img04.jpg -c
580ec6e3b1d0e062  img01.jpg (reference)
588cc6e2b5d0e062  img02.jpg, hamming distance: 4 (similar!, 4 <= 8)
d86cb48aa483e161  img04.jpg, hamming distance: 22
Veredict: 1/2 are equal to the reference image!

$ ./percep img01.jpg img02.jpg img04.jpg -s
580ec6e3b1d0e062  img01.jpg (reference)
588cc6e2b5d0e062  img02.jpg, hamming distance: 4 (similar!, 4 <= 8)
Veredict: 1/2 are equal to the reference image!

Given a list of files, percep can also read from stdin and save the hashes. Another interesting feature is the possibility for percep to read the hashes (instead of files) as a parameter (or stdin), which allows calculating distances from pre-calculated images to a reference one, something like:

$ ./percep img01.jpg 588cc6e2b5d0e062 d86cb48aa483e161 -c
580ec6e3b1d0e062  img01.jpg (reference)
588cc6e2b5d0e062  588cc6e2b5d0e062, hamming distance: 4 (similar!, 4 <= 8)
d86cb48aa483e161  d86cb48aa483e161, hamming distance: 22
Veredict: 1/2 are equal to the reference image!

search.sh and index_files.sh

Since percep has some configurations and ways to invoke it, it is quite simple to create a script that indexes images and looks for a reference among them.

The index_files.sh and search.sh scripts do this and can be used in a number of ways, such as:

# Index
find some_folder/  -name "*.jpg" > image_list.txt
./index_files.sh < image_list.txt 

# or ./index_files.sh image01.jpg image02.jpg ... imageN.jpg
# or ./index_files.sh images*.jpg

# Search
./search.sh image,jpg

To illustrate a real scenario, I was able to index 2.6G of my personal photos (3186 jpeg images) averaging 1.19MB each, with resolutions between 8-12MP in just 7 minutes!.

After indexing, searches are instantaneous:

$ wc -l hash_files.txt 
3186 hash_files.txt

$ time search.sh git.jpg 
575badac5abc971b  ./Photos_from_2017/P_20171114_001105.jpg

real	0m0.015s
user	0m0.013s
sys	0m0.004s

since percep needs to hash only 1 image instead of all.

Build instructions

mkdir stb/
wget https://gist.github.com/Theldus/c9c59a97aa50fe9d41433906316e6404/raw/b345a00c03a9f353ac39848a634c6e16450e7058/percep.c
wget https://raw.githubusercontent.com/nothings/stb/master/stb_image.h -O stb/stb_image.h
wget https://raw.githubusercontent.com/nothings/stb/master/stb_image_resize.h -O stb/stb_image_resize.h
gcc percep.c -o percep -lm -Wall -Wextra -std=c99 -pedantic -O3

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment