Skip to content

Instantly share code, notes, and snippets.

@7etsuo
Created March 17, 2025 02:50
Show Gist options
  • Save 7etsuo/5fd58211c3dafdda8f3136f2d2eb6ded to your computer and use it in GitHub Desktop.
Save 7etsuo/5fd58211c3dafdda8f3136f2d2eb6ded to your computer and use it in GitHub Desktop.
/** x.com/7etsuo
*
* compile with gcc -o pre_process hotdog_preprocessor.c -lgd -lm
* usage: ./pre_process dataset/train/hotdog/ dataset/train/nothotdog/
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <time.h>
#include <gd.h>
#define TARGET_WIDTH 224
#define TARGET_HEIGHT 224
#define CHANNELS 3
// One preprocessed image together with its class label.
//
// The struct is deliberately left with natural alignment: it is only ever
// kept in memory (the label and pixel data are written to disk field by
// field), and packing it to 1-byte alignment would leave the `data` pointer
// misaligned inside arrays of samples.
typedef struct
{
  float *data; // heap-allocated pixel buffer owned by the sample
  int label;   // 1 for hotdog, 0 for not-hotdog
} ImageSample;
// Return nonzero when `path` ends with the suffix `ext` (case-sensitive).
// `ext` includes the leading dot, e.g. ".png".
static int
path_ends_with (const char *path, const char *ext)
{
  size_t path_len = strlen (path);
  size_t ext_len = strlen (ext);

  return path_len >= ext_len
         && strcmp (path + (path_len - ext_len), ext) == 0;
}

// Load the image at `input_path`, resample it to
// `target_width` x `target_height`, and store it in `output_buffer`.
//
// The output is row-major with interleaved channels: the pixel at (x, y)
// occupies indices (y * target_width + x) * CHANNELS + {0, 1, 2} for red,
// green and blue, each scaled from 0..255 to [0, 1].  The caller must supply
// room for target_width * target_height * CHANNELS floats.
//
// The decoder (JPEG, PNG, GIF or BMP) is chosen from the file extension.
//
// Returns 0 on success and -1 on any failure (bad arguments, unreadable file,
// unsupported extension, decode failure, or allocation failure); a message is
// printed to stderr in every failure case.
int
preprocess_image (const char *input_path, float *output_buffer,
                  int target_width, int target_height)
{
  FILE *in;
  gdImagePtr img = NULL, resized = NULL;

  if (!input_path || !output_buffer || target_width <= 0
      || target_height <= 0)
    {
      fprintf (stderr, "Invalid arguments to preprocess_image\n");
      return -1;
    }

  // Open the image file
  in = fopen (input_path, "rb");
  if (!in)
    {
      fprintf (stderr, "Could not open image file %s\n", input_path);
      return -1;
    }

  // Pick the decoder from the extension.  Only the end of the path is
  // examined, so a directory or base name that merely contains ".jpg"
  // somewhere in the middle cannot select the wrong decoder.
  if (path_ends_with (input_path, ".jpg")
      || path_ends_with (input_path, ".jpeg"))
    {
      img = gdImageCreateFromJpeg (in);
    }
  else if (path_ends_with (input_path, ".png"))
    {
      img = gdImageCreateFromPng (in);
    }
  else if (path_ends_with (input_path, ".gif"))
    {
      img = gdImageCreateFromGif (in);
    }
  else if (path_ends_with (input_path, ".bmp"))
    {
      img = gdImageCreateFromBmp (in);
    }
  else
    {
      fprintf (stderr, "Unsupported image format for %s\n", input_path);
      fclose (in);
      return -1;
    }
  // The decoded image no longer needs the stream.
  fclose (in);
  if (!img)
    {
      fprintf (stderr, "Failed to load image %s\n", input_path);
      return -1;
    }

  // Create a true-colour canvas with the target dimensions
  resized = gdImageCreateTrueColor (target_width, target_height);
  if (!resized)
    {
      fprintf (stderr, "Failed to create resized image buffer\n");
      gdImageDestroy (img);
      return -1;
    }

  // Resample the whole source image onto the whole canvas
  gdImageCopyResampled (resized, img, 0, 0, 0, 0, target_width, target_height,
                        gdImageSX (img), gdImageSY (img));

  // Normalize pixel values to [0,1] and store in output_buffer
  for (int y = 0; y < target_height; y++)
    {
      for (int x = 0; x < target_width; x++)
        {
          int color = gdImageGetPixel (resized, x, y);
          unsigned char r = gdImageRed (resized, color);
          unsigned char g = gdImageGreen (resized, color);
          unsigned char b = gdImageBlue (resized, color);
          // size_t arithmetic keeps the index from overflowing int for
          // large target sizes.
          size_t idx
            = ((size_t) y * (size_t) target_width + (size_t) x) * CHANNELS;

          output_buffer[idx] = r / 255.0f;
          output_buffer[idx + 1] = g / 255.0f;
          output_buffer[idx + 2] = b / 255.0f;
        }
    }

  // Clean up
  gdImageDestroy (img);
  gdImageDestroy (resized);
  return 0;
}
// Return nonzero when `name` ends in one of the image extensions that
// preprocess_image selects a decoder for (.jpg, .jpeg, .png, .gif, .bmp).
static int
is_image_filename (const char *name)
{
  static const char *const extensions[]
    = { ".jpg", ".jpeg", ".png", ".gif", ".bmp" };
  size_t name_len = strlen (name);

  for (size_t i = 0; i < sizeof extensions / sizeof extensions[0]; i++)
    {
      size_t ext_len = strlen (extensions[i]);

      if (name_len >= ext_len
          && strcmp (name + (name_len - ext_len), extensions[i]) == 0)
        return 1;
    }
  return 0;
}

// Process all images in a directory.
//
// Every entry of `dir_path` whose name ends in a recognised image extension
// is run through preprocess_image at TARGET_WIDTH x TARGET_HEIGHT and, when
// that succeeds, appended to a newly allocated array tagged with `label`.
// Images that fail to load, whose path does not fit the path buffer, or whose
// pixel buffer cannot be allocated are skipped with a message.
//
// On success returns 0, stores the array in `*samples` (never NULL, even for
// an empty directory) and the number of valid entries in `*count`.  The
// caller owns the array and each entry's `data` buffer and must free them.
//
// On failure returns -1, leaves `*samples` NULL and `*count` 0, and has
// already released anything it allocated.
int
process_directory (const char *dir_path, int label, ImageSample **samples,
                   int *count)
{
  DIR *dir;
  struct dirent *entry;
  char filepath[1024];
  ImageSample *items;
  size_t capacity = 16;
  int used = 0;
  const size_t pixel_floats
    = (size_t) TARGET_WIDTH * TARGET_HEIGHT * CHANNELS;

  if (!dir_path || !samples || !count)
    {
      fprintf (stderr, "Invalid arguments to process_directory\n");
      return -1;
    }
  *samples = NULL;
  *count = 0;

  // Open directory
  dir = opendir (dir_path);
  if (!dir)
    {
      fprintf (stderr, "Could not open directory %s\n", dir_path);
      return -1;
    }

  // Start with a small non-empty array and grow it on demand.  A single
  // pass avoids the count-then-fill pattern, which overruns the array if
  // files are added to the directory between the two passes.
  items = malloc (capacity * sizeof *items);
  if (!items)
    {
      fprintf (stderr, "Memory allocation failed\n");
      closedir (dir);
      return -1;
    }

  while ((entry = readdir (dir)) != NULL)
    {
      // "." and ".." carry no image extension, so this also skips them.
      if (!is_image_filename (entry->d_name))
        continue;

      // Construct full file path, refusing silently truncated names
      int written = snprintf (filepath, sizeof (filepath), "%s/%s", dir_path,
                              entry->d_name);
      if (written < 0 || (size_t) written >= sizeof (filepath))
        {
          fprintf (stderr, "Path too long, skipping %s/%s\n", dir_path,
                   entry->d_name);
          continue;
        }

      // Make room for one more sample before doing the expensive work
      if ((size_t) used == capacity)
        {
          size_t new_capacity = capacity * 2;
          ImageSample *grown = realloc (items, new_capacity * sizeof *grown);

          if (!grown)
            {
              fprintf (stderr, "Memory allocation failed\n");
              goto fail;
            }
          items = grown;
          capacity = new_capacity;
        }

      // Allocate memory for image data
      float *img_data = malloc (pixel_floats * sizeof *img_data);
      if (!img_data)
        {
          fprintf (stderr, "Memory allocation failed for image data\n");
          continue;
        }

      // Process the image; keep the buffer only when decoding succeeded
      if (preprocess_image (filepath, img_data, TARGET_WIDTH, TARGET_HEIGHT)
          == 0)
        {
          items[used].data = img_data;
          items[used].label = label;
          used++;
          printf ("Processed %s\n", filepath);
        }
      else
        {
          free (img_data);
        }
    }

  closedir (dir);
  *samples = items;
  *count = used;
  return 0;

fail:
  // Undo everything collected so far so the caller has nothing to free.
  for (int i = 0; i < used; i++)
    {
      free (items[i].data);
    }
  free (items);
  closedir (dir);
  return -1;
}
// Randomly permute the first `count` entries of `samples` in place using a
// Fisher-Yates shuffle.  rand() is re-seeded from the current time on every
// call.  Arrays with fewer than two entries are left untouched.
void
shuffle_samples (ImageSample *samples, int count)
{
  int upper = count - 1;

  srand (time (NULL));

  // Walk from the last slot down to slot 1, exchanging each slot with a
  // randomly chosen slot at or below it.
  while (upper > 0)
    {
      int pick = rand () % (upper + 1);
      ImageSample held = samples[pick];

      samples[pick] = samples[upper];
      samples[upper] = held;
      upper--;
    }
}
// Save processed dataset to a binary file.
//
// File layout, written with fwrite in the host's native representation:
//   int   count
//   int   dimensions[3] = { TARGET_WIDTH, TARGET_HEIGHT, CHANNELS }
//   then, for each of the `count` samples:
//     int   label
//     float data[TARGET_WIDTH * TARGET_HEIGHT * CHANNELS]
//
// The function returns nothing; failures are reported on stderr.  The
// "Saved ..." confirmation is printed only when every write and the final
// fclose succeeded, so a full disk is not mistaken for success.
void
save_dataset (const char *filename, ImageSample *samples, int count)
{
  const size_t data_size = (size_t) TARGET_WIDTH * TARGET_HEIGHT * CHANNELS;
  const int dimensions[3] = { TARGET_WIDTH, TARGET_HEIGHT, CHANNELS };
  int ok;
  FILE *file = fopen (filename, "wb");

  if (!file)
    {
      fprintf (stderr, "Could not open file %s for writing\n", filename);
      return;
    }

  // Header: number of samples followed by the image dimensions
  ok = fwrite (&count, sizeof count, 1, file) == 1
       && fwrite (dimensions, sizeof dimensions[0], 3, file) == 3;

  // Body: label then pixel data for each sample; stop at the first
  // short write
  for (int i = 0; ok && i < count; i++)
    {
      ok = fwrite (&samples[i].label, sizeof samples[i].label, 1, file) == 1
           && fwrite (samples[i].data, sizeof (float), data_size, file)
                == data_size;
    }

  // fclose flushes buffered data, so its result matters for a write stream
  if (fclose (file) != 0)
    ok = 0;

  if (!ok)
    {
      fprintf (stderr, "Failed to write dataset to %s\n", filename);
      return;
    }
  printf ("Saved %d samples to %s\n", count, filename);
}
// Free the pixel buffer of each of the first `count` samples, then the
// array itself.  Safe to call with a NULL array and a count of 0.
static void
release_samples (ImageSample *samples, int count)
{
  for (int i = 0; i < count; i++)
    {
      free (samples[i].data);
    }
  free (samples);
}

// Entry point: preprocess the hotdog and not-hotdog folders given on the
// command line, shuffle the combined samples, and write an 80/20 split to
// train_data.bin and test_data.bin.  Returns 0 on success and 1 on a usage
// error, a directory-processing failure, an empty dataset, or an allocation
// failure.
int
main (int argc, char *argv[])
{
  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s <hotdog_folder> <nothotdog_folder>\n",
               argv[0]);
      return 1;
    }
  const char *hotdog_dir = argv[1];
  const char *nothotdog_dir = argv[2];
  ImageSample *hotdog_samples = NULL;
  ImageSample *nothotdog_samples = NULL;
  int hotdog_count = 0;
  int nothotdog_count = 0;

  // Process hotdog directory (label 1)
  if (process_directory (hotdog_dir, 1, &hotdog_samples, &hotdog_count) != 0)
    {
      fprintf (stderr, "Failed to process hotdog directory\n");
      return 1;
    }

  // Process not-hotdog directory (label 0)
  if (process_directory (nothotdog_dir, 0, &nothotdog_samples,
                         &nothotdog_count)
      != 0)
    {
      fprintf (stderr, "Failed to process not-hotdog directory\n");
      release_samples (hotdog_samples, hotdog_count);
      return 1;
    }

  // Combine both datasets
  int total_count = hotdog_count + nothotdog_count;
  if (total_count == 0)
    {
      // Nothing to shuffle or save; also avoids a zero-byte malloc whose
      // NULL result would be misreported as an out-of-memory condition.
      fprintf (stderr, "No images found in %s or %s\n", hotdog_dir,
               nothotdog_dir);
      release_samples (hotdog_samples, hotdog_count);
      release_samples (nothotdog_samples, nothotdog_count);
      return 1;
    }

  ImageSample *all_samples = malloc ((size_t) total_count * sizeof *all_samples);
  if (!all_samples)
    {
      fprintf (stderr, "Memory allocation failed for combined dataset\n");
      release_samples (hotdog_samples, hotdog_count);
      release_samples (nothotdog_samples, nothotdog_count);
      return 1;
    }

  // Copy samples to combined array.  Only the structs are copied, so from
  // here on the pixel buffers are reachable through all_samples as well as
  // through the per-class arrays.  Empty sources are skipped rather than
  // handed to memcpy.
  if (hotdog_count > 0)
    memcpy (all_samples, hotdog_samples,
            (size_t) hotdog_count * sizeof *all_samples);
  if (nothotdog_count > 0)
    memcpy (all_samples + hotdog_count, nothotdog_samples,
            (size_t) nothotdog_count * sizeof *all_samples);

  // Shuffle the combined dataset
  shuffle_samples (all_samples, total_count);

  // Split into training (80%) and test (20%) sets
  int train_count = (int) (total_count * 0.8);
  int test_count = total_count - train_count;
  printf ("Total images: %d (Hotdogs: %d, Not-hotdogs: %d)\n", total_count,
          hotdog_count, nothotdog_count);
  printf ("Training set: %d images\n", train_count);
  printf ("Test set: %d images\n", test_count);

  // Save training and test datasets
  save_dataset ("train_data.bin", all_samples, train_count);
  save_dataset ("test_data.bin", all_samples + train_count, test_count);

  // Free memory: each pixel buffer exactly once via the combined array,
  // then the two per-class arrays themselves (not their contents).
  release_samples (all_samples, total_count);
  free (hotdog_samples);
  free (nothotdog_samples);
  printf ("Preprocessing complete!\n");
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment