Skip to content

Instantly share code, notes, and snippets.

@haseeb-heaven
Created December 1, 2024 18:55
Show Gist options
  • Save haseeb-heaven/75a50d9391bfca3e5971be8d5778a0c8 to your computer and use it in GitHub Desktop.
Save haseeb-heaven/75a50d9391bfca3e5971be8d5778a0c8 to your computer and use it in GitHub Desktop.
Advanced File Type Detection Program like GNU file type detection tool
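// Advanced file type detection program, modeled on the GNU `file` utility.
// Note: This is a simplified version. It recognizes only the signatures listed in
// signatures.cfg plus a few text-encoding and script heuristics, so it does not cover all possible file types.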
// Created by: HeavenHM.
// Date: 2024-01-12
#include <algorithm>
#include <array>
#include <cctype>
#include <chrono>
#include <cstdint>
#include <ctime>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
namespace fs = std::filesystem;
// Every file type the detector can report.
//
// The numeric value of each enumerator is significant: the signature
// configuration file stores types as integers that are cast straight to
// FileType, and the same value indexes the description table. The values are
// therefore spelled out explicitly; append new types just before Unknown and
// keep the description table in step.
enum class FileType {
    // ---- Text files ----
    ASCII_Text                 = 0,
    UTF8_Text                  = 1,
    UTF16_LE_Text              = 2,
    UTF16_BE_Text              = 3,
    C_Source                   = 4,
    Cpp_Source                 = 5,
    Python_Script              = 6,
    Shell_Script               = 7,
    HTML_Document              = 8,
    XML_Document               = 9,
    JSON_Document              = 10,

    // ---- Executables ----
    PE_Executable              = 11,  // Windows Portable Executable
    ELF_Executable             = 12,  // Linux Executable and Linkable Format
    MachO_Executable_32        = 13,  // macOS Mach-O 32-bit Executable
    MachO_Executable_64        = 14,  // macOS Mach-O 64-bit Executable
    Java_Bytecode              = 15,
    DOT_NET_Assembly           = 16,

    // ---- Archives and compressed files ----
    Zip_Archive                = 17,
    Rar_Archive                = 18,
    Gzip_Compressed            = 19,
    Tar_Archive                = 20,
    Bzip2_Compressed           = 21,

    // ---- Media files ----
    JPEG_Image                 = 22,
    PNG_Image                  = 23,
    GIF_Image                  = 24,
    BMP_Image                  = 25,
    MP3_Audio                  = 26,
    WAV_Audio                  = 27,
    MP4_Video                  = 28,
    MKV_Video                  = 29,

    // ---- Documents ----
    PDF_Document               = 30,
    MS_Word_Document           = 31,
    MS_Excel_Spreadsheet       = 32,
    MS_PowerPoint_Presentation = 33,
    OpenDocument_Text          = 34,
    OpenDocument_Spreadsheet   = 35,

    // ---- Fallback ----
    Unknown                    = 36
};
// Human-readable description for each FileType, indexed by the enumerator's
// integer value. The entry order must mirror the order of the FileType
// enumerators exactly; the trailing comments give each slot's index.
constexpr std::array<const char*, 37> fileTypeDescriptions{{
    // Text files
    "ASCII text",                           //  0
    "UTF-8 Unicode text",                   //  1
    "UTF-16 Unicode text, Little-endian",   //  2
    "UTF-16 Unicode text, Big-endian",      //  3
    "C source code",                        //  4
    "C++ source code",                      //  5
    "Python script",                        //  6
    "Shell script",                         //  7
    "HTML document",                        //  8
    "XML document",                         //  9
    "JSON document",                        // 10
    // Executables
    "PE32 executable (Windows)",            // 11
    "ELF executable (Linux)",               // 12
    "Mach-O 32-bit executable (macOS)",     // 13
    "Mach-O 64-bit executable (macOS)",     // 14
    "Java bytecode",                        // 15
    ".NET assembly",                        // 16
    // Archives and compressed files
    "Zip archive data",                     // 17
    "RAR archive data",                     // 18
    "gzip compressed data",                 // 19
    "tar archive",                          // 20
    "bzip2 compressed data",                // 21
    // Media files
    "JPEG image data",                      // 22
    "PNG image data",                       // 23
    "GIF image data",                       // 24
    "BMP image data",                       // 25
    "MP3 audio",                            // 26
    "WAV audio",                            // 27
    "MP4 video",                            // 28
    "Matroska video",                       // 29
    // Documents
    "PDF document",                         // 30
    "Microsoft Word document",              // 31
    "Microsoft Excel spreadsheet",          // 32
    "Microsoft PowerPoint presentation",    // 33
    "OpenDocument text",                    // 34
    "OpenDocument spreadsheet",             // 35
    // Fallback
    "Unknown"                               // 36
}};
// Looks up the descriptive string for a FileType.
//
// @param fileType  The type to describe; its integer value is used as the
//                  index into fileTypeDescriptions.
// @return The matching table entry, or "Unknown" when the value lies outside
//         the table (possible because FileType values may come from an
//         unchecked integer cast).
constexpr const char* getFileTypeDescription(FileType fileType) {
    const int slot = static_cast<int>(fileType);
    const int tableSize = static_cast<int>(fileTypeDescriptions.size());
    const bool inTable = (slot >= 0) && (slot < tableSize);
    return inTable ? fileTypeDescriptions[slot] : "Unknown";
}
// Structure to hold file signatures.
// Each Signature pairs a magic-byte prefix with the FileType it identifies;
// FileSignatureDetector builds these from the lines of its configuration file.
struct Signature {
// Magic bytes that are compared against the leading bytes of a file.
std::vector<uint8_t> data;
// File type reported when `data` matches.
FileType type;
// Human-readable text for `type`; the loader fills it via getFileTypeDescription().
std::string description;
};
// Simple logging class.
//
// A process-wide logger reached through Logger::instance(). Every call to
// log() opens "file_type_detector.log" (relative to the current working
// directory) in append mode, writes one line of the form
//   [YYYY-MM-DD HH:MM:SS] [LEVEL] message
// and closes the file again. If the file cannot be opened the message is
// dropped silently, so logging never interferes with detection itself.
class Logger {
public:
    enum class Level { INFO, WARNING, ERROR };

    // The logger is a singleton: copying it would create a second,
    // independent instance, so both copy operations are disabled.
    Logger(const Logger&) = delete;
    Logger& operator=(const Logger&) = delete;

    // Returns the single shared Logger (a function-local static, created on
    // first use).
    static Logger& instance() {
        static Logger logger;
        return logger;
    }

    // Appends one timestamped line to the log file.
    //
    // @param level    Severity tag written between square brackets.
    // @param message  Free-form text written after the tag.
    void log(Level level, const std::string& message) {
        std::ofstream logFile("file_type_detector.log", std::ios::app);
        if (logFile) {
            logFile << "[" << currentDateTime() << "] [" << levelToString(level) << "] " << message << "\n";
        }
    }

private:
    // Kept user-provided (not "= default") so the class is not an aggregate
    // and cannot be brace-initialised from outside.
    Logger() {}

    // Formats the current local time as "%Y-%m-%d %X".
    static std::string currentDateTime() {
        const auto now = std::chrono::system_clock::now();
        const std::time_t stamp = std::chrono::system_clock::to_time_t(now);
        std::tm buf{};
        // Use the re-entrant conversion offered by the platform:
        // localtime_r is POSIX-only, the Microsoft CRT provides localtime_s
        // (with the argument order reversed).
#if defined(_WIN32)
        localtime_s(&buf, &stamp);
#else
        localtime_r(&stamp, &buf);
#endif
        std::ostringstream oss;
        oss << std::put_time(&buf, "%Y-%m-%d %X");
        return oss.str();
    }

    // Maps a Level to the tag text written to the log.
    static std::string levelToString(Level level) {
        switch (level) {
            case Level::INFO:    return "INFO";
            case Level::WARNING: return "WARNING";
            case Level::ERROR:   return "ERROR";
            default:             return "UNKNOWN";
        }
    }
};
// Class to detect file type based on signatures
class FileSignatureDetector {
public:
explicit FileSignatureDetector(const std::string& configFilePath) {
loadSignatures(configFilePath);
}
FileType detect(const std::string& filePath, std::string& description) {
size_t maxSignatureLength = getMaxSignatureLength();
auto signature = readFileSignature(filePath, maxSignatureLength);
for (const auto& sig : signatureMap) {
if (signature.size() >= sig.data.size() &&
std::equal(sig.data.begin(), sig.data.end(), signature.begin())) {
description = sig.description;
return sig.type;
}
}
description = "Unknown";
return FileType::Unknown;
}
private:
std::vector<Signature> signatureMap;
void loadSignatures(const std::string& configFilePath) {
try {
std::ifstream configFile(configFilePath);
if (!configFile) {
throw std::runtime_error("Cannot open configuration file: " + configFilePath);
}
std::string line;
while (std::getline(configFile, line)) {
if (line.empty() || line[0] == '#') {
continue; // Skip empty lines and comments
}
Signature sig;
std::istringstream iss(line);
std::string dataStr;
int typeInt;
if (!(iss >> dataStr >> typeInt)) {
throw std::runtime_error("Invalid signature format in configuration file.");
}
sig.type = static_cast<FileType>(typeInt);
sig.description = getFileTypeDescription(sig.type);
// Convert hex string to bytes
std::vector<uint8_t> dataBytes;
for (size_t i = 0; i < dataStr.length(); i += 2) {
std::string byteString = dataStr.substr(i, 2);
uint8_t byte = static_cast<uint8_t>(std::stoi(byteString, nullptr, 16));
dataBytes.push_back(byte);
}
sig.data = dataBytes;
signatureMap.push_back(sig);
}
Logger::instance().log(Logger::Level::INFO, "Loaded " + std::to_string(signatureMap.size()) + " signatures from " + configFilePath);
}
catch (const std::exception& ex) {
Logger::instance().log(Logger::Level::ERROR, "Error loading signatures: " + std::string(ex.what()));
throw;
}
}
size_t getMaxSignatureLength() const {
size_t maxLength = 0;
for (const auto& sig : signatureMap) {
if (sig.data.size() > maxLength) {
maxLength = sig.data.size();
}
}
return maxLength;
}
std::vector<uint8_t> readFileSignature(const std::string& filePath, size_t maxLength) {
std::vector<uint8_t> buffer;
try {
std::ifstream file(filePath, std::ios::binary);
if (!file) {
throw std::runtime_error("Cannot open file: " + filePath);
}
buffer.resize(maxLength);
file.read(reinterpret_cast<char*>(buffer.data()), maxLength);
buffer.resize(static_cast<size_t>(file.gcount()));
}
catch (const std::exception& ex) {
Logger::instance().log(Logger::Level::ERROR, "Error reading file signature: " + std::string(ex.what()));
throw;
}
return buffer;
}
};
// Class to detect text encoding.
//
// Classifies a file as UTF-16 (by byte-order mark), ASCII, or UTF-8 by
// scanning its bytes. Anything that is not well-formed text under these rules
// is reported as FileType::Unknown.
class TextEncodingDetector {
public:
    // Determines the text encoding of a file.
    //
    // @param filePath  File to inspect.
    // @return UTF16_LE_Text / UTF16_BE_Text when the file starts with the
    //         corresponding BOM; ASCII_Text when every byte is <= 0x7F (an
    //         empty file counts as ASCII); UTF8_Text when the bytes form
    //         well-formed UTF-8; Unknown otherwise, including when a NUL
    //         byte is present.
    // @throws std::runtime_error if the file cannot be opened (logged first).
    FileType detect(const std::string& filePath) const {
        try {
            std::ifstream file(filePath, std::ios::binary);
            if (!file) {
                throw std::runtime_error("Cannot open file: " + filePath);
            }

            // Check for a UTF-16 byte-order mark in the first two bytes.
            char ch1 = 0, ch2 = 0;
            file.get(ch1);
            file.get(ch2);
            const std::uint8_t first = static_cast<std::uint8_t>(ch1);
            const std::uint8_t second = static_cast<std::uint8_t>(ch2);
            if (first == 0xFF && second == 0xFE) {
                return FileType::UTF16_LE_Text;
            }
            if (first == 0xFE && second == 0xFF) {
                return FileType::UTF16_BE_Text;
            }

            // Rewind (clearing any EOF state left by a file shorter than two
            // bytes) and scan the whole file.
            file.clear();
            file.seekg(0, std::ios::beg);

            bool isAscii = true;
            int pending = 0;                 // continuation bytes still expected
            std::uint8_t nextLow = 0x80;     // allowed range of the next continuation byte
            std::uint8_t nextHigh = 0xBF;
            char ch = 0;
            while (file.get(ch)) {
                const std::uint8_t byte = static_cast<std::uint8_t>(ch);

                if (pending > 0) {
                    if (byte < nextLow || byte > nextHigh) {
                        return FileType::Unknown;  // broken multi-byte sequence
                    }
                    // Only the first continuation byte has a narrowed range.
                    nextLow = 0x80;
                    nextHigh = 0xBF;
                    --pending;
                    continue;
                }

                if (byte == 0x00) {
                    return FileType::Unknown;      // NUL bytes mark binary data, not text
                }
                if (byte <= 0x7F) {
                    continue;                      // plain ASCII
                }

                isAscii = false;
                // Lead byte: decide how many continuation bytes follow and
                // narrow the range of the first one to exclude overlong
                // encodings, UTF-16 surrogates and values above U+10FFFF.
                if (byte >= 0xC2 && byte <= 0xDF) {
                    pending = 1;
                }
                else if (byte == 0xE0) {
                    pending = 2;
                    nextLow = 0xA0;
                }
                else if (byte == 0xED) {
                    pending = 2;
                    nextHigh = 0x9F;
                }
                else if (byte >= 0xE1 && byte <= 0xEF) {
                    pending = 2;
                }
                else if (byte == 0xF0) {
                    pending = 3;
                    nextLow = 0x90;
                }
                else if (byte >= 0xF1 && byte <= 0xF3) {
                    pending = 3;
                }
                else if (byte == 0xF4) {
                    pending = 3;
                    nextHigh = 0x8F;
                }
                else {
                    // Stray continuation byte (0x80-0xBF), overlong lead
                    // (0xC0, 0xC1) or a byte that never occurs in UTF-8.
                    return FileType::Unknown;
                }
            }

            if (pending != 0) {
                return FileType::Unknown;          // file ends inside a multi-byte sequence
            }
            return isAscii ? FileType::ASCII_Text : FileType::UTF8_Text;
        }
        catch (const std::exception& ex) {
            Logger::instance().log(Logger::Level::ERROR, "Error detecting text encoding: " + std::string(ex.what()));
            throw;
        }
    }
};
// Reports whether `value` finishes with the exact character sequence `ending`.
// The comparison is case-sensitive; an empty `ending` matches every string.
bool ends_with(const std::string& value, const std::string& ending) {
    const std::size_t tailLength = ending.size();
    if (value.size() < tailLength) {
        return false;
    }
    // Compare the last tailLength characters of value against ending.
    return value.compare(value.size() - tailLength, tailLength, ending) == 0;
}
// Guesses a source/script type from a file's first line and its name.
//
// A "#!" line is consulted first: an interpreter line containing "python"
// yields Python_Script, otherwise one containing "sh" yields Shell_Script.
// If that gives no answer, the file name suffix decides.
//
// @param filePath   Path whose suffix is matched (case-sensitively).
// @param firstLine  First line of the file's content.
// @return The detected type, or FileType::Unknown when nothing matches.
FileType detectScriptType(const std::string& filePath, const std::string& firstLine) {
    // Suffix -> type table, checked top to bottom.
    struct SuffixRule {
        const char* suffix;
        FileType type;
    };
    static const SuffixRule suffixRules[] = {
        {".c",    FileType::C_Source},
        {".cpp",  FileType::Cpp_Source},
        {".cc",   FileType::Cpp_Source},
        {".cxx",  FileType::Cpp_Source},
        {".py",   FileType::Python_Script},
        {".sh",   FileType::Shell_Script},
        {".json", FileType::JSON_Document},
    };

    try {
        // rfind anchored at position 0 succeeds only when the line starts with "#!".
        const bool startsWithShebang = firstLine.rfind("#!", 0) == 0;
        if (startsWithShebang) {
            if (firstLine.find("python") != std::string::npos) {
                return FileType::Python_Script;
            }
            if (firstLine.find("sh") != std::string::npos) {
                return FileType::Shell_Script;
            }
        }

        for (const SuffixRule& rule : suffixRules) {
            if (ends_with(filePath, rule.suffix)) {
                return rule.type;
            }
        }
        return FileType::Unknown;
    }
    catch (const std::exception& ex) {
        Logger::instance().log(Logger::Level::ERROR, "Error detecting script type: " + std::string(ex.what()));
        throw;
    }
}
// Extracts the original file name stored in a GZIP member header.
//
// Reads the fixed 10-byte header, skips the optional "extra" field when its
// flag bit is set, and then reads the NUL-terminated name when the name flag
// bit is set.
//
// @param filePath  Path of the GZIP file.
// @return The stored name, or an empty string when the file cannot be
//         opened, is not a GZIP file, carries no name, or an error occurs.
//         Problems are logged; this function does not throw.
std::string getGzipOriginalFileName(const std::string& filePath) {
    // Bits of the header's flag byte (header[3]).
    constexpr std::uint8_t kFlagExtraField = 0x04;
    constexpr std::uint8_t kFlagOriginalName = 0x08;

    try {
        std::ifstream file(filePath, std::ios::binary);
        if (!file) {
            Logger::instance().log(Logger::Level::WARNING, "Cannot open GZIP file: " + filePath);
            return "";
        }

        // Read the first 10 bytes; a shorter file cannot hold a valid header.
        std::array<std::uint8_t, 10> header{};
        file.read(reinterpret_cast<char*>(header.data()), static_cast<std::streamsize>(header.size()));
        const bool headerComplete = file.gcount() == static_cast<std::streamsize>(header.size());
        if (!headerComplete || header[0] != 0x1F || header[1] != 0x8B) {
            Logger::instance().log(Logger::Level::WARNING, "Not a valid GZIP file: " + filePath);
            return "";
        }
        const std::uint8_t flags = header[3];

        // Skip the extra field if present. Its 16-bit length is stored
        // little-endian in the file, so assemble it byte by byte rather than
        // reading into a host-endian integer.
        if (flags & kFlagExtraField) {
            std::array<std::uint8_t, 2> lengthBytes{};
            file.read(reinterpret_cast<char*>(lengthBytes.data()), static_cast<std::streamsize>(lengthBytes.size()));
            const std::uint16_t xlen = static_cast<std::uint16_t>(lengthBytes[0] | (lengthBytes[1] << 8));
            file.seekg(xlen, std::ios::cur);
        }

        // Read the original file name if present: bytes up to the first NUL
        // (or end of file, if the header is truncated).
        if (flags & kFlagOriginalName) {
            std::string originalName;
            char ch = 0;
            while (file.get(ch) && ch != '\0') {
                originalName += ch;
            }
            return originalName;
        }
        return "";
    }
    catch (const std::exception& ex) {
        Logger::instance().log(Logger::Level::ERROR, "Error extracting GZIP original file name: " + std::string(ex.what()));
        return "";
    }
}
// Determines the type of a file by applying the detectors in order:
//   1. directories are reported as such, without reading them;
//   2. magic-byte signatures (for gzip, the stored original name is appended
//      to the description);
//   3. script/source heuristics based on the first line and the file name;
//   4. text-encoding detection;
//   5. otherwise the file is generic "data".
//
// @param filePath           File to classify.
// @param description        Out: human-readable description of the result.
// @param signatureDetector  Detector holding the loaded signature table.
// @param encodingDetector   Detector used for the text-encoding step.
// @return The detected FileType; FileType::Unknown for directories and for
//         files no step recognises.
// @throws std::exception subclasses raised by the detectors (for example when
//         the file cannot be opened); the error is logged before it propagates.
FileType determineFileType(const std::string& filePath, std::string& description, FileSignatureDetector& signatureDetector, TextEncodingDetector& encodingDetector) {
    try {
        // A directory has no content to sniff; without this check it would
        // fall through to the text detector, which treats "no readable
        // bytes" as ASCII text. The error_code overload does not throw and
        // yields false when the path cannot be examined.
        std::error_code statusError;
        if (std::filesystem::is_directory(filePath, statusError)) {
            description = "directory";
            return FileType::Unknown;
        }

        // First, check file signature
        FileType fileType = signatureDetector.detect(filePath, description);
        // Special handling for GZIP files to extract original file name
        if (fileType == FileType::Gzip_Compressed) {
            const std::string originalName = getGzipOriginalFileName(filePath);
            if (!originalName.empty()) {
                description += ", was \"" + originalName + "\"";
            }
        }
        if (fileType != FileType::Unknown) {
            return fileType;
        }

        // Read first line for scripts
        std::ifstream file(filePath);
        if (file) {
            std::string firstLine;
            std::getline(file, firstLine);
            const FileType scriptType = detectScriptType(filePath, firstLine);
            if (scriptType != FileType::Unknown) {
                description = getFileTypeDescription(scriptType);
                return scriptType;
            }
        }

        // Check for text files
        const FileType textType = encodingDetector.detect(filePath);
        if (textType != FileType::Unknown) {
            description = getFileTypeDescription(textType);
            return textType;
        }

        // If all else fails, it's unknown
        description = "data";
        return FileType::Unknown;
    }
    catch (const std::exception& ex) {
        Logger::instance().log(Logger::Level::ERROR, "Error determining file type: " + std::string(ex.what()));
        throw;
    }
}
int main(int argc, char* argv[]) {
// Initialize logging
Logger::instance().log(Logger::Level::INFO, "File Type Detector started.");
if (argc != 2) {
Logger::instance().log(Logger::Level::ERROR, "Usage: file <file_path>");
std::cerr << "Usage: file <file_path>\n";
return 1;
}
try {
std::string filePath = argv[1];
std::string configFilePath = "signatures.cfg";
FileSignatureDetector signatureDetector(configFilePath);
TextEncodingDetector encodingDetector;
std::string description;
FileType fileType = determineFileType(filePath, description, signatureDetector, encodingDetector);
std::cout << filePath << ": " << description << "\n";
Logger::instance().log(Logger::Level::INFO, "File: " + filePath + ", Type: " + description);
}
catch (const std::exception& exception) {
Logger::instance().log(Logger::Level::ERROR, "Error: " + std::string(exception.what()));
std::cerr << "file: " << argv[1] << ": " << exception.what() << "\n";
return 1;
}
Logger::instance().log(Logger::Level::INFO, "File Type Detector finished.");
return 0;
}
@haseeb-heaven
Copy link
Author

#signatures.cfg

4D5A 11
7F454C46 12
FEEDFACE 13
FEEDFACF 14
CAFEBABE 15
504B0304 17
1F8B 19
25504446 30
FFD8FF 22
89504E47 23
47494638 24
424D 25
494433 26
52494646 27

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment