Created
December 1, 2024 18:55
-
-
Save haseeb-heaven/75a50d9391bfca3e5971be8d5778a0c8 to your computer and use it in GitHub Desktop.
Advanced File Type Detection Program like GNU file type detection tool
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Advanced file type detection program, modeled on the GNU `file` utility.
// Note: this is a simplified version and does not cover all possible file types.
// Created by: HeavenHM. | |
// Date: 2024-01-12 | |
#include <algorithm>
#include <array>
#include <cctype>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <ctime>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
namespace fs = std::filesystem; | |
// Every file category the detector can report.
//
// The numeric value of each enumerator is used as the index into
// `fileTypeDescriptions` and is the integer read from the signature
// configuration file, so the values are spelled out and must stay stable.
enum class FileType {
    // Text files
    ASCII_Text = 0,
    UTF8_Text = 1,
    UTF16_LE_Text = 2,
    UTF16_BE_Text = 3,
    C_Source = 4,
    Cpp_Source = 5,
    Python_Script = 6,
    Shell_Script = 7,
    HTML_Document = 8,
    XML_Document = 9,
    JSON_Document = 10,

    // Executables
    PE_Executable = 11,        // Windows Portable Executable
    ELF_Executable = 12,       // Linux Executable and Linkable Format
    MachO_Executable_32 = 13,  // macOS Mach-O 32-bit executable
    MachO_Executable_64 = 14,  // macOS Mach-O 64-bit executable
    Java_Bytecode = 15,
    DOT_NET_Assembly = 16,

    // Archives and compressed files
    Zip_Archive = 17,
    Rar_Archive = 18,
    Gzip_Compressed = 19,
    Tar_Archive = 20,
    Bzip2_Compressed = 21,

    // Media files
    JPEG_Image = 22,
    PNG_Image = 23,
    GIF_Image = 24,
    BMP_Image = 25,
    MP3_Audio = 26,
    WAV_Audio = 27,
    MP4_Video = 28,
    MKV_Video = 29,

    // Documents
    PDF_Document = 30,
    MS_Word_Document = 31,
    MS_Excel_Spreadsheet = 32,
    MS_PowerPoint_Presentation = 33,
    OpenDocument_Text = 34,
    OpenDocument_Spreadsheet = 35,

    // Anything that could not be classified
    Unknown = 36
};
// Human-readable description for each FileType, stored at the position equal
// to the enumerator's numeric value (the trailing comments give the index of
// the first entry in each group).
constexpr std::array<const char*, 37> fileTypeDescriptions = {{
    // Text files (0-10)
    "ASCII text",
    "UTF-8 Unicode text",
    "UTF-16 Unicode text, Little-endian",
    "UTF-16 Unicode text, Big-endian",
    "C source code",
    "C++ source code",
    "Python script",
    "Shell script",
    "HTML document",
    "XML document",
    "JSON document",

    // Executables (11-16)
    "PE32 executable (Windows)",
    "ELF executable (Linux)",
    "Mach-O 32-bit executable (macOS)",
    "Mach-O 64-bit executable (macOS)",
    "Java bytecode",
    ".NET assembly",

    // Archives and compressed files (17-21)
    "Zip archive data",
    "RAR archive data",
    "gzip compressed data",
    "tar archive",
    "bzip2 compressed data",

    // Media files (22-29)
    "JPEG image data",
    "PNG image data",
    "GIF image data",
    "BMP image data",
    "MP3 audio",
    "WAV audio",
    "MP4 video",
    "Matroska video",

    // Documents (30-35)
    "PDF document",
    "Microsoft Word document",
    "Microsoft Excel spreadsheet",
    "Microsoft PowerPoint presentation",
    "OpenDocument text",
    "OpenDocument spreadsheet",

    // Fallback (36)
    "Unknown"
}};
// Looks up the description text for `fileType` in `fileTypeDescriptions`.
// Values whose numeric index falls outside the table yield "Unknown".
constexpr const char* getFileTypeDescription(FileType fileType) {
    const int position = static_cast<int>(fileType);
    const bool outOfRange =
        position < 0 || position >= static_cast<int>(fileTypeDescriptions.size());
    return outOfRange ? "Unknown" : fileTypeDescriptions[position];
}
// Structure to hold file signatures | |
struct Signature { | |
std::vector<uint8_t> data; | |
FileType type; | |
std::string description; | |
}; | |
// Simple append-only logger writing to "file_type_detector.log" in the
// current working directory.
//
// Access the single instance through Logger::instance(). The type is
// non-copyable so that no second Logger object can be created from it.
class Logger {
public:
    enum class Level { INFO, WARNING, ERROR };

    // Returns the process-wide logger instance.
    static Logger& instance() {
        static Logger logger;
        return logger;
    }

    Logger(const Logger&) = delete;
    Logger& operator=(const Logger&) = delete;

    // Appends one line of the form "[<timestamp>] [<LEVEL>] <message>".
    // The log file is opened for each call; if it cannot be opened the
    // message is silently dropped (logging is best-effort).
    void log(Level level, const std::string& message) {
        std::ofstream logFile("file_type_detector.log", std::ios::app);
        if (logFile) {
            logFile << "[" << currentDateTime() << "] [" << levelToString(level) << "] " << message << "\n";
        }
    }

private:
    Logger() = default;

    // Formats the current local time as "%Y-%m-%d %X".
    static std::string currentDateTime() {
        const auto now = std::chrono::system_clock::now();
        const std::time_t in_time_t = std::chrono::system_clock::to_time_t(now);
        std::tm buf{};
#if defined(_WIN32)
        // The Microsoft CRT has no localtime_r; localtime_s takes the
        // arguments in the opposite order.
        localtime_s(&buf, &in_time_t);
#else
        localtime_r(&in_time_t, &buf);
#endif
        std::ostringstream oss;
        oss << std::put_time(&buf, "%Y-%m-%d %X");
        return oss.str();
    }

    // Maps a Level to the tag printed in the log line.
    static const char* levelToString(Level level) {
        switch (level) {
            case Level::INFO:    return "INFO";
            case Level::WARNING: return "WARNING";
            case Level::ERROR:   return "ERROR";
            default:             return "UNKNOWN";
        }
    }
};
// Class to detect file type based on signatures | |
class FileSignatureDetector { | |
public: | |
explicit FileSignatureDetector(const std::string& configFilePath) { | |
loadSignatures(configFilePath); | |
} | |
FileType detect(const std::string& filePath, std::string& description) { | |
size_t maxSignatureLength = getMaxSignatureLength(); | |
auto signature = readFileSignature(filePath, maxSignatureLength); | |
for (const auto& sig : signatureMap) { | |
if (signature.size() >= sig.data.size() && | |
std::equal(sig.data.begin(), sig.data.end(), signature.begin())) { | |
description = sig.description; | |
return sig.type; | |
} | |
} | |
description = "Unknown"; | |
return FileType::Unknown; | |
} | |
private: | |
std::vector<Signature> signatureMap; | |
void loadSignatures(const std::string& configFilePath) { | |
try { | |
std::ifstream configFile(configFilePath); | |
if (!configFile) { | |
throw std::runtime_error("Cannot open configuration file: " + configFilePath); | |
} | |
std::string line; | |
while (std::getline(configFile, line)) { | |
if (line.empty() || line[0] == '#') { | |
continue; // Skip empty lines and comments | |
} | |
Signature sig; | |
std::istringstream iss(line); | |
std::string dataStr; | |
int typeInt; | |
if (!(iss >> dataStr >> typeInt)) { | |
throw std::runtime_error("Invalid signature format in configuration file."); | |
} | |
sig.type = static_cast<FileType>(typeInt); | |
sig.description = getFileTypeDescription(sig.type); | |
// Convert hex string to bytes | |
std::vector<uint8_t> dataBytes; | |
for (size_t i = 0; i < dataStr.length(); i += 2) { | |
std::string byteString = dataStr.substr(i, 2); | |
uint8_t byte = static_cast<uint8_t>(std::stoi(byteString, nullptr, 16)); | |
dataBytes.push_back(byte); | |
} | |
sig.data = dataBytes; | |
signatureMap.push_back(sig); | |
} | |
Logger::instance().log(Logger::Level::INFO, "Loaded " + std::to_string(signatureMap.size()) + " signatures from " + configFilePath); | |
} | |
catch (const std::exception& ex) { | |
Logger::instance().log(Logger::Level::ERROR, "Error loading signatures: " + std::string(ex.what())); | |
throw; | |
} | |
} | |
size_t getMaxSignatureLength() const { | |
size_t maxLength = 0; | |
for (const auto& sig : signatureMap) { | |
if (sig.data.size() > maxLength) { | |
maxLength = sig.data.size(); | |
} | |
} | |
return maxLength; | |
} | |
std::vector<uint8_t> readFileSignature(const std::string& filePath, size_t maxLength) { | |
std::vector<uint8_t> buffer; | |
try { | |
std::ifstream file(filePath, std::ios::binary); | |
if (!file) { | |
throw std::runtime_error("Cannot open file: " + filePath); | |
} | |
buffer.resize(maxLength); | |
file.read(reinterpret_cast<char*>(buffer.data()), maxLength); | |
buffer.resize(static_cast<size_t>(file.gcount())); | |
} | |
catch (const std::exception& ex) { | |
Logger::instance().log(Logger::Level::ERROR, "Error reading file signature: " + std::string(ex.what())); | |
throw; | |
} | |
return buffer; | |
} | |
}; | |
// Class to detect text encoding | |
class TextEncodingDetector { | |
public: | |
FileType detect(const std::string& filePath) { | |
try { | |
std::ifstream file(filePath, std::ios::binary); | |
if (!file) { | |
throw std::runtime_error("Cannot open file: " + filePath); | |
} | |
char ch1 = 0, ch2 = 0; | |
file.get(ch1); | |
file.get(ch2); | |
// Check for BOM | |
if (static_cast<uint8_t>(ch1) == 0xFF && static_cast<uint8_t>(ch2) == 0xFE) { | |
return FileType::UTF16_LE_Text; | |
} | |
else if (static_cast<uint8_t>(ch1) == 0xFE && static_cast<uint8_t>(ch2) == 0xFF) { | |
return FileType::UTF16_BE_Text; | |
} | |
// Reset file to beginning | |
file.clear(); | |
file.seekg(0, std::ios::beg); | |
char ch = 0; | |
bool isAscii = true; | |
bool isUtf8 = true; | |
while (file.get(ch)) { | |
uint8_t byte = static_cast<uint8_t>(ch); | |
if (byte > 0x7F) { | |
isAscii = false; | |
// Simple UTF-8 validation | |
if ((byte & 0xC0) != 0x80 && (byte & 0xE0) != 0xC0 && | |
(byte & 0xF0) != 0xE0 && (byte & 0xF8) != 0xF0) { | |
isUtf8 = false; | |
break; | |
} | |
} | |
} | |
if (isAscii) { | |
return FileType::ASCII_Text; | |
} | |
else if (isUtf8) { | |
return FileType::UTF8_Text; | |
} | |
else { | |
return FileType::Unknown; | |
} | |
} | |
catch (const std::exception& ex) { | |
Logger::instance().log(Logger::Level::ERROR, "Error detecting text encoding: " + std::string(ex.what())); | |
throw; | |
} | |
} | |
}; | |
// Returns true when `value` finishes with the characters of `ending`.
// An empty `ending` matches every string.
bool ends_with(const std::string& value, const std::string& ending) {
    const auto suffixLength = ending.size();
    return suffixLength <= value.size() &&
           value.compare(value.size() - suffixLength, suffixLength, ending) == 0;
}
// Returns the base name of the interpreter named on a "#!" line, e.g.
// "python3" for "#!/usr/bin/env python3" or "sh" for "#! /bin/sh -e".
// When the interpreter is `env`, the first following token that is neither
// an option ("-S") nor an assignment ("VAR=value") is used instead; this is
// a heuristic and does not model options of `env` that take a separate
// argument. Returns an empty string if no interpreter is present.
// Precondition: `firstLine` starts with "#!".
static std::string shebangInterpreterName(const std::string& firstLine) {
    std::istringstream tokens(firstLine.substr(2));
    std::string token;
    bool afterEnv = false;
    while (tokens >> token) {
        if (afterEnv && (token[0] == '-' || token.find('=') != std::string::npos)) {
            continue;
        }
        const std::size_t slash = token.find_last_of('/');
        const std::string baseName = (slash == std::string::npos) ? token : token.substr(slash + 1);
        if (!afterEnv && baseName == "env") {
            afterEnv = true;
            continue;
        }
        return baseName;
    }
    return "";
}

// Classifies a text file from its first line and its file name.
//
// A "#!" line whose interpreter name contains "python" yields Python_Script;
// one whose interpreter name ends in "sh" (sh, bash, zsh, ...) yields
// Shell_Script. Only the interpreter's base name is examined, so "sh" inside
// a directory name does not count. Any other first line falls through to the
// case-sensitive extension checks on `filePath`:
//   .c -> C_Source; .cpp/.cc/.cxx -> Cpp_Source; .py -> Python_Script;
//   .sh -> Shell_Script; .json -> JSON_Document;
//   .html/.htm -> HTML_Document; .xml -> XML_Document.
// Returns FileType::Unknown when nothing matches. Exceptions are logged and
// rethrown.
FileType detectScriptType(const std::string& filePath, const std::string& firstLine) {
    try {
        if (firstLine.compare(0, 2, "#!") == 0) {
            const std::string interpreter = shebangInterpreterName(firstLine);
            if (interpreter.find("python") != std::string::npos) {
                return FileType::Python_Script;
            }
            if (ends_with(interpreter, "sh")) {
                return FileType::Shell_Script;
            }
        }
        if (ends_with(filePath, ".c")) {
            return FileType::C_Source;
        }
        if (ends_with(filePath, ".cpp") || ends_with(filePath, ".cc") || ends_with(filePath, ".cxx")) {
            return FileType::Cpp_Source;
        }
        if (ends_with(filePath, ".py")) {
            return FileType::Python_Script;
        }
        if (ends_with(filePath, ".sh")) {
            return FileType::Shell_Script;
        }
        if (ends_with(filePath, ".json")) {
            return FileType::JSON_Document;
        }
        // These two types are declared in FileType but were never produced.
        if (ends_with(filePath, ".html") || ends_with(filePath, ".htm")) {
            return FileType::HTML_Document;
        }
        if (ends_with(filePath, ".xml")) {
            return FileType::XML_Document;
        }
        return FileType::Unknown;
    }
    catch (const std::exception& ex) {
        Logger::instance().log(Logger::Level::ERROR, "Error detecting script type: " + std::string(ex.what()));
        throw;
    }
}
// Function to extract the original file name from a GZIP file | |
std::string getGzipOriginalFileName(const std::string& filePath) { | |
try { | |
std::ifstream file(filePath, std::ios::binary); | |
if (!file) { | |
Logger::instance().log(Logger::Level::WARNING, "Cannot open GZIP file: " + filePath); | |
return ""; | |
} | |
// Read the first 10 bytes | |
std::array<uint8_t, 10> header{}; | |
file.read(reinterpret_cast<char*>(header.data()), header.size()); | |
if (header[0] != 0x1F || header[1] != 0x8B) { | |
Logger::instance().log(Logger::Level::WARNING, "Not a valid GZIP file: " + filePath); | |
return ""; | |
} | |
uint8_t flags = header[3]; | |
// Skip extra fields if present | |
if (flags & 0x04) { | |
uint16_t xlen = 0; | |
file.read(reinterpret_cast<char*>(&xlen), 2); | |
file.seekg(xlen, std::ios::cur); | |
} | |
// Read original file name if present | |
if (flags & 0x08) { | |
std::string originalName; | |
char ch; | |
while (file.get(ch) && ch != '\0') { | |
originalName += ch; | |
} | |
return originalName; | |
} | |
return ""; | |
} | |
catch (const std::exception& ex) { | |
Logger::instance().log(Logger::Level::ERROR, "Error extracting GZIP original file name: " + std::string(ex.what())); | |
return ""; | |
} | |
} | |
// Determines the type of `filePath` and writes a human-readable description
// to `description`.
//
// Detection order:
//   1. signature match via `signatureDetector` (for gzip data the stored
//      original file name, if any, is appended to the description);
//   2. script/source detection from the first line and the file name;
//   3. text encoding detection via `encodingDetector`.
// If none succeeds, returns FileType::Unknown with description "data".
// Exceptions from the detectors are logged and rethrown.
FileType determineFileType(const std::string& filePath, std::string& description, FileSignatureDetector& signatureDetector, TextEncodingDetector& encodingDetector) {
    try {
        // First, check the file signature.
        const FileType fileType = signatureDetector.detect(filePath, description);
        if (fileType == FileType::Gzip_Compressed) {
            const std::string originalName = getGzipOriginalFileName(filePath);
            if (!originalName.empty()) {
                description += ", was \"" + originalName + "\"";
            }
        }
        if (fileType != FileType::Unknown) {
            return fileType;
        }

        // Read the first line for script detection. The read is capped so a
        // large file that contains no newline is not loaded into memory.
        std::ifstream file(filePath);
        if (file) {
            constexpr std::size_t kMaxFirstLineLength = 1024;
            std::string firstLine;
            char ch = 0;
            while (firstLine.size() < kMaxFirstLineLength && file.get(ch) && ch != '\n') {
                firstLine += ch;
            }
            const FileType scriptType = detectScriptType(filePath, firstLine);
            if (scriptType != FileType::Unknown) {
                description = getFileTypeDescription(scriptType);
                return scriptType;
            }
        }

        // Check for plain text.
        const FileType textType = encodingDetector.detect(filePath);
        if (textType != FileType::Unknown) {
            description = getFileTypeDescription(textType);
            return textType;
        }

        // If all else fails, report generic data.
        description = "data";
        return FileType::Unknown;
    }
    catch (const std::exception& ex) {
        Logger::instance().log(Logger::Level::ERROR, "Error determining file type: " + std::string(ex.what()));
        throw;
    }
}
int main(int argc, char* argv[]) { | |
// Initialize logging | |
Logger::instance().log(Logger::Level::INFO, "File Type Detector started."); | |
if (argc != 2) { | |
Logger::instance().log(Logger::Level::ERROR, "Usage: file <file_path>"); | |
std::cerr << "Usage: file <file_path>\n"; | |
return 1; | |
} | |
try { | |
std::string filePath = argv[1]; | |
std::string configFilePath = "signatures.cfg"; | |
FileSignatureDetector signatureDetector(configFilePath); | |
TextEncodingDetector encodingDetector; | |
std::string description; | |
FileType fileType = determineFileType(filePath, description, signatureDetector, encodingDetector); | |
std::cout << filePath << ": " << description << "\n"; | |
Logger::instance().log(Logger::Level::INFO, "File: " + filePath + ", Type: " + description); | |
} | |
catch (const std::exception& exception) { | |
Logger::instance().log(Logger::Level::ERROR, "Error: " + std::string(exception.what())); | |
std::cerr << "file: " << argv[1] << ": " << exception.what() << "\n"; | |
return 1; | |
} | |
Logger::instance().log(Logger::Level::INFO, "File Type Detector finished."); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
# signatures.cfg - one signature per line: <leading bytes as hex> <FileType enum index>
4D5A 11
7F454C46 12
FEEDFACE 13
FEEDFACF 14
CAFEBABE 15
504B0304 17
1F8B 19
25504446 30
FFD8FF 22
89504E47 23
47494638 24
424D 25
494433 26
52494646 27