e-wahyutama · December 24, 2016 00:39 · iamarkdev · Dec 27, 2016
diff --git a/unique_byte_str.cpp b/unique_byte_str.cpp
 // erick[dot]wahyutama[at]gmail.com
 // Ery E. Wahyutama
 #include <unordered_map>
 // https://github.com/json-c/json-c
 #include <json-c/json.h>

 #ifndef byte
 typedef unsigned char byte;
 #endif

 struct key
 {
 	key(byte* _start, size_t _len)
 		: start(_start)
 		, len(_len)
 	{
 	}
 	byte* start;
 	size_t len;
 };

 // value will always allocated in the heaps
 struct value
 {
 	value(size_t _off_start, size_t _len)
 		: off_start(_off_start)
 		, len(_len)
 		, next(nullptr)
 	{
 	}
 	~value()
 	{
 		if (next)
 		{
 			delete next;
 			next = nullptr;
 		}
 	}
 	bool intersect(value* val)
 	{
 		return (off_start + len) > val->off_start;
 	}
 	size_t off_start;
 	size_t len;
 	value* next;
 };

 class cvalue
 {
 public:
 	cvalue(value* val)
 		: m_size(1)
 		, m_data(val)
 	{
 	}
 	~cvalue()
 	{
 		delete m_data;
 	}
 	cvalue(cvalue&& other)
 	{
 		m_data = other.m_data;
 		m_size = other.m_size;
 		other.m_data = nullptr;
 	}
 	// push_front only
 	void push(value* val)
 	{
 		m_size++;
 		val->next = m_data;
 		m_data = val;
 	}
 	size_t size()
 	{
 		return m_size;
 	}
 	value* top()
 	{
 		return m_data;
 	}
 private:
 	size_t m_size;
 	value* m_data;
 };
 namespace {
 	/*
 	**************************************************************************
 	*                                                                        *
 	*          General Purpose Hash Function Algorithms Library              *
 	*                                                                        *
 	* Author: Arash Partow - 2002                                            *
 	* URL: http://www.partow.net                                             *
 	* URL: http://www.partow.net/programming/hashfunctions/index.html        *
 	*                                                                        *
 	* Copyright notice:                                                      *
 	* Free use of the General Purpose Hash Function Algorithms Library is    *
 	* permitted under the guidelines and in accordance with the most current *
 	* version of the Common Public License.                                  *
 	* http://www.opensource.org/licenses/cpl1.0.php                          *
 	*                                                                        *
 	**************************************************************************
 	*/
 	unsigned int APHash(char* str, unsigned int len)
 	{
 		unsigned int hash = 0xAAAAAAAA;
 		unsigned int i = 0;

 		for (i = 0; i < len; str++, i++)
 		{
 			hash ^= ((i & 1) == 0) ? ((hash << 7) ^ (*str) * (hash >> 3)) :
 				(~((hash << 11) + ((*str) ^ (hash >> 5))));
 		}

 		return hash;
 	}
 }
 struct hash_key
 {
 	size_t operator()(const key& _key) const
 	{
 		return APHash((char*)_key.start, _key.len);
 	}
 };

 struct equal_key
 {
 	std::size_t operator()(const key& key1, const key& key2) const
 	{
 		return ((key1.len == key2.len)
 			&& memcmp(key1.start, key2.start, key1.len) == 0);
 	}
 };

 typedef std::unordered_map<key, cvalue, hash_key, equal_key> key_value_map;

 class byte_str
 {
 public:
 	byte_str(byte* buffer, size_t len)
 		: m_buffer(buffer)
 		, m_len(len)
 	{
 		memset(m_vals, 0, sizeof(m_vals));
 		m_maxSubLen = m_len / 2 + ((m_len & 1) ? 1 : 0);
 	}
 	~byte_str()
 	{
 		if (m_json)
 			json_object_put(m_json);
 	}
 	json_object* process()
 	{
 		if (m_maps.empty())
 		{
 			m_json = json_object_new_object();
 			_process_single_byte();
 			_process_bytes();
 		}
 		return m_json;
 	}

 private:
 	void _process_single_byte()
 	{
 		for (size_t indeks = 0; indeks < m_len; ++indeks)
 			m_vals[m_buffer[indeks]] += 1;

 		for (size_t indeks = 0; indeks < m_len; ++indeks)
 		{
 			if (m_vals[indeks] != 0)
 			{
 				if (m_vals[indeks] > 1)
 				{
 					char buffer[2] = {};
 					buffer[0] = indeks;
 					json_object_object_add(m_json, buffer, json_object_new_int(m_vals[indeks]));
 				}
 			}
 		}
 	}
 	void _process_bytes()
 	{
 		// for every possible iteration
 		// it is impossible to have repeatable substring with sub len > max_len/2
 		// so discard it
 		for (size_t iter = 1; iter <= m_maxSubLen; ++iter)
 		{
 			auto sub_len = iter + 1;
 			// for every possible substring
 			// stop if start (pos + len) exceeded sub_len
 			// do the first check really necessary?
 			for (size_t indeks = 0;
 				(indeks < m_len) && (m_len - indeks >= sub_len);
 				++indeks)
 			{
 				auto end = indeks + sub_len;
 				// start from 2 chars as 1 char already processed;
 				// check for every indeks_start -> end inside buffer
 				for (int indeks_start = indeks; indeks_start < end; ++indeks_start)
 				{
 					auto found = m_maps.find({ &m_buffer[indeks_start], sub_len });
 					if (found == m_maps.end())
 						m_maps.insert(std::make_pair(key(&m_buffer[indeks_start], sub_len), cvalue(new value(indeks_start, sub_len))));
 					else
 					{
 						// as long as it is not intersect with previous value
 						// for example with this string (len == 3)
 						// 0xFF 0FF 0xFF 0xFF ...
 						// 1st: 0xFF 0FF 0xFF (0xFF)
 						// 2nd: (0xFF) 0xFF 0FF 0xFF (intersected, hence not valid sequence)
 						// but I believe this case is (might be) rare.
 						if (!(found->second.top()->intersect(&value(indeks_start, sub_len))))
 							found->second.push(new value(indeks_start, sub_len));
 					}

 				}
 			}
 		}
 		for (auto& item : m_maps)
 		{
 			if (item.second.size() > 1)
 			{
 				char* buffer = new char[item.first.len + 1];
 				buffer[item.first.len] = '\0';
 				memcpy(buffer, item.first.start, item.first.len);
 				json_object_object_add(m_json, buffer, json_object_new_int(item.second.size()));
 				delete[] buffer;
 			}
 		}
 	}
 	key_value_map m_maps;
 	size_t m_vals[256]; // hold values
 	byte* m_buffer;
 	size_t m_len;
 	size_t m_maxSubLen;
 	json_object* m_json;
 };

 int main()
 {
 	byte val[] = "this this a test";
 	byte_str bytestr(val, strlen((char*)val));
 	auto json = bytestr.process();
 	printf(json_object_to_json_string(json));
 }
	// erick[dot]wahyutama[at]gmail.com
	// Ery E. Wahyutama
	#include <unordered_map>
	// https://github.com/json-c/json-c
	#include <json-c/json.h>

	#ifndef byte
	typedef unsigned char byte;
	#endif

	struct key
	{
	key(byte* _start, size_t _len)
	: start(_start)
	, len(_len)
	{
	}
	byte* start;
	size_t len;
	};

	// value will always allocated in the heaps
	struct value
	{
	value(size_t _off_start, size_t _len)
	: off_start(_off_start)
	, len(_len)
	, next(nullptr)
	{
	}
	~value()
	{
	if (next)
	{
	delete next;
	next = nullptr;
	}
	}
	bool intersect(value* val)
	{
	return (off_start + len) > val->off_start;
	}
	size_t off_start;
	size_t len;
	value* next;
	};

	class cvalue
	{
	public:
	cvalue(value* val)
	: m_size(1)
	, m_data(val)
	{
	}
	~cvalue()
	{
	delete m_data;
	}
	cvalue(cvalue&& other)
	{
	m_data = other.m_data;
	m_size = other.m_size;
	other.m_data = nullptr;
	}
	// push_front only
	void push(value* val)
	{
	m_size++;
	val->next = m_data;
	m_data = val;
	}
	size_t size()
	{
	return m_size;
	}
	value* top()
	{
	return m_data;
	}
	private:
	size_t m_size;
	value* m_data;
	};
	namespace {
	/*
	**************************************************************************
	* *
	* General Purpose Hash Function Algorithms Library *
	* *
	* Author: Arash Partow - 2002 *
	* URL: http://www.partow.net *
	* URL: http://www.partow.net/programming/hashfunctions/index.html *
	* *
	* Copyright notice: *
	* Free use of the General Purpose Hash Function Algorithms Library is *
	* permitted under the guidelines and in accordance with the most current *
	* version of the Common Public License. *
	* http://www.opensource.org/licenses/cpl1.0.php *
	* *
	**************************************************************************
	*/
	unsigned int APHash(char* str, unsigned int len)
	{
	unsigned int hash = 0xAAAAAAAA;
	unsigned int i = 0;

	for (i = 0; i < len; str++, i++)
	{
	hash ^= ((i & 1) == 0) ? ((hash << 7) ^ (str) (hash >> 3)) :
	(~((hash << 11) + ((*str) ^ (hash >> 5))));
	}

	return hash;
	}
	}
	struct hash_key
	{
	size_t operator()(const key& _key) const
	{
	return APHash((char*)_key.start, _key.len);
	}
	};

	struct equal_key
	{
	std::size_t operator()(const key& key1, const key& key2) const
	{
	return ((key1.len == key2.len)
	&& memcmp(key1.start, key2.start, key1.len) == 0);
	}
	};

	typedef std::unordered_map<key, cvalue, hash_key, equal_key> key_value_map;

	class byte_str
	{
	public:
	byte_str(byte* buffer, size_t len)
	: m_buffer(buffer)
	, m_len(len)
	{
	memset(m_vals, 0, sizeof(m_vals));
	m_maxSubLen = m_len / 2 + ((m_len & 1) ? 1 : 0);
	}
	~byte_str()
	{
	if (m_json)
	json_object_put(m_json);
	}
	json_object* process()
	{
	if (m_maps.empty())
	{
	m_json = json_object_new_object();
	_process_single_byte();
	_process_bytes();
	}
	return m_json;
	}

	private:
	void _process_single_byte()
	{
	for (size_t indeks = 0; indeks < m_len; ++indeks)
	m_vals[m_buffer[indeks]] += 1;

	for (size_t indeks = 0; indeks < m_len; ++indeks)
	{
	if (m_vals[indeks] != 0)
	{
	if (m_vals[indeks] > 1)
	{
	char buffer[2] = {};
	buffer[0] = indeks;
	json_object_object_add(m_json, buffer, json_object_new_int(m_vals[indeks]));
	}
	}
	}
	}
	void _process_bytes()
	{
	// for every possible iteration
	// it is impossible to have repeatable substring with sub len > max_len/2
	// so discard it
	for (size_t iter = 1; iter <= m_maxSubLen; ++iter)
	{
	auto sub_len = iter + 1;
	// for every possible substring
	// stop if start (pos + len) exceeded sub_len
	// do the first check really necessary?
	for (size_t indeks = 0;
	(indeks < m_len) && (m_len - indeks >= sub_len);
	++indeks)
	{
	auto end = indeks + sub_len;
	// start from 2 chars as 1 char already processed;
	// check for every indeks_start -> end inside buffer
	for (int indeks_start = indeks; indeks_start < end; ++indeks_start)
	{
	auto found = m_maps.find({ &m_buffer[indeks_start], sub_len });
	if (found == m_maps.end())
	m_maps.insert(std::make_pair(key(&m_buffer[indeks_start], sub_len), cvalue(new value(indeks_start, sub_len))));
	else
	{
	// as long as it is not intersect with previous value
	// for example with this string (len == 3)
	// 0xFF 0FF 0xFF 0xFF ...
	// 1st: 0xFF 0FF 0xFF (0xFF)
	// 2nd: (0xFF) 0xFF 0FF 0xFF (intersected, hence not valid sequence)
	// but I believe this case is (might be) rare.
	if (!(found->second.top()->intersect(&value(indeks_start, sub_len))))
	found->second.push(new value(indeks_start, sub_len));
	}

	}
	}
	}
	for (auto& item : m_maps)
	{
	if (item.second.size() > 1)
	{
	char* buffer = new char[item.first.len + 1];
	buffer[item.first.len] = '\0';
	memcpy(buffer, item.first.start, item.first.len);
	json_object_object_add(m_json, buffer, json_object_new_int(item.second.size()));
	delete[] buffer;
	}
	}
	}
	key_value_map m_maps;
	size_t m_vals[256]; // hold values
	byte* m_buffer;
	size_t m_len;
	size_t m_maxSubLen;
	json_object* m_json;
	};

	int main()
	{
	byte val[] = "this this a test";
	byte_str bytestr(val, strlen((char*)val));
	auto json = bytestr.process();
	printf(json_object_to_json_string(json));
	}