Skip to content

Instantly share code, notes, and snippets.

@e-wahyutama
Created December 24, 2016 00:39
Show Gist options
  • Save e-wahyutama/5f30e0fd0224701ee7664777fc96ba26 to your computer and use it in GitHub Desktop.
Save e-wahyutama/5f30e0fd0224701ee7664777fc96ba26 to your computer and use it in GitHub Desktop.
// erick[dot]wahyutama[at]gmail.com
// Ery E. Wahyutama
#include <unordered_map>
// https://github.com/json-c/json-c
#include <json-c/json.h>
#ifndef byte
typedef unsigned char byte;
#endif
struct key
{
key(byte* _start, size_t _len)
: start(_start)
, len(_len)
{
}
byte* start;
size_t len;
};
// value will always allocated in the heaps
struct value
{
value(size_t _off_start, size_t _len)
: off_start(_off_start)
, len(_len)
, next(nullptr)
{
}
~value()
{
if (next)
{
delete next;
next = nullptr;
}
}
bool intersect(value* val)
{
return (off_start + len) > val->off_start;
}
size_t off_start;
size_t len;
value* next;
};
class cvalue
{
public:
cvalue(value* val)
: m_size(1)
, m_data(val)
{
}
~cvalue()
{
delete m_data;
}
cvalue(cvalue&& other)
{
m_data = other.m_data;
m_size = other.m_size;
other.m_data = nullptr;
}
// push_front only
void push(value* val)
{
m_size++;
val->next = m_data;
m_data = val;
}
size_t size()
{
return m_size;
}
value* top()
{
return m_data;
}
private:
size_t m_size;
value* m_data;
};
namespace {
/*
**************************************************************************
* *
* General Purpose Hash Function Algorithms Library *
* *
* Author: Arash Partow - 2002 *
* URL: http://www.partow.net *
* URL: http://www.partow.net/programming/hashfunctions/index.html *
* *
* Copyright notice: *
* Free use of the General Purpose Hash Function Algorithms Library is *
* permitted under the guidelines and in accordance with the most current *
* version of the Common Public License. *
* http://www.opensource.org/licenses/cpl1.0.php *
* *
**************************************************************************
*/
unsigned int APHash(char* str, unsigned int len)
{
unsigned int hash = 0xAAAAAAAA;
unsigned int i = 0;
for (i = 0; i < len; str++, i++)
{
hash ^= ((i & 1) == 0) ? ((hash << 7) ^ (*str) * (hash >> 3)) :
(~((hash << 11) + ((*str) ^ (hash >> 5))));
}
return hash;
}
}
struct hash_key
{
size_t operator()(const key& _key) const
{
return APHash((char*)_key.start, _key.len);
}
};
struct equal_key
{
std::size_t operator()(const key& key1, const key& key2) const
{
return ((key1.len == key2.len)
&& memcmp(key1.start, key2.start, key1.len) == 0);
}
};
typedef std::unordered_map<key, cvalue, hash_key, equal_key> key_value_map;
class byte_str
{
public:
byte_str(byte* buffer, size_t len)
: m_buffer(buffer)
, m_len(len)
{
memset(m_vals, 0, sizeof(m_vals));
m_maxSubLen = m_len / 2 + ((m_len & 1) ? 1 : 0);
}
~byte_str()
{
if (m_json)
json_object_put(m_json);
}
json_object* process()
{
if (m_maps.empty())
{
m_json = json_object_new_object();
_process_single_byte();
_process_bytes();
}
return m_json;
}
private:
void _process_single_byte()
{
for (size_t indeks = 0; indeks < m_len; ++indeks)
m_vals[m_buffer[indeks]] += 1;
for (size_t indeks = 0; indeks < m_len; ++indeks)
{
if (m_vals[indeks] != 0)
{
if (m_vals[indeks] > 1)
{
char buffer[2] = {};
buffer[0] = indeks;
json_object_object_add(m_json, buffer, json_object_new_int(m_vals[indeks]));
}
}
}
}
void _process_bytes()
{
// for every possible iteration
// it is impossible to have repeatable substring with sub len > max_len/2
// so discard it
for (size_t iter = 1; iter <= m_maxSubLen; ++iter)
{
auto sub_len = iter + 1;
// for every possible substring
// stop if start (pos + len) exceeded sub_len
// do the first check really necessary?
for (size_t indeks = 0;
(indeks < m_len) && (m_len - indeks >= sub_len);
++indeks)
{
auto end = indeks + sub_len;
// start from 2 chars as 1 char already processed;
// check for every indeks_start -> end inside buffer
for (int indeks_start = indeks; indeks_start < end; ++indeks_start)
{
auto found = m_maps.find({ &m_buffer[indeks_start], sub_len });
if (found == m_maps.end())
m_maps.insert(std::make_pair(key(&m_buffer[indeks_start], sub_len), cvalue(new value(indeks_start, sub_len))));
else
{
// as long as it is not intersect with previous value
// for example with this string (len == 3)
// 0xFF 0FF 0xFF 0xFF ...
// 1st: 0xFF 0FF 0xFF (0xFF)
// 2nd: (0xFF) 0xFF 0FF 0xFF (intersected, hence not valid sequence)
// but I believe this case is (might be) rare.
if (!(found->second.top()->intersect(&value(indeks_start, sub_len))))
found->second.push(new value(indeks_start, sub_len));
}
}
}
}
for (auto& item : m_maps)
{
if (item.second.size() > 1)
{
char* buffer = new char[item.first.len + 1];
buffer[item.first.len] = '\0';
memcpy(buffer, item.first.start, item.first.len);
json_object_object_add(m_json, buffer, json_object_new_int(item.second.size()));
delete[] buffer;
}
}
}
key_value_map m_maps;
size_t m_vals[256]; // hold values
byte* m_buffer;
size_t m_len;
size_t m_maxSubLen;
json_object* m_json;
};
int main()
{
byte val[] = "this this a test";
byte_str bytestr(val, strlen((char*)val));
auto json = bytestr.process();
printf(json_object_to_json_string(json));
}
@iamarkdev
Copy link

Byte processing looks to be O(n^2)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment