Skip to content

Instantly share code, notes, and snippets.

@secemp9
Last active March 1, 2025 08:29
Show Gist options
  • Save secemp9/38fa09787bdc99124b644cef9e12754a to your computer and use it in GitHub Desktop.
Save secemp9/38fa09787bdc99124b644cef9e12754a to your computer and use it in GitHub Desktop.
import random

import numpy as np

# Generate random byte data
N = 8  # Number of chunks
byte_data = bytes(random.getrandbits(8) for _ in range(1024))  # 1024 random bytes

# Split into N equal chunks using plain Python slicing
chunk_size = len(byte_data) // N
split_data = [byte_data[i * chunk_size : (i + 1) * chunk_size] for i in range(N)]

# Sort the list of chunks lexicographically: whole chunks are reordered,
# the bytes inside each chunk keep their original order.
sorted_chunks = sorted(split_data)

# Rejoin into a single stream
sorted_byte_array = b"".join(sorted_chunks)


def entropy(data):
    """Return the Shannon entropy (bits/symbol) of an array of uint8 values."""
    counts = np.bincount(data, minlength=256)
    probabilities = counts / np.sum(counts)
    probabilities = probabilities[probabilities > 0]  # 0 * log2(0) treated as 0
    return -np.sum(probabilities * np.log2(probabilities))


def serial_correlation(data):
    """Return the circular lag-1 serial correlation, or None for constant data."""
    total_count = len(data)
    a = np.array(data, np.float64)
    b = np.roll(a, -1)  # a rotated left by one position (last wraps to first)
    scct1 = np.sum(a * b)
    scct2 = np.sum(a) ** 2
    scct3 = np.sum(a * a)
    scc = total_count * scct3 - scct2  # proportional to the variance of a
    if scc == 0:
        return None  # zero variance: the coefficient is undefined
    return (total_count * scct1 - scct2) / scc


# Convert to NumPy arrays for entropy and serial correlation calculations
byte_array = np.frombuffer(byte_data, dtype=np.uint8)
sorted_byte_array_np = np.frombuffer(sorted_byte_array, dtype=np.uint8)

# Metrics for the original stream (previously referenced but never computed here)
original_entropy = entropy(byte_array)
original_serial_corr = serial_correlation(byte_array)

# Recompute entropy and serial correlation for the chunk-reordered data
sorted_entropy = entropy(sorted_byte_array_np)
sorted_serial_corr = serial_correlation(sorted_byte_array_np)

# Display results (bare tuple: echoed by a notebook/REPL)
original_entropy, sorted_entropy, original_serial_corr, sorted_serial_corr

Okay, so for each script I noticed the following output:

sort_key.py output: (7.831844830495034, 7.831844830495034, 0.007238990436380451, 0.005180995806581773)

But this result is actually misleading: the data is indeed sorted (it can be compressed, etc.), and we would see the correct serial correlation if we measured it over the N chunks instead of byte-wise.

same for natural_sort.py: (7.831844830495034, 7.8057683129781035, 0.007238990436380451, -0.014762771757389138)

and for sort_chunk.py we can clearly see that it works better: (7.799797108624771, 7.799797108624771, 0.0435945185010197, 0.9522599356796657)

import numpy as np
import math
# Generate random byte data
N = 8  # Number of chunks
byte_data = np.random.bytes(1024)  # 1024 random bytes

# View the raw bytes as an array of uint8 values (no copy)
byte_array = np.frombuffer(byte_data, dtype=np.uint8)

# Cut the array into N equal pieces, then sort each piece independently
split_data = np.array_split(byte_array, N)
sorted_chunks = list(map(np.sort, split_data))

# Glue the per-chunk-sorted pieces back into a single stream
sorted_byte_array = np.concatenate(sorted_chunks)
# Function to calculate entropy (Shannon entropy)
def entropy(data):
    """Return the Shannon entropy of *data* in bits per symbol.

    Parameters
    ----------
    data : array-like of uint8
        Byte values in the range 0-255.

    Returns
    -------
    float
        Entropy in bits/symbol; 8.0 for a perfectly uniform byte stream.
    """
    counts = np.bincount(data, minlength=256)
    probabilities = counts / np.sum(counts)
    # Drop zero-probability symbols: 0 * log2(0) is treated as 0, and
    # log2(0) would otherwise produce -inf plus a runtime warning.
    probabilities = probabilities[probabilities > 0]
    return -np.sum(probabilities * np.log2(probabilities))
# Function to calculate serial correlation
def serial_correlation(data):
    """Return the circular lag-1 serial correlation coefficient of *data*.

    Each element is paired with its successor, with the last element
    wrapping around to pair with the first (circular lag-1 autocorrelation).

    Parameters
    ----------
    data : array-like of numbers
        Sequence to measure.

    Returns
    -------
    float or None
        Coefficient in [-1, 1], or None when the data has zero variance
        (all values equal), in which case the coefficient is undefined.
    """
    total_count = len(data)
    a = np.array(data, np.float64)
    b = np.roll(a, -1)  # a rotated left by one position (last wraps to first)
    scct1 = np.sum(a * b)
    scct2 = np.sum(a) ** 2
    scct3 = np.sum(a * a)
    scc = total_count * scct3 - scct2  # proportional to the variance of a
    if scc == 0:
        return None  # constant data: denominator is zero
    return (total_count * scct1 - scct2) / scc
# Compute entropy and serial correlation for the original stream and for
# the per-chunk-sorted stream.
original_entropy = entropy(byte_array)
original_serial_corr = serial_correlation(byte_array)
sorted_entropy = entropy(sorted_byte_array)
sorted_serial_corr = serial_correlation(sorted_byte_array)

# Display results (bare tuple: echoed by a notebook/REPL)
original_entropy, sorted_entropy, original_serial_corr, sorted_serial_corr
import numpy as np
# Generate random byte data
N = 8  # Number of chunks
byte_data = np.random.bytes(1024)  # 1024 random bytes

# View the raw bytes as an array of uint8 values (no copy)
byte_array = np.frombuffer(byte_data, dtype=np.uint8)

# Cut the array into N equal pieces
split_data = np.array_split(byte_array, N)

# Reorder the whole chunks by each chunk's leading byte; the bytes inside
# every chunk keep their original order.
sorted_chunks = sorted(split_data, key=lambda piece: piece[0])

# Glue the reordered chunks back into a single stream
sorted_byte_array = np.concatenate(sorted_chunks)
# Function to calculate entropy (Shannon entropy)
def entropy(data):
    """Return the Shannon entropy of *data* in bits per symbol.

    Parameters
    ----------
    data : array-like of uint8
        Byte values in the range 0-255.

    Returns
    -------
    float
        Entropy in bits/symbol; 8.0 for a perfectly uniform byte stream.
    """
    counts = np.bincount(data, minlength=256)
    probabilities = counts / np.sum(counts)
    # Drop zero-probability symbols: 0 * log2(0) is treated as 0, and
    # log2(0) would otherwise produce -inf plus a runtime warning.
    probabilities = probabilities[probabilities > 0]
    return -np.sum(probabilities * np.log2(probabilities))
# Function to calculate serial correlation
def serial_correlation(data):
    """Return the circular lag-1 serial correlation coefficient of *data*.

    Each element is paired with its successor, with the last element
    wrapping around to pair with the first (circular lag-1 autocorrelation).

    Parameters
    ----------
    data : array-like of numbers
        Sequence to measure.

    Returns
    -------
    float or None
        Coefficient in [-1, 1], or None when the data has zero variance
        (all values equal), in which case the coefficient is undefined.
    """
    total_count = len(data)
    a = np.array(data, np.float64)
    b = np.roll(a, -1)  # a rotated left by one position (last wraps to first)
    scct1 = np.sum(a * b)
    scct2 = np.sum(a) ** 2
    scct3 = np.sum(a * a)
    scc = total_count * scct3 - scct2  # proportional to the variance of a
    if scc == 0:
        return None  # constant data: denominator is zero
    return (total_count * scct1 - scct2) / scc
# Compute entropy and serial correlation for the original stream and for
# the chunk-reordered stream.
original_entropy = entropy(byte_array)
original_serial_corr = serial_correlation(byte_array)
sorted_entropy = entropy(sorted_byte_array)
sorted_serial_corr = serial_correlation(sorted_byte_array)

# Display results (bare tuple: echoed by a notebook/REPL)
original_entropy, sorted_entropy, original_serial_corr, sorted_serial_corr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment