Skip to content

Instantly share code, notes, and snippets.

@lemire
Created June 19, 2025 00:37
Show Gist options
  • Save lemire/0ba74779d89329bc13f7af1f9a3ed5c6 to your computer and use it in GitHub Desktop.
Save lemire/0ba74779d89329bc13f7af1f9a3ed5c6 to your computer and use it in GitHub Desktop.
## written with AI, but it looks good enough for a demo.
import re
def generate_bitsets_for_block(block, block_size, state):
    """Classify every character of one block of JavaScript-like source.

    The block is scanned left to right with a small state machine that
    recognizes ``//`` comments, ``/* ... */`` comments and single- or
    double-quoted strings with backslash escapes.  Template literals
    (backticks) and regex literals are not recognized.

    Args:
        block: Text of the block to scan.
        block_size: Length the returned lists are zero-padded to when the
            block is shorter.  Longer blocks are not truncated.
        state: Mutable scanner state carried from one block to the next and
            updated in place.  Required keys: ``'in_single_line_comment'``,
            ``'in_multi_line_comment'``, ``'in_string'`` (``None``,
            ``'single'`` or ``'double'``) and ``'escape_next'``.  Two
            optional keys, ``'pending_star'`` and ``'pending_slash'``, are
            added when a block ends in the first half of a possible ``*/``
            or ``//`` / ``/*`` pair, and removed again when the next
            non-empty block is scanned.

    Returns:
        A dict with the keys ``'comments'``, ``'line_endings'``,
        ``'semicolons'``, ``'whitespace'`` and ``'strings'``.  Each value is
        a list of 0/1 flags, one per character of the block (plus padding):

        * comments: characters inside a comment, delimiters included, as
          well as the newline that ends a ``//`` comment.
        * line_endings: newlines outside strings and ``/* */`` comments.
        * semicolons / whitespace: only outside comments and strings;
          whitespace excludes newlines.
        * strings: characters inside a string, quotes included.

    Known limitation: when a ``//`` or ``/*`` opener is split across two
    blocks, the comment is entered at the start of the second block, but the
    leading ``/`` in the already returned first block keeps a 0 comment bit.
    """
    block_length = len(block)
    comment_bits = [0] * block_length
    line_ending_bits = [0] * block_length
    semicolon_bits = [0] * block_length
    whitespace_bits = [0] * block_length
    string_bits = [0] * block_length

    i = 0

    # Resolve two-character comment delimiters that straddle the boundary
    # with the previous block.  Empty blocks leave the markers untouched so
    # they still apply to the next block that has a first character.
    if block_length:
        star_pending = state.pop('pending_star', False)
        slash_pending = state.pop('pending_slash', False)
        first = block[0]
        if star_pending and state['in_multi_line_comment'] and first == '/':
            # Previous block ended with '*': this '/' completes the '*/'.
            comment_bits[0] = 1
            state['in_multi_line_comment'] = False
            i = 1
        elif slash_pending and not (state['in_single_line_comment']
                                    or state['in_multi_line_comment']
                                    or state['in_string']):
            # Previous block ended with a bare '/': it may open a comment.
            if first == '/':
                state['in_single_line_comment'] = True
                comment_bits[0] = 1
                i = 1
            elif first == '*':
                state['in_multi_line_comment'] = True
                comment_bits[0] = 1
                i = 1

    while i < block_length:
        char = block[i]

        # Inside a comment or string: only look for its terminator.
        if state['in_single_line_comment']:
            comment_bits[i] = 1
            if char == '\n':
                state['in_single_line_comment'] = False
                line_ending_bits[i] = 1
            i += 1
            continue
        elif state['in_multi_line_comment']:
            comment_bits[i] = 1
            if char == '*':
                if i + 1 == block_length:
                    # The closing '/' may be the next block's first char.
                    state['pending_star'] = True
                elif block[i + 1] == '/':
                    comment_bits[i + 1] = 1
                    state['in_multi_line_comment'] = False
                    i += 1  # also consume the '/'
            i += 1
            continue
        elif state['in_string']:
            string_bits[i] = 1
            if state['escape_next']:
                # Escaped character: never terminates the string.
                state['escape_next'] = False
            elif char == '\\':
                state['escape_next'] = True
            elif (char == '"' and state['in_string'] == 'double') or \
                 (char == '\'' and state['in_string'] == 'single'):
                state['in_string'] = None
            i += 1
            continue

        # Not in a comment or string: check whether one starts here.
        if char == '/':
            if i + 1 == block_length:
                # Cannot tell yet whether this opens a comment.
                state['pending_slash'] = True
            elif block[i + 1] == '/':
                state['in_single_line_comment'] = True
                comment_bits[i] = comment_bits[i + 1] = 1
                i += 2
                continue
            elif block[i + 1] == '*':
                state['in_multi_line_comment'] = True
                comment_bits[i] = comment_bits[i + 1] = 1
                i += 2
                continue
        elif char == '"':
            state['in_string'] = 'double'
            string_bits[i] = 1
            i += 1
            continue
        elif char == '\'':
            state['in_string'] = 'single'
            string_bits[i] = 1
            i += 1
            continue

        # Plain code character.
        if char == '\n':
            line_ending_bits[i] = 1
        elif char == ';':
            semicolon_bits[i] = 1
        elif char.isspace():
            whitespace_bits[i] = 1

        i += 1

    # Pad every bitset to block_size when the block is shorter.
    if block_length < block_size:
        padding = [0] * (block_size - block_length)
        comment_bits.extend(padding)
        line_ending_bits.extend(padding)
        semicolon_bits.extend(padding)
        whitespace_bits.extend(padding)
        string_bits.extend(padding)

    return {
        'comments': comment_bits,
        'line_endings': line_ending_bits,
        'semicolons': semicolon_bits,
        'whitespace': whitespace_bits,
        'strings': string_bits
    }
def process_document(js_code, block_size=128):
    """Scan *js_code* in fixed-size blocks and print each block's bitsets.

    For every block the text is printed (newlines shown as ``|``), followed
    by one line of 0/1 digits per feature returned by
    ``generate_bitsets_for_block``, then a blank separator line.

    Args:
        js_code: Source text to scan.
        block_size: Number of characters per block; the last block may be
            shorter.

    Returns:
        The scanner state dict as it stands after the last block.
    """
    scanner_state = {
        'in_single_line_comment': False,
        'in_multi_line_comment': False,
        'in_string': None,  # None, 'single', or 'double'
        'escape_next': False,
    }

    offsets = range(0, len(js_code), block_size)
    for number, offset in enumerate(offsets, start=1):
        chunk = js_code[offset:offset + block_size]

        # Newlines would break the column alignment, so show them as '|'.
        header = f"Block {number} content:".ljust(20)
        print(header + chunk.replace('\n', '|'))

        # The same state dict is reused so constructs can span blocks.
        per_feature = generate_bitsets_for_block(chunk, block_size, scanner_state)

        print("Bitsets:")
        for feature, flags in per_feature.items():
            label = f"{feature} bitset:".ljust(20)
            print(label + ''.join(map(str, flags)))
        print()  # Blank line between blocks

    return scanner_state
# Example usage: run the block scanner over an embedded sample document.
if __name__ == "__main__":
    # Large JavaScript example (with comments and semicolons).  It mixes
    # // and /* */ comments, single- and double-quoted strings, escaped
    # quotes and semicolons inside strings.
    # NOTE(review): the email values read "[email protected]", which looks
    # like an obfuscation placeholder rather than real sample data - confirm.
    js_code = """// Large JavaScript file for bitset indexing
// Generated on 2025-06-18
function processUsers() {
// Dataset of users with detailed info
const users = [
{
id: 1,
name: "Alice Marie Smith",
email: "[email protected]",
/* Contact details */
phone: "555-0101",
address: "123 Maple Street, Boston, MA",
active: true
};,
{
id: 2,
name: "Bob Edward Johnson",
email: "[email protected]",
phone: "555-0102",
address: "456 Oak Avenue, Chicago, IL",
active: false
};,
{
id: 3,
name: "Carol Ann Williams",
email: "[email protected]",
phone: "555-0103",
address: "789 Pine Road, Seattle, WA",
active: true
};,
{
id: 4,
name: "David Lee Brown",
email: "[email protected]",
phone: "555-0104",
/* Extended info */
address: "101 Elm Street, Austin, TX",
active: false,
notes: "Prefers email contact"
};,
{
id: 5,
name: "Emma Jane Davis",
email: "[email protected]",
phone: "555-0105",
address: "202 Birch Lane, Denver, CO",
active: true,
notes: 'Uses single quotes'
};,
{
id: 6,
name: "Frank Thomas Wilson",
email: "[email protected]",
phone: "555-0106",
address: "303 Cedar Court, Miami, FL",
active: false
};
];
// Process each user
for (let i = 0; i < users.length; i++) {
const user = users[i];
/* Log user info */
console.log("Processing user: " + user.name);
if (user.active) {
sendEmail(user.email, "Welcome back!", "You're active!");
} else {
console.log("Inactive user: " + user.email);
}
}
return users.length;
}
// Helper function to send emails
function sendEmail(to, subject, body) {
// Simulate email sending
console.log(`Sending email to ${to}`);
console.log(`Subject: "${subject}"`);
console.log(`Body: "${body}"`);
return true;
}
// Format address
function formatAddress(address) {
// Handle escaped quotes
const safeAddress = address.replace("\"", "\\\"");
return "Address: " + safeAddress;
}
/* Main execution */
function main() {
// Initialize app
console.log("Starting user processing...");
const count = processUsers();
console.log("Processed " + count + " users.");
// Example address formatting
const testAddress = "123 Maple Street, Boston, MA";
console.log(formatAddress(testAddress));
// Additional test data
const metadata = {
created: "2025-06-18",
version: "2.0",
author: "Grok 3",
/* App info */
description: "User management app"
};
console.log("Metadata: ", metadata);
// Loop to add more content
for (let i = 0; i < 3; i++) {
console.log("Iteration " + i + ";");
/* Spacer comment */
console.log(" Spacer line ");
}
}
// Run the app
main();"""
    # Process the document with the default block size; this prints every
    # block and its bitsets, and returns the scanner state after the last one.
    final_state = process_document(js_code)
    print("Final state:", final_state)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment