Created
June 19, 2025 00:37
-
-
Save lemire/0ba74779d89329bc13f7af1f9a3ed5c6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Written with AI assistance; it looks good enough for a demo.
import re | |
def generate_bitsets_for_block(block, block_size, state):
    """Classify every character of one block into per-feature bitsets.

    The scanner recognises ``//`` and ``/* ... */`` comments and single- or
    double-quoted strings (with backslash escapes). Outside comments and
    strings it flags newlines, semicolons and other whitespace.

    Args:
        block: The slice of source text to scan.
        block_size: Width every returned bitset is zero-padded to when
            ``block`` is shorter.
        state: Mutable scanner state shared between consecutive blocks. It
            must contain the keys ``'in_single_line_comment'``,
            ``'in_multi_line_comment'``, ``'in_string'`` (``None``,
            ``'single'`` or ``'double'``) and ``'escape_next'``. It is
            updated in place. A transient ``'pending_char'`` key is added
            only while a two-character comment delimiter is split across a
            block boundary, and removed again by the next non-empty block.

    Returns:
        A dict with the keys ``'comments'``, ``'line_endings'``,
        ``'semicolons'``, ``'whitespace'`` and ``'strings'``; each value is
        a list of 0/1 ints, one per character, padded to ``block_size``.

    Note:
        When ``//`` or ``/*`` is split across two blocks, the comment starts
        at the first character of the second block; the leading ``/`` in the
        previous block's bitset was already emitted and stays unmarked.
    """
    block_length = len(block)
    comment_bits = [0] * block_length
    line_ending_bits = [0] * block_length
    semicolon_bits = [0] * block_length
    whitespace_bits = [0] * block_length
    string_bits = [0] * block_length

    i = 0

    # Resolve a comment delimiter whose first half ended the previous block.
    # Without this, the in-block lookahead below can never see such a token.
    if block_length:
        pending = state.pop('pending_char', None)
        first = block[0]
        in_code = not (state['in_single_line_comment']
                       or state['in_multi_line_comment']
                       or state['in_string'])
        if pending == '*' and state['in_multi_line_comment'] and first == '/':
            # "*" + "/" closes the multi-line comment.
            comment_bits[0] = 1
            state['in_multi_line_comment'] = False
            i = 1
        elif pending == '/' and in_code and first == '/':
            comment_bits[0] = 1
            state['in_single_line_comment'] = True
            i = 1
        elif pending == '/' and in_code and first == '*':
            comment_bits[0] = 1
            state['in_multi_line_comment'] = True
            i = 1

    while i < block_length:
        char = block[i]

        # Continue whatever construct is currently open.
        if state['in_single_line_comment']:
            comment_bits[i] = 1
            if char == '\n':
                # The newline ends the comment and also counts as a line ending.
                state['in_single_line_comment'] = False
                line_ending_bits[i] = 1
            i += 1
            continue
        elif state['in_multi_line_comment']:
            comment_bits[i] = 1
            if char == '*':
                if i + 1 == block_length:
                    # A closing "/" may be the first char of the next block.
                    state['pending_char'] = '*'
                elif block[i + 1] == '/':
                    comment_bits[i + 1] = 1
                    state['in_multi_line_comment'] = False
                    i += 2
                    continue
            i += 1
            continue
        elif state['in_string']:
            string_bits[i] = 1
            if state['escape_next']:
                # This character is escaped; it cannot close the string.
                state['escape_next'] = False
            elif char == '\\':
                state['escape_next'] = True
            elif (char == '"' and state['in_string'] == 'double') or \
                    (char == '\'' and state['in_string'] == 'single'):
                state['in_string'] = None
            i += 1
            continue

        # Not in a comment or string: look for the start of one.
        if char == '/':
            if i + 1 == block_length:
                # The second half of "//" or "/*" may open the next block.
                state['pending_char'] = '/'
            elif block[i + 1] == '/':
                state['in_single_line_comment'] = True
                comment_bits[i] = comment_bits[i + 1] = 1
                i += 2
                continue
            elif block[i + 1] == '*':
                state['in_multi_line_comment'] = True
                comment_bits[i] = comment_bits[i + 1] = 1
                i += 2
                continue
        elif char == '"':
            state['in_string'] = 'double'
            string_bits[i] = 1
            i += 1
            continue
        elif char == '\'':
            state['in_string'] = 'single'
            string_bits[i] = 1
            i += 1
            continue

        # Plain code characters.
        if char == '\n':
            line_ending_bits[i] = 1
        elif char == ';':
            semicolon_bits[i] = 1
        elif char.isspace():
            whitespace_bits[i] = 1
        i += 1

    # Pad every bitset to block_size so a short final block lines up.
    if block_length < block_size:
        padding = [0] * (block_size - block_length)
        comment_bits.extend(padding)
        line_ending_bits.extend(padding)
        semicolon_bits.extend(padding)
        whitespace_bits.extend(padding)
        string_bits.extend(padding)

    return {
        'comments': comment_bits,
        'line_endings': line_ending_bits,
        'semicolons': semicolon_bits,
        'whitespace': whitespace_bits,
        'strings': string_bits
    }
def process_document(js_code, block_size=128):
    """Scan ``js_code`` block by block and print each block's bitsets.

    For every block of ``block_size`` characters this prints the block
    content (newlines shown as ``|`` so the text stays on one line), then one
    line per feature bitset as a string of 0s and 1s, then a blank line.
    Scanner state is carried from one block to the next.

    Args:
        js_code: The source text to index.
        block_size: Number of characters per block; must be at least 1.

    Returns:
        The scanner state dict as it stands after the last block.

    Raises:
        ValueError: If ``block_size`` is smaller than 1. A negative size
            would otherwise yield an empty range and silently skip the
            whole document.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    state = {
        'in_single_line_comment': False,
        'in_multi_line_comment': False,
        'in_string': None,  # None, 'single', or 'double'
        'escape_next': False
    }

    block_starts = range(0, len(js_code), block_size)
    for block_idx, block_start in enumerate(block_starts, start=1):
        block = js_code[block_start:block_start + block_size]

        # Show newlines as "|" so the content aligns with the bitsets below.
        display_block = block.replace('\n', '|')
        print(f"Block {block_idx} content:".ljust(20) + display_block)

        bitsets = generate_bitsets_for_block(block, block_size, state)

        # Labels are padded to 20 columns, matching the content line above.
        print("Bitsets:")
        for feature, bits in bitsets.items():
            bit_string = ''.join(str(bit) for bit in bits)
            print(f"{feature} bitset:".ljust(20) + bit_string)
        print()  # Blank line between blocks

    return state
# Example usage
if __name__ == "__main__":
    # Large JavaScript example (with comments, strings and semicolons).
    # The literal is a raw string so the JavaScript backslash escapes in
    # formatAddress() reach the scanner unchanged; in a normal string Python
    # would collapse "\"" to a bare quote before the scanner ever saw it.
    # NOTE(review): the e-mail values read "[email protected]", which looks
    # like an address-obfuscation placeholder -- confirm the intended text.
    js_code = r"""// Large JavaScript file for bitset indexing
// Generated on 2025-06-18
function processUsers() {
// Dataset of users with detailed info
const users = [
{
id: 1,
name: "Alice Marie Smith",
email: "[email protected]",
/* Contact details */
phone: "555-0101",
address: "123 Maple Street, Boston, MA",
active: true
};,
{
id: 2,
name: "Bob Edward Johnson",
email: "[email protected]",
phone: "555-0102",
address: "456 Oak Avenue, Chicago, IL",
active: false
};,
{
id: 3,
name: "Carol Ann Williams",
email: "[email protected]",
phone: "555-0103",
address: "789 Pine Road, Seattle, WA",
active: true
};,
{
id: 4,
name: "David Lee Brown",
email: "[email protected]",
phone: "555-0104",
/* Extended info */
address: "101 Elm Street, Austin, TX",
active: false,
notes: "Prefers email contact"
};,
{
id: 5,
name: "Emma Jane Davis",
email: "[email protected]",
phone: "555-0105",
address: "202 Birch Lane, Denver, CO",
active: true,
notes: 'Uses single quotes'
};,
{
id: 6,
name: "Frank Thomas Wilson",
email: "[email protected]",
phone: "555-0106",
address: "303 Cedar Court, Miami, FL",
active: false
};
];
// Process each user
for (let i = 0; i < users.length; i++) {
const user = users[i];
/* Log user info */
console.log("Processing user: " + user.name);
if (user.active) {
sendEmail(user.email, "Welcome back!", "You're active!");
} else {
console.log("Inactive user: " + user.email);
}
}
return users.length;
}
// Helper function to send emails
function sendEmail(to, subject, body) {
// Simulate email sending
console.log(`Sending email to ${to}`);
console.log(`Subject: "${subject}"`);
console.log(`Body: "${body}"`);
return true;
}
// Format address
function formatAddress(address) {
// Handle escaped quotes
const safeAddress = address.replace("\"", "\\\"");
return "Address: " + safeAddress;
}
/* Main execution */
function main() {
// Initialize app
console.log("Starting user processing...");
const count = processUsers();
console.log("Processed " + count + " users.");
// Example address formatting
const testAddress = "123 Maple Street, Boston, MA";
console.log(formatAddress(testAddress));
// Additional test data
const metadata = {
created: "2025-06-18",
version: "2.0",
author: "Grok 3",
/* App info */
description: "User management app"
};
console.log("Metadata: ", metadata);
// Loop to add more content
for (let i = 0; i < 3; i++) {
console.log("Iteration " + i + ";");
/* Spacer comment */
console.log(" Spacer line ");
}
}
// Run the app
main();"""
    # Process the document and show the scanner state left after the last block.
    final_state = process_document(js_code)
    print("Final state:", final_state)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment