Last active
August 5, 2024 14:48
-
-
Save ianmcook/79f5ed7c73b0663a35ae325b71f52630 to your computer and use it in GitHub Desktop.
Zero null-masked bytes of a fixed-width array in PyArrow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyarrow as pa
import numpy as np
import pandas as pd

# Demonstration: the value slots behind null entries of a fixed-width Arrow
# array may hold arbitrary bytes. This script overwrites those slots with
# zeros and rebuilds the array from the modified buffers.

# Create an array of some fixed-width type containing nulls
a = pa.array(
    obj=pd.Series([1, 2, 3]),
    type=pa.int64(),
    mask=np.array([1, 0, 1], dtype=bool),
)

# Get the values buffer as a mutable bytearray (b[0] is the validity bitmap,
# b[1] is the values buffer)
b = a.buffers()
v = bytearray(b[1].to_pybytes())

# Every element occupies the same number of bytes in the values buffer
byte_width = a.type.byte_width

# Compute the null mask once, as plain Python bools, instead of rebuilding
# it on every iteration
null_flags = a.is_null().to_pylist()

# For each null-masked value...
for i, masked in enumerate(null_flags):
    if not masked:
        continue
    # ...locate the associated bytes in the bytearray
    bytes_start = i * byte_width
    bytes_end = bytes_start + byte_width
    # Examine the bytes and notice they are not zeroed
    print('Original bytes ' + v[bytes_start:bytes_end].hex(), end=' ')
    # Replace them with zero bytes (bytes(n) is n zero bytes)
    v[bytes_start:bytes_end] = bytes(byte_width)
    print('zeroed')

# Replace the values buffer and reassemble the array
b[1] = pa.py_buffer(v)
a_new = pa.Array.from_buffers(a.type, len(a), b)

# Notice PyArrow says the new array is the same as the original
# (the result is displayed when run in an interactive session)
a_new == a
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment