
I have a large number n of 3x3 matrices, vectors of length 3, and ints which I need to write to a file in a given binary format. I could easily use a for loop to fh.write() the items one after another, but this is slow. An alternative is to copy the data into an array with a structured dtype. This is much faster, but creates a prohibitively large copy in memory:

import numpy as np

n = 100  # a large number
A = np.random.rand(n, 3, 3)
b = np.random.rand(n, 3)
c = np.ones(n, dtype=int)

# slow
with open("out.dat", "wb") as fh:
    for a_, b_, c_ in zip(A, b, c):
        fh.write(a_)
        fh.write(b_)
        fh.write(c_)

# memory-consuming
dtype = np.dtype([
  ('A', ('<f', (3, 3))),
  ('b', ('<f', 3)),
  ('c', '<H'),
])
data = np.empty(n, dtype=dtype)
data["A"] = A
data["b"] = b
data["c"] = c
with open("out.dat", "wb") as fh:
    data.tofile(fh)

Is there a fast, memory-efficient alternative here?

  • Any reason you stick with numpy rather than something like pickle? And why do you need to write A, b, c element by element rather than write(A), write(b), write(c)? Commented Feb 22, 2021 at 14:25
  • @QuangHoang The output format is given. Commented Feb 22, 2021 at 14:27
  • Did it work? You could do much the same for reading, but I would expect a smaller or no speed advantage there (the gain would mainly be in memory usage). Commented Mar 5, 2021 at 17:01

1 Answer

Writing files block-wise

Note that your first and second versions produce different output: the loop writes the arrays' raw float64/int bytes, while the structured dtype converts to little-endian float32 ('<f') and uint16 ('<H'). I will focus on the second version here. Compared to block-wise writing, it not only has a significant memory overhead, but is also slower than splitting the work into chunks.
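
A quick size check makes the difference concrete (a minimal sketch; the 104 bytes per record assume the 64-bit float and int arrays from the question):

import numpy as np

dtype = np.dtype([
  ('A', ('<f', (3, 3))),
  ('b', ('<f', 3)),
  ('c', '<H'),
])
print(dtype.itemsize)      # 50 bytes per record: 9*4 + 3*4 + 2
print(9 * 8 + 3 * 8 + 8)   # 104 bytes per record written by the loop version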

Example

def write_method_2(file_name, A, b, c):
    n = A.shape[0]

    dtype = np.dtype([
      ('A', ('<f', (3, 3))),
      ('b', ('<f', 3)),
      ('c', '<H'),
    ])

    # build the full structured copy in memory, then write it in one go
    data = np.empty(n, dtype=dtype)
    data["A"] = A
    data["b"] = b
    data["c"] = c
    with open(file_name, "wb") as fh:
        data.tofile(fh)

The block-wise version below avoids the full copy; the only drawback is longer code. With a generic helper function it is also possible to generalize this to an arbitrary number of arrays (see the Edit below).

def write_method_3(file_name, A, b, c):
    n = A.shape[0]
    blk_size = 10_000

    dtype = np.dtype([
      ('A', ('<f', (3, 3))),
      ('b', ('<f', 3)),
      ('c', '<H'),
    ])

    # one reusable buffer of blk_size records (about 0.5 MB here)
    data = np.empty(blk_size, dtype=dtype)
    with open(file_name, "wb") as fh:
        # write full blocks
        n_full_blocks = n // blk_size
        for i in range(n_full_blocks):
            data["A"] = A[i*blk_size:(i+1)*blk_size]
            data["b"] = b[i*blk_size:(i+1)*blk_size]
            data["c"] = c[i*blk_size:(i+1)*blk_size]
            data.tofile(fh)
        # write the remainder through a sliced view of the same buffer
        data = data[:n - n_full_blocks*blk_size]
        data["A"] = A[n_full_blocks*blk_size:]
        data["b"] = b[n_full_blocks*blk_size:]
        data["c"] = c[n_full_blocks*blk_size:]
        data.tofile(fh)

Edit

This is a more general way to write data from several nd-arrays to a file using a structured dtype.

def write_method_3_gen(fh, dtype, tuple_of_arr, blk_size=500_000):
    """
    fh             file handle, opened in binary write mode
    dtype          a structured dtype with one field per array
    tuple_of_arr   tuple of arrays, in the same order as the dtype fields
    blk_size       block size in bytes, default 0.5 MB
    """
    keys = dtype.names  # field names, in declaration order
    n = tuple_of_arr[0].shape[0]
    blk_size = blk_size // dtype.itemsize  # convert bytes to records per block
    data = np.empty(blk_size, dtype=dtype)

    # write full blocks
    n_full_blocks = n // blk_size
    for i in range(n_full_blocks):
        for j in range(len(tuple_of_arr)):
            data[keys[j]] = tuple_of_arr[j][i*blk_size:(i+1)*blk_size]
        data.tofile(fh)

    # write the remainder through a sliced view of the same buffer
    data = data[:n - n_full_blocks*blk_size]
    for j in range(len(tuple_of_arr)):
        data[keys[j]] = tuple_of_arr[j][n_full_blocks*blk_size:]
    data.tofile(fh)
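
For example, the equivalent of write_method_3 for the question's data would look like this (the file name out_gen.dat is just a placeholder):

dtype = np.dtype([
  ('A', ('<f', (3, 3))),
  ('b', ('<f', 3)),
  ('c', '<H'),
])
with open("out_gen.dat", "wb") as fh:
    write_method_3_gen(fh, dtype, (A, b, c))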

Timings

import numpy as np
import time

n = 10_000_000  # a large number
A = np.random.rand(n, 3, 3)
b = np.random.rand(n, 3)
c = np.ones(n, dtype=int)

t1 = time.time()
write_method_2("out_2.dat", A, b, c)
print(time.time() - t1)
# 3.7440097332000732

# with blk_size=10_000 records the buffer is only 0.5 MB,
# and this overhead stays constant even on much larger examples
t1 = time.time()
write_method_3("out_3.dat", A, b, c)
print(time.time() - t1)
# 0.8538124561309814
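
As the comment on the question notes, the same block-wise idea works for reading the file back. Here is a minimal sketch of the reverse direction (the function name read_method_3 is an assumption; np.fromfile with count reads one block of records at a time):

def read_method_3(file_name, n, blk_size=10_000):
    dtype = np.dtype([
      ('A', ('<f', (3, 3))),
      ('b', ('<f', 3)),
      ('c', '<H'),
    ])
    A = np.empty((n, 3, 3), dtype='<f')
    b = np.empty((n, 3), dtype='<f')
    c = np.empty(n, dtype='<H')
    with open(file_name, "rb") as fh:
        pos = 0
        while pos < n:
            count = min(blk_size, n - pos)
            # read one block of structured records, then scatter the fields
            data = np.fromfile(fh, dtype=dtype, count=count)
            A[pos:pos+count] = data["A"]
            b[pos:pos+count] = data["b"]
            c[pos:pos+count] = data["c"]
            pos += count
    return A, b, c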