Writing files block-wise
Please note that your first and second version lead to different results; I will focus on the second version here. Compared to block-wise writing, this version not only has a significant memory overhead, it is also slower than splitting the work into multiple chunks.
Example
import numpy as np

def write_method_2(file_name, A, b, c):
    n = A.shape[0]
    # one structured record per row: a 3x3 float32 matrix, a float32 vector of length 3 and a uint16
    dtype = np.dtype([
        ('A', ('<f', (3, 3))),
        ('b', ('<f', 3)),
        ('c', '<H'),
    ])
    data = np.empty(n, dtype=dtype)
    data["A"] = A
    data["b"] = b
    data["c"] = c
    with open(file_name, "wb") as fh:
        data.tofile(fh)
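For a quick sanity check, a file written this way can be read back into a structured array with np.fromfile and the same dtype (a minimal sketch; the file name out_2.dat matches the timings below):

import numpy as np

dtype = np.dtype([
    ('A', ('<f', (3, 3))),
    ('b', ('<f', 3)),
    ('c', '<H'),
])
data = np.fromfile("out_2.dat", dtype=dtype)
A_read = data["A"]   # shape (n, 3, 3), float32
b_read = data["b"]   # shape (n, 3),    float32
c_read = data["c"]   # shape (n,),      uint16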
The only drawback of the block-wise approach is somewhat longer code. With a more general helper function it should also be possible to reuse this for multiple IO operations.
def write_method_3(file_name, A, b, c):
    n = A.shape[0]
    blk_size = 10_000
    dtype = np.dtype([
        ('A', ('<f', (3, 3))),
        ('b', ('<f', 3)),
        ('c', '<H'),
    ])
    data = np.empty(blk_size, dtype=dtype)
    with open(file_name, "wb") as fh:
        # write full blocks
        n_full_blocks = n // blk_size
        for i in range(n_full_blocks):
            data["A"] = A[i*blk_size:(i+1)*blk_size]
            data["b"] = b[i*blk_size:(i+1)*blk_size]
            data["c"] = c[i*blk_size:(i+1)*blk_size]
            data.tofile(fh)
        # write the remainder
        data = data[:n - n_full_blocks*blk_size]
        data["A"] = A[n_full_blocks*blk_size:]
        data["b"] = b[n_full_blocks*blk_size:]
        data["c"] = c[n_full_blocks*blk_size:]
        data.tofile(fh)
Edit
This is a more general way to write data from several ndarrays to a file using a structured (non-simple) dtype.
def write_method_3_gen(fh, dtype, tuple_of_arr, blk_size=500_000):
    """
    fh            file handle
    dtype         some non-simple (structured) dtype
    tuple_of_arr  tuple of arrays, one per field of dtype, in field order
    blk_size      block size in bytes, default 0.5 MB
    """
    n = tuple_of_arr[0].shape[0]
    keys = dtype.names                      # field names, in declaration order
    blk_size = blk_size // dtype.itemsize   # convert bytes to records per block
    data = np.empty(blk_size, dtype=dtype)
    # write full blocks
    n_full_blocks = n // blk_size
    for i in range(n_full_blocks):
        for j in range(len(tuple_of_arr)):
            data[keys[j]] = tuple_of_arr[j][i*blk_size:(i+1)*blk_size]
        data.tofile(fh)
    # write the remainder
    data = data[:n - n_full_blocks*blk_size]
    for j in range(len(tuple_of_arr)):
        data[keys[j]] = tuple_of_arr[j][n_full_blocks*blk_size:]
    data.tofile(fh)
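A minimal usage sketch (the output file name out_3_gen.dat is only an example; A, b and c are the arrays from the timing section below). The arrays must be passed in the same order as the fields of the dtype, because the field names taken from dtype.names are matched to the tuple positionally:

dtype = np.dtype([
    ('A', ('<f', (3, 3))),
    ('b', ('<f', 3)),
    ('c', '<H'),
])
with open("out_3_gen.dat", "wb") as fh:
    write_method_3_gen(fh, dtype, (A, b, c))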
Timings
import numpy as np
import time
n = 10_000_000 # a large number
A = np.random.rand(n, 3, 3)
b = np.random.rand(n, 3)
c = np.ones(n, dtype=int)
t1=time.time()
write_method_2("out_2.dat",A,b,c)
print(time.time()-t1)
#3.7440097332000732
#with blk_size=10_000 this has only 0.5MB memory overhead,
#which stays constant, even on much larger examples
t1=time.time()
write_method_3("out_3.dat",A,b,c)
print(time.time()-t1)
#0.8538124561309814
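As a sanity check (a small sketch, assuming both files were written as above), reading the two files back with the same dtype should show that the straightforward and the block-wise version produce identical data:

dtype = np.dtype([
    ('A', ('<f', (3, 3))),
    ('b', ('<f', 3)),
    ('c', '<H'),
])
d2 = np.fromfile("out_2.dat", dtype=dtype)
d3 = np.fromfile("out_3.dat", dtype=dtype)
print(all(np.array_equal(d2[k], d3[k]) for k in dtype.names))
# expected: True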
pickle? And why do you need to write A, b, c by elements, not write(A), write(b), write(c)?