Python decompression relative performance?
The low-hanging fruit
import numpy

numpy.savez_compressed('AlaskaCoast.npz', arr)
arr = numpy.load('AlaskaCoast.npz')['arr_0']
Loading is 2.3x faster than your PIL-based code. It uses zipfile.ZIP_DEFLATED; see the savez_compressed documentation.
Your PIL code also makes an unneeded copy: array(img) should be asarray(img). It only costs 5% of the slow loading time, but after the other optimizations this becomes significant, and you have to keep in mind which numpy operators create a copy.
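For reference, a minimal sketch of the loading step with that copy avoided (the file name is an assumption, not from the original post):

from PIL import Image
import numpy as np

img = Image.open('AlaskaCoast.png')
arr = np.asarray(img)   # avoids the extra copy that np.array(img) would make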
Fast decompression
According to the zstd benchmarks, when optimizing for decompression, lz4 is a good choice. Plugging it into pickle gives another 2.4x gain and is only 30% slower than uncompressed pickling.
import pickle
import lz4.frame
# with lz4.frame.open('AlaskaCoast.lz4', 'wb') as f:
#     pickle.dump(arr, f)

with lz4.frame.open('AlaskaCoast.lz4', 'rb') as f:
    arr = pickle.load(f)
Benchmarks
method                  size   load time (relative)
---------------------   -----  --------------------
original (PNG+PIL)      5.1M   7.1
np.load (compressed)    6.7M   3.1
pickle + lz4            7.1M   1.3
pickle (uncompressed)   601M   1.0 (baseline)
The load time was measured inside Python (3.7.3), using the minimum wall-clock time over 20 runs on my desktop. According to occasional glances at top, it always seemed to be running on a single core.
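For reproducibility, a minimal sketch of that measurement protocol, assuming the pickle + lz4 variant and the file name from above:

import pickle
import timeit

import lz4.frame

def load_lz4():
    with lz4.frame.open('AlaskaCoast.lz4', 'rb') as f:
        return pickle.load(f)

# minimum wall-clock time over 20 single runs
best = min(timeit.repeat(load_lz4, number=1, repeat=20))
print(f"{best:.2f} s")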
For the curious: profiling
I'm not sure whether the Python version matters; most of the work is supposed to happen inside C libraries. To validate this, I profiled the pickle + lz4 variant:
perf record ./test.py && perf report -s dso
Overhead Shared Object
60.16% [kernel.kallsyms] # mostly page_fault and alloc_pages_vma
27.53% libc-2.28.so # mainly memmove
9.75% liblz4.so.1.8.3 # only LZ4_decompress_*
2.33% python3.7
...
Most of the time is spent inside the Linux kernel, doing page_fault and work associated with (re-)allocating memory, probably including disk I/O. The high amount of memmove looks suspicious: Python is probably re-allocating (resizing) the final array every time a new decompressed chunk arrives. If anyone would like to take a closer look: python and perf profiles.
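If that incremental growth really is the bottleneck, one untested workaround is to read the compressed file into memory and decompress it in a single call, so the output buffer is allocated only once:

import pickle
import lz4.frame

with open('AlaskaCoast.lz4', 'rb') as f:
    raw = f.read()                               # compressed bytes, read in one go
arr = pickle.loads(lz4.frame.decompress(raw))    # one decompression, one unpickle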
You can use Python-blosc
It is very fast and, for small arrays (<2 GB), also quite easy to use. On easily compressible data like your example, it is often faster to compress the data for I/O operations (SATA SSD: about 500 MB/s, PCIe SSD: up to 3500 MB/s). In the decompression step, the array allocation is the most costly part. If your images have similar shapes, you can avoid repeated memory allocation.
Example
A contiguous array is assumed for the following example.
import blosc
import pickle
import numpy as np

def compress(arr, Path):
    # alternative: cname='lz4' compresses faster, 'zstd' compresses smaller
    c = blosc.compress_ptr(arr.__array_interface__['data'][0], arr.size,
                           arr.dtype.itemsize, clevel=3, cname='zstd',
                           shuffle=blosc.SHUFFLE)
    with open(Path, "wb") as f:
        pickle.dump((arr.shape, arr.dtype), f)
        f.write(c)
    return c, arr.shape, arr.dtype

def decompress(Path):
    with open(Path, "rb") as f:
        shape, dtype = pickle.load(f)
        c = f.read()
    # array allocation takes most of the time
    arr = np.empty(shape, dtype)
    blosc.decompress_ptr(c, arr.__array_interface__['data'][0])
    return arr

# Pass a preallocated array if you have many similar images
def decompress_pre(Path, arr):
    with open(Path, "rb") as f:
        shape, dtype = pickle.load(f)
        c = f.read()
    # no allocation here: decompress directly into the preallocated array
    blosc.decompress_ptr(c, arr.__array_interface__['data'][0])
    return arr
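For illustration, a hypothetical round trip with these helpers (the file name and test array are made up; any C-contiguous array of similar size would do):

import numpy as np

# made-up, easily compressible test image
arr = np.zeros((8000, 8000, 3), np.uint8)
arr[:, :4000] = 255

compress(arr, "Test.dat")
restored = decompress("Test.dat")
assert np.array_equal(arr, restored)

# reuse one buffer for many images of identical shape and dtype
buf = np.empty_like(arr)
decompress_pre("Test.dat", buf)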
Benchmarks
#blosc.SHUFFLE, cname='zstd' -> 4728KB,
%timeit compress(arr,"Test.dat")
1.03 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#611 MB/s
%timeit decompress("Test.dat")
146 ms ± 481 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#4310 MB/s
%timeit decompress_pre("Test.dat",arr)
50.9 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#12362 MB/s
#blosc.SHUFFLE, cname='lz4' -> 9118KB,
%timeit compress(arr,"Test.dat")
32.1 ms ± 437 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#19602 MB/s
%timeit decompress("Test.dat")
146 ms ± 332 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#4310 MB/s
%timeit decompress_pre("Test.dat",arr)
53.6 ms ± 82.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#11740 MB/s
Edit
This version is more for general use. It handles F-contiguous, C-contiguous, and non-contiguous arrays, as well as arrays larger than 2 GB. Also have a look at bloscpack.
import blosc
import pickle
import numpy as np

def compress(file, arr, clevel=3, cname='lz4', shuffle=1):
    """
    file     path to file
    arr      numpy nd-array
    clevel   0..9
    cname    blosclz, lz4, lz4hc, snappy, zlib, zstd
    shuffle  0 -> no shuffle, 1 -> shuffle, 2 -> bitshuffle
    """
    max_blk_size = 100_000_000  # 100 MB
    shape = arr.shape
    # object dtype is not implemented
    if arr.dtype == object:
        raise TypeError("dtype object is not implemented")

    # Handling of Fortran-ordered arrays (avoid copy)
    is_f_contiguous = False
    if arr.flags['F_CONTIGUOUS']:
        is_f_contiguous = True
        arr = arr.T.reshape(-1)
    else:
        arr = np.ascontiguousarray(arr.reshape(-1))

    # Writing
    max_num = max_blk_size // arr.dtype.itemsize
    num_chunks = arr.size // max_num
    if arr.size % max_num != 0:
        num_chunks += 1

    with open(file, "wb") as f:
        pickle.dump((shape, arr.size, arr.dtype, is_f_contiguous, num_chunks, max_num), f)
        size = np.empty(1, np.uint32)
        num_write = max_num
        for i in range(num_chunks):
            if max_num * (i + 1) > arr.size:
                num_write = arr.size - max_num * i
            c = blosc.compress_ptr(arr[max_num * i:].__array_interface__['data'][0], num_write,
                                   arr.dtype.itemsize, clevel=clevel, cname=cname, shuffle=shuffle)
            size[0] = len(c)
            size.tofile(f)
            f.write(c)

def decompress(file, prealloc_arr=None):
    with open(file, "rb") as f:
        shape, arr_size, dtype, is_f_contiguous, num_chunks, max_num = pickle.load(f)
        if prealloc_arr is None:
            arr = np.empty(arr_size, dtype)
        else:
            if prealloc_arr.flags['F_CONTIGUOUS']:
                prealloc_arr = prealloc_arr.T
            if not prealloc_arr.flags['C_CONTIGUOUS']:
                raise TypeError("Contiguous array is needed")
            arr = np.frombuffer(prealloc_arr.data, dtype=dtype, count=arr_size)
        for i in range(num_chunks):
            size = np.fromfile(f, np.uint32, count=1)
            c = f.read(size[0])
            blosc.decompress_ptr(c, arr[max_num * i:].__array_interface__['data'][0])

    # reshape
    if is_f_contiguous:
        arr = arr.reshape(shape[::-1]).T
    else:
        arr = arr.reshape(shape)
    return arr
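For illustration, a hypothetical round trip with this general version (the file name and test array are made up, not from the original post):

import numpy as np

# made-up, easily compressible F-contiguous test array
arr = np.asfortranarray(np.zeros((4000, 4000, 3), np.uint8))

compress("Test_general.dat", arr, clevel=3, cname='lz4', shuffle=1)
restored = decompress("Test_general.dat")

# memory order and contents are preserved
assert restored.flags['F_CONTIGUOUS']
assert np.array_equal(arr, restored)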
You can continue to use your existing PNGs and enjoy the space saving, but gain some speed by using libvips. Here is a comparison; rather than testing the speed of my laptop versus yours, I have shown three different methods so you can see the relative speed. I used:
- PIL
- OpenCV
- pyvips
#!/usr/bin/env python3
import numpy as np
import pyvips
import cv2
from PIL import Image

def usingPIL(f):
    im = Image.open(f)
    return np.asarray(im)

def usingOpenCV(f):
    arr = cv2.imread(f, cv2.IMREAD_UNCHANGED)
    return arr

def usingVIPS(f):
    image = pyvips.Image.new_from_file(f)
    mem_img = image.write_to_memory()
    imgnp = np.frombuffer(mem_img, dtype=np.uint8).reshape(image.height, image.width, 3)
    return imgnp
Then I checked the performance in IPython because it has nice timing functions. As you can see, pyvips is 13x faster than PIL, even though this PIL version is already 2x faster than the original code because it avoids the array copy:
In [49]: %timeit usingPIL('Alaska1.png')
3.66 s ± 31.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [50]: %timeit usingOpenCV('Alaska1.png')
6.82 s ± 23.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [51]: %timeit usingVIPS('Alaska1.png')
276 ms ± 4.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# Quick test results match
np.sum(usingVIPS('Alaska1.png') - usingPIL('Alaska1.png'))
0
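As an aside, summing the difference of two uint8 arrays can in principle miss mismatches (uint8 subtraction wraps around), so a stricter variant of this check, not from the original answer, is element-wise equality:

# expected to return True when the decoders produce identical pixels
np.array_equal(usingVIPS('Alaska1.png'), usingPIL('Alaska1.png'))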