Search code examples
pythonbinaryfiles

Binary reading with python gives unexpected results


I'm trying to read some binary files with python for my analysis generated with Zemax OpticStudio. The structure of the file is supposed to be the following:

  • 2 x 32-bit integer as header
  • n chunks of data

Each chunk is made by

  • 32-bit integer indicating the number of C struc that come after
  • m C structures

The structures' definition is the following:

typedef struct
{
unsigned int status;
int level;
int hit_object;
int hit_face;
int unused;
int in_object;
int parent;
int storage;
int xybin, lmbin;
double index, starting_phase;
double x, y, z;
double l, m, n;
double nx, ny, nz;
double path_to, intensity;
double phase_of, phase_at;
double exr, exi, eyr, eyi, ezr, ezi;
}

which has a size of 208 bytes, for your convenience.

Here is the code that I wrote with some research and a couple of brilliant answers from here.


from pathlib import Path
from functools import partial
from io import DEFAULT_BUFFER_SIZE
import struct

def little_endian_int(x):
    return int.from_bytes(x,'little')

def file_byte_iterator(path):
    """iterator over lazily loaded file
    """
    path = Path(path)
    with path.open('rb') as file:
        reader = partial(file.read1, DEFAULT_BUFFER_SIZE)
        file_iterator = iter(reader, bytes())
        for chunk in file_iterator:
            yield from chunk

def ray_tell(rays_idcs:list,ray_idx:int,seg_idx:int):
    idx = rays_idcs[ray_idx][0]
    idx += 4 + 208*seg_idx
    return idx


def read_header(bytearr:bytearray):
    version = int.from_bytes(bytearr[0:4],'little')
    zrd_format = version//10000
    version = version%10000
    num_seg_max = int.from_bytes(bytearr[4:8],'little')
    return zrd_format,version,num_seg_max


def rays_indices(bytearr:bytearray):
    index=8
    rays=[]
    while index <len(bytearr):
        num_seg = int.from_bytes(bytearr[index:index+4],'little')
        rays.append((index,num_seg))
        index = index+4 + 208*num_seg
    return rays

def read_ray(bytearr:bytearray,ray):
    ray_idx,num_seg = ray
    data = []
    ray_idx = ray_idx + 4
    seg_idx=0
    for ray_idx in range(8,8+num_seg*208,208):
        offsets = [0,4,8,12,16,20,24,28,32,36,40,48,56,64,72,80,88,96,104,112,120,128,136,144,152,160,168,176,184,192,200]
        int_vars = offsets[0:11]
        doubl_vars = offsets[11:]
        data_integ = [bytearr[ray_idx+offset:ray_idx+offset+4] for offset in int_vars]
        data_doubl = [bytearr[ray_idx+offset:ray_idx+offset+8] for offset in doubl_vars]

        data.append([seg_idx,data_integ,data_doubl])
        seg_idx += 1
    return data



file="test_uncompressed.ZRD"

raypath = {}
filebin = bytearray(file_byte_iterator(file))
header = read_header(filebin)
print(header)
rays_idcs = rays_indices(filebin)
rays = []
for ray in rays_idcs:
    rays.append(read_ray(filebin,ray))
ray = rays[1] #Random ray
segm = ray[2] #Random segm
ints = segm[1]
doub = segm[2]
print("integer vars:")
for x in ints:
    print(x,little_endian_int(x))
print("double vars:")
for x in doub:
    print(x,struct.unpack('<d',x))     

I have verified that all of the structures have the right size and number of chunks and structures (my reading matches the number of segments and rays that I read with Zemax, ) , and thanks to the header, I verified the endianness of the file (little endian). My output is the following:

(0, 2002)
bytearray(b'\x1f\xd8\x9c?') 1067243551
bytearray(b'\x06\x80\x00\x00') 32774
bytearray(b'\x02\x00\x00\x00') 2
bytearray(b'\x11\x00\x00\x00') 17
bytearray(b'\x02\x00\x00\x00') 2
bytearray(b'\x00\x00\x00\x00') 0
bytearray(b'\x11\x00\x00\x00') 17
bytearray(b'\x01\x00\x00\x00') 1
bytearray(b'\x00\x00\x00\x00') 0
bytearray(b'\x00\x00\x00\x00') 0
double vars:
bytearray(b'\x00\x00\x00\x00# \xac\xe8') (-1.6425098109028998e+196,)
bytearray(b'\xe8\xe3\xf9?\x00\x00\x00\x00') (5.3030112e-315,)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00') (0.0,)
bytearray(b'\x00\x00\x00\x00p_\xb4\xec') (-4.389425605765071e+215,)
bytearray(b'5\xe3\x9d\xbf\xf0\xbd"\xa2') (-3.001836066957746e-144,)
bytearray(b'z"\xc0?\x00\x00\x00\x00') (5.28431047e-315,)
bytearray(b'\x00\x00\x00\x00 \xc9+\xa3') (-2.9165705864036956e-139,)
bytearray(b'g\xd4\xcd?\x9ch{ ') (3.2707669223572687e-152,)
bytearray(b'q\x1e\xef?\x00\x00\x00\x00') (5.299523535e-315,)
bytearray(b'\x00\x00\x00\x00%\x0c\xb4A') (336340224.0,)
bytearray(b'\t\xf2u\xbf\\3L\xe6') (-5.991371249309652e+184,)
bytearray(b'\xe1\xff\xef\xbf1\x8dV\x1e') (1.5664573023148095e-162,)
bytearray(b'\xa1\xe9\xe8?\x9c\x9a6\xfc') (-2.202825582975923e+290,)
bytearray(b'qV\xb9?\x00\x00\x00\x00') (5.28210966e-315,)
bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00') (0.0,)
bytearray(b'\x00\x00\x00\x00\xc6\xfd\x0c\xa1') (-1.7713316840526727e-149,)
bytearray(b'\x96\x94\x8d?\xad\xf9(\xcc') (-7.838624888507203e+58,)
bytearray(b'yN\xb2\xbff.\\\x1a') (1.0611651097687064e-181,)
bytearray(b'\xb9*\xae?\xac\xaf\xe5\xe1') (-3.90257774261585e+163,)
bytearray(b'c\xab\xd2\xbf\xccQ\x8bj') (1.7130904564012918e+205,)
bytearray(b'\xc8\xea\x8c\xbf\xdf\xdc\xe49') (8.22891935818188e-30,)

I'm reading correctly just the int values. I don't understand why I get those binaries for all the other variables

EDIT I want to highlight that bytearrays contain non-hexadecimal digits, and I'm sure that binary files are not corrupted, since I can read those in zemax


Solution

  • Solved. It was just an error in my pointer arithmetic in the read_ray function. Thanks to Mad Physicist for his suggestion to unpack the whole structure which put me in the right direction.

    def read_ray(bytearr:bytearray,ray):
        ray_idx,num_seg = ray
        data = []
        assert num_seg==little_endian_int(bytearr[ray_idx:ray_idx+4])
        ray_idx = ray_idx + 4
        for seg_ptr in range(ray_idx,ray_idx + num_seg*208,208):
            ...
            data_integ = [bytearr[seg_ptr+offset:seg_ptr+offset+4] for offset in int_vars]            
            data_doubl = [bytearr[seg_ptr+offset:seg_ptr+offset+8] for offset in doubl_vars]
            ...
        return data