Read data from binary file python

I have a binary file with this format:

and I use this code to open it:

import numpy as np

# Integers in the file are packed 4-byte little-endian values (see the raw
# bytes such as b'\x05\x00\x00\x00' in the output below), so spell the width
# out instead of relying on the platform-dependent size of `int`.
I4 = '<i4'

# NOTE: this is a FIXED-size layout. The string widths (S13, S61, ...) are the
# lengths of the fields of the first record only, so only that record decodes
# correctly; later records, whose fields have other lengths, are misaligned.
dt = np.dtype({
    'names': [
        'au_id', 'len_au_name', 'au_name', 'nu_of_publ',
        'pub_id', 'len_of_pub_id', 'pub_title', 'num_auth',
        'len_au_name_1', 'au_name1',
        'len_au_name_2', 'au_name2',
        'len_au_name_3', 'au_name3',
        'year_publ', 'num_of_cit',
        'citid', 'len_cit_tit', 'cit_tit', 'num_of_au_cit',
        'len_cit_au_name_1', 'au_cit_name_1',
        'len_cit_au_name_2', 'au_cit_name_2',
        'len_cit_au_name_3', 'au_cit_name_3',
        'len_cit_au_name_4', 'au_cit_name_4',
        'len_cit_au_name_5', 'au_cit_name_5',
        'year_cit',
    ],
    'formats': [
        I4, I4, 'S13', I4,
        I4, I4, 'S61', I4,
        I4, 'S8',
        I4, 'S7',
        I4, 'S12',
        I4, I4,
        I4, I4, 'S50', I4,
        I4, 'S7',
        I4, 'S7',
        I4, 'S9',
        I4, 'S8',
        I4, 'S1',
        I4,
    ],
})

# Open in binary mode ('rb'): the file holds raw bytes, not text. The `with`
# block closes the file once the array has been read.
with open("author_1", "rb") as f:
    a = np.fromfile(f, dtype=dt, count=-1, sep="")

And I get this:

array([ (1, 13, b'Scott Shenker', 200, 1, 61, b'Integrated services in the internet architecture: an overview', 3, 8, b'R Braden', 7, b'D Clark', 12, b'S Shenker\xe2\x80\xa6', 1994, 1000, 401, 50, b'[HTML] An architecture for differentiated services', 5, 7, b'D Black', 7, b'S Blake', 9, b'M Carlson', 8, b'E Davies', 1, b'Z', 1998),
(402, 72, b'Resource rese', 1952544370, 544108393, 1953460848, b'ocol (RSVP)--Version 1 functional specification\x05\x00\x00\x00\x08\x00\x00\x00R Brad', 487013, 541851648, b'Zhang\x08', 1109414656, b'erson\x08', 542310400, b'Herzog\x07\x00\x00\x00S ', 1768776010, 511342, 103168, 22016, b'\x00A reliable multicast framework for light-weight s', 1769173861, 544435823, b'and app', 1633905004, b'tion le', 543974774, b'framing\x04', 458752, b'\x00\x00S Floy', 2660, b'', 1632247894),

Any idea how I can read the whole file?

Solution

I agree with Ryan: parsing the data is straightforward, but not trivial, and really tedious. Whatever disk space you save by packing the data this way, you pay for dearly when it is time to unpack it.

Anyway, the file is made of variable-length records and fields. Each record consists of a variable number of fields of variable length, which we can read in chunks of bytes. Each chunk will have a different format. You get the idea. Following this logic, I assembled these three functions, which you can finish, modify, test, etc.:

from struct import Struct
import struct

def read_chunk(fmt, fileobj):
    """Read one chunk described by the ``struct`` format *fmt* from *fileobj*.

    Exactly as many bytes as *fmt* occupies are requested from the file and
    decoded. The result is always a tuple, even for a single-field format.

    Raises:
        struct.error: if *fmt* is not a valid format string, or if the file
            supplies fewer bytes than the format needs (e.g. at end of file).
    """
    n_bytes = struct.calcsize(fmt)
    raw = fileobj.read(n_bytes)
    return struct.unpack(fmt, raw)

def read_record(fileobj):
    """Read one variable-length author record from *fileobj*.

    Layout read, in order (every integer is read as a packed 4-byte
    little-endian value; every string is preceded by its byte length):

        author id, author name, number of publications, then per publication:
            publication id, title, number of authors, the author names,
            year, number of citations, then per citation:
                citation id, title, number of authors, the author names, year

    NOTE(review): the placement of the citation year inside each citation
    follows the field order of the question's dtype -- confirm against the
    real file format.

    Args:
        fileobj: binary file object positioned at the start of a record.

    Returns:
        dict with keys 'author_id', 'author_name' and 'publications'.
        Each publication is a dict with 'publication_id',
        'publication_title', 'publication_authors', 'publication_year' and
        'citations'; each citation is a dict with 'citation_id',
        'citation_title', 'citation_authors' and 'citation_year'. Author
        lists hold ``{'name': ...}`` dicts. Strings are returned as raw
        ``bytes`` (not decoded).

    Raises:
        struct.error: if the file ends before the record is complete, or a
            length field is negative.
    """
    # The '<' prefix selects little-endian byte order with standard sizes and
    # NO alignment padding; the native default would insert pad bytes between
    # a string and the integer that follows it.
    def read_int():
        (value,) = read_chunk('<i', fileobj)
        return value

    # 's' yields the whole field as one bytes object; 'c' would yield one
    # value per byte.
    def read_bytes():
        length = read_int()
        (value,) = read_chunk('<%ds' % length, fileobj)
        return value

    def read_names():
        count = read_int()
        names = []
        for _ in range(count):
            names.append({'name': read_bytes()})
        return names

    author_id = read_int()
    author_name = read_bytes()
    record = {'author_id': author_id,
              'author_name': author_name,
              'publications': []}

    nu_of_publ = read_int()
    for _pub in range(nu_of_publ):
        pub_id = read_int()
        pub_title = read_bytes()
        pub_authors = read_names()
        year_publ = read_int()

        citations = []
        nu_of_cit = read_int()
        for _cit in range(nu_of_cit):
            cit_id = read_int()
            cit_title = read_bytes()
            cit_authors = read_names()
            year_cit = read_int()
            citations.append({
                'citation_id': cit_id,
                'citation_title': cit_title,
                'citation_authors': cit_authors,
                'citation_year': year_cit})

        record['publications'].append({
            'publication_id': pub_id,
            'publication_title': pub_title,
            'publication_authors': pub_authors,
            'publication_year': year_publ,
            'citations': citations})
    return record

def parse_file(filename):
    """Parse every record in the binary file *filename*.

    Records are read one after another with ``read_record`` until a
    ``struct.error`` signals that no further complete record is available.

    Args:
        filename: path of the binary file to read.

    Returns:
        list of the record dicts produced by ``read_record``, in file order
        (empty if the file holds no complete record).
    """
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(read_record(f))
            except struct.error:
                # Raised when fewer bytes remain than the next field needs,
                # i.e. end of file. A truncated final record ends the loop
                # the same way and is discarded.
                break
    return records