Search code examples
pythonpython-3.xbinaryfiles

Read from binary file with Python 3.5


I use this piece of code:

from struct import Struct

import struct

def read_chunk(fmt, fileobj):

    chunk_struct = Struct(fmt)
    chunk = fileobj.read(chunk_struct.size)
    return chunk_struct.unpack(chunk)

def read_record(fileobj):
    author_id, len_author_name = read_chunk('ii', f)
    author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f) # 's' or 'c' ?
    record = {  'author_id': author_id,
                'author_name': author_name,
                'publications': [] }
    for pub in range(nu_of_publ):
        pub_id, len_pub_title = read_chunk('ii', f)
        pub_title, num_pub_auth = read_chunk(str(len_pub_title)+'si', f)
        record['publications'].append({
                'publication_id': pub_id,
                'publication_title': pub_title,
                'publication_authors': [] })
        for auth in range(num_pub_auth):
            len_pub_auth_name = read_chunk('i', f)
            pub_auth_name = read_chunk(str(len_pub_auth_name)+'s', f)
            record['publications']['publication_authors'].append({'name': pub_auth_name})
        year_publ, nu_of_cit = read_chunk('ii', f)
        # Finish building your record with the remaining fields...
        for cit in range(nu_of_cit):
            cit_id, len_cit_title = read_chunk('ii', f)
            cit_title, num_cit_auth = read_chunk(str(len_cit_title)+'si', f)
        for cit_auth in range(num_cit_auth):
            len_cit_auth_name = read_chunk('i', f)
            cit_auth_name = read_chunk(str(len_cit_auth_name)+'s', f)
        year_cit_publ = read_chunk('i', f)
    return record

def parse_file(filename):
    records = []
    with open(filename, 'rb') as f:
        while True:
            try:
                records.append(read_record(f))
            except struct.error:
                break

to read this file:

https://drive.google.com/open?id=0B3SYAHrxLP69NHlWc25KeXFHNVE

with this format:

spec

Inside the function read_record, it read correct variables author_id, len_author_name, author_name but the nu_of_publ and below variables aren't read correct.

Any idea what's wrong?

When i run this piece of code:

author_id, len_author_name = read_chunk('LL', f)
    author_name, nu_of_publ= read_chunk(str(len_author_name)+'sL', f)
    #nu_of_publ = read_chunk('I', f)# 's' or 'c' ?
    record = {  'author_id': author_id,
                'author_name': author_name,
                'publications': [] }
    print (record, nu_of_publ)


for pub in range(nu_of_publ):
        pub_id, len_pub_title = read_chunk('LL', f)
        print (pub_id, len_pub_title)

i take this result:

{'author_name': b'Scott Shenker', 'author_id': 1, 'publications': []} 256 15616 1953384704

but it will print 200 instead 256, 1 instead 15616 etc.


Solution

  • This format is not correct:

    author_name, nu_of_publ = read_chunk(str(len_author_name)+'si', f)
    

    You are defining a structure of N characters and an integer. Those structures are aligned, the same way as they would if you had the structure defined in :

    struct {
        char author_name[N];
        int nu_of_publ;
    };
    

    What alignment does is: it puts beginning of every int to a position which is a multiple of 4. This is done (in C) because CPUs are optimized for accessing such addresses.

    So, if author's name's length is 6, the next two bytes will be skipped before reading the next integer.

    One solution to separate the structures:

    author_name = read_chunk(str(len_author_name)+'s', f)
    nu_of_publ, = read_chunk('i', f)
    

    Note: The comma after nu_of_publ (nu_of_publ,) is to unpack the tuple returned by read_chunk.

    Another solution is to specify structure with = at the beginning, based on the table from spec:

    author_name, nu_of_publ = read_chunk('={}si'.format(len_author_name), f)