
How to read lines with a specified interval from a data file?


I need to read a text file and store the data in a new HDF5 file. I've done this successfully so far, but now I need to split the data into different groups. Let me explain. The data file looks something like this:

Generated by trjconv : P/L=1/400 t=   0.00000
11214
    1P1     aP1    1  80.48  35.36   4.25
    2P1     aP1    2  37.45   3.92   3.96
    3P2     aP2    3  18.53  -9.69   4.68
    4P2     aP2    4  55.39  74.34   4.60
    5P3     aP3    5  22.11  68.71   3.85
    6P3     aP3    6  -4.13  24.04   3.73
    7P4     aP4    7  40.16   6.39   4.73
    8P4     aP4    8  -5.40  35.73   4.85
    9P5     aP5    9  36.67  22.45   4.08
   10P5     aP5   10  -3.68 -10.66   4.18
   11P6     aP6   11  35.95  36.43   5.15
   12P6     aP6   12  57.17   3.88   5.08
   13P7     aP7   13 -23.64  50.44   4.32
   14P7     aP7   14   6.78   8.24   4.36
   15LI     aLI   15  21.34  50.63   5.21
   16LI     aLI   16  16.29  -1.34   5.28
   17LI     aLI   17  22.26  71.25   5.40
   18LI     aLI   18  19.76  10.38   5.34
   19LI     aLI   19  78.62  11.13   5.69
   20LI     aLI   20  22.14  59.70   4.92
   21LI     aLI   21  15.65  47.28   5.22
   22LI     aLI   22  82.41   2.09   5.24
   23LI     aLI   23  16.87 -11.68   5.35

You can see from the first column that every row has a unique id. So far I've treated the file as a single dataset, but now I need to split the rows with a P id and the rows with an LI id into separate groups. I have working code for the whole dataset, but I'm not sure how to modify it to solve the present problem. Code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import struct
import numpy as np
import h5py
import re

csv_file = 'com'
fmtstring = '7s 8s 5s 7s 7s 7s'
fieldstruct = struct.Struct(fmtstring)
parse = fieldstruct.unpack_from
# Format for footer
fmtstring1 = '10s 10s 10s'
fieldstruct1 = struct.Struct(fmtstring1)
parse1 = fieldstruct1.unpack_from

with open(csv_file, 'r') as f, \
    h5py.File('test.h5', 'w') as hdf:
    ## Particles group with the attributes
    particles_grp = hdf.require_group('particles/lipids/box')
    particles_grp.attrs['dimension'] = 3
    particles_grp.attrs['boundary'] = ['periodic', 'periodic', 'periodic']
    pos_grp = particles_grp.require_group('positions')
    edge_grp = particles_grp.require_group('edges')
    ## h5md group with the attributes
    h5md_grp = hdf.require_group('h5md')
    h5md_grp.attrs['version'] = 1.0
    author_grp = h5md_grp.require_group('author')
    author_grp.attrs['author'] = 'foo', 'email=foo@googlemail.com'
    creator_grp = h5md_grp.require_group('creator')
    creator_grp.attrs['name'] = 'foo'
    creator_grp.attrs['version'] = 1.0
    # datasets with known sizes
    ds_time = pos_grp.create_dataset('time', dtype="f", shape=(0,),
                                           maxshape=(None,), compression='gzip', 
                                           shuffle=True)
    ds_step = pos_grp.create_dataset('step', dtype=np.uint64, shape=(0,),
                                           maxshape=(None,), compression='gzip',
                                           shuffle=True)
    ds_protein = None
    ds_lipid = None
    # datasets in edge group
    edge_ds_time = edge_grp.create_dataset('time', dtype="f", shape=(0,),
                                           maxshape=(None,), compression='gzip', 
                                           shuffle=True)
    edge_ds_step = edge_grp.create_dataset('step', dtype="f", shape=(0,),
                                           maxshape=(None,), compression='gzip', 
                                           shuffle=True)
    edge_ds_value = None

    edge_data = edge_grp.require_dataset('box_size', dtype=np.float32, shape=(0,3),
                                              maxshape=(None,3), compression='gzip', 
                                              shuffle=True)
    
    step = 0
    while True:
        header = f.readline()
        m = re.search("t= *(.*)$", header)
        if m:
            time = float(m.group(1))
        else:
            print("End Of File")
            break

        # get number of data rows, i.e., number of particles
        nparticles = int(f.readline())
        # read data lines and store in array
        arr = np.empty(shape=(nparticles, 3), dtype=np.float32)
        for row in range(nparticles):
            fields = parse( f.readline().encode('utf-8') )
            arr[row] = np.array((float(fields[3]), float(fields[4]), float(fields[5])))
        if nparticles > 0:
            # create a resizable dataset upon the first iteration
            if not ds_lipid:
## It is reading the whole dataset
                ds_protein = pos_grp.create_dataset('lipid', dtype=np.float32,
                                                        shape=(0, nparticles, 3), maxshape=(None, nparticles, 3),
                                                        chunks=(1, nparticles, 3), compression='gzip', shuffle=True)
                edge_ds_value = edge_grp.create_dataset('value', dtype=np.float32,
                                                        shape=(0, 3), maxshape=(None, 3),chunks=(1, 3), compression='gzip', shuffle=True)
            # append this sample to the datasets
            ds_time.resize(step + 1, axis=0)
            ds_step.resize(step + 1, axis=0)
            ds_protein.resize(step + 1, axis=0)
            # append the datasets in edge group
            edge_ds_time.resize(step + 1, axis=0)
            edge_ds_step.resize(step + 1, axis=0)
            edge_ds_value.resize(step + 1, axis=0)
            
            ds_time[step] = time
            ds_step[step] = step
            ds_protein[step] = arr
            edge_ds_time[step] = time
            edge_ds_step[step] = step

        footer = parse1( f.readline().encode('utf-8') )
        dat = np.array(footer).astype(float).reshape(1,3)
        new_size = edge_data.shape[0] + 1
        edge_data.resize(new_size, axis=0)
        edge_data[new_size-1 : new_size, :] = dat
        step += 1
        #=============================================================================

Let me explain the code a bit. The loop reads nparticles lines per frame and stores all of them in ds_protein. One frame has 11214 data lines, and this repeats for 10 frames, which I haven't shown here. To be precise, I want only the lines with a P id to go into ds_protein, which are only 14 per frame, and the remaining 11200 into ds_lipid. Is there any way to use indexing or a condition to do that? I don't want to split the text file.


Solution

  • First, a word of warning... you initialize ds_protein = None near the top, then rebind it when you create the dataset inside the loop (and the same applies to ds_lipid = None). I'm not sure this will work as you desire. See the comments later about checking for the existence of datasets.

    Currently you add all of the data to the array arr, then load ds_protein from that array. Since you don't save the first column of data, you need to use @white's suggestion: check the value in fields[0] as you read each line. The parse() call in your loop unpacks each data line into the fields tuple, and the first column of each line becomes fields[0]. Check that value: if it contains 'P', add the row to an array for the ds_protein dataset; if it contains 'LI', add it to an array for the ds_lipid dataset.
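
    For illustration, here is a minimal sketch (using your fixed-width format string and the first sample line from your file) of what parse() returns and how the id check behaves; the exact bytes depend on your real file:

        import struct

        parse = struct.Struct('7s 8s 5s 7s 7s 7s').unpack_from
        line = b'    1P1     aP1    1  80.48  35.36   4.25'
        fields = parse(line)
        print(fields[0])            # b'    1P1' -> the id column
        print(b'P' in fields[0])    # True  -> protein row
        print(b'LI' in fields[0])   # False -> not a lipid row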

    Once you have the protein and lipid data, you need to modify how you create the datasets. Right now you call .create_dataset() inside the while loop, which will raise an error the second time through the loop (the dataset already exists). To avoid this, I added a check for each dataset name in the appropriate group.

    Modified code below.

        # get number of data rows, i.e., number of particles
        nparticles = int(f.readline())
        # read data lines and store in array
        arr_protein = np.empty(shape=(nparticles, 3), dtype=np.float32)
        arr_lipid = np.empty(shape=(nparticles, 3), dtype=np.float32)
        protein_cnt, lipid_cnt = 0, 0
        for row in range(nparticles):
            fields = parse( f.readline().encode('utf-8') )
            if 'P' in str(fields[0]):
                arr_protein[protein_cnt] = np.array((float(fields[3]), float(fields[4]), float(fields[5])))
                protein_cnt += 1
            elif 'LI' in str(fields[0]):
                arr_lipid[lipid_cnt] = np.array((float(fields[3]), float(fields[4]), float(fields[5])))
                lipid_cnt += 1
    
        arr_protein = arr_protein[:protein_cnt,:]  ## New    
        arr_lipid = arr_lipid[:lipid_cnt,:]        ## New      
    
        if nparticles > 0:
            # create resizable datasets upon the first iteration
            if 'protein' not in pos_grp.keys():
                ds_protein = pos_grp.create_dataset('protein', dtype=np.float32,
                                                        shape=(0, protein_cnt, 3), maxshape=(None, protein_cnt, 3),
                                                        chunks=(1, protein_cnt, 3), compression='gzip', shuffle=True)
            if 'lipid' not in pos_grp.keys():
                ds_lipid   = pos_grp.create_dataset('lipid', dtype=np.float32,
                                                        shape=(0, lipid_cnt, 3), maxshape=(None, lipid_cnt, 3),                                                        
                                                        chunks=(1, lipid_cnt, 3), compression='gzip', shuffle=True)
            if 'value' not in edge_grp.keys():
                edge_ds_value = edge_grp.create_dataset('value', dtype=np.float32,
                                                        shape=(0, 3), maxshape=(None, 3),
                                                        chunks=(1, 3), compression='gzip', shuffle=True)
            # append this sample to the datasets
            ds_time.resize(step + 1, axis=0)
            ds_step.resize(step + 1, axis=0)
            ds_protein.resize(step + 1, axis=0) ## Modified
            ds_lipid.resize(step + 1, axis=0)   ## Modified
            # append the datasets in edge group
            edge_ds_time.resize(step + 1, axis=0)
            edge_ds_step.resize(step + 1, axis=0)
            edge_ds_value.resize(step + 1, axis=0)
            
            ds_time[step] = time
            ds_step[step] = step
            ds_protein[step] = arr_protein ## Modified
            ds_lipid[step] = arr_lipid     ## Modified
            edge_ds_time[step] = time
            edge_ds_step[step] = step
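
    As a quick sanity check (a sketch, assuming the file was written by the modified loop above), you can reopen test.h5 and confirm the split:

        import h5py

        with h5py.File('test.h5', 'r') as hdf:
            pos_grp = hdf['particles/lipids/box/positions']
            # With 14 P rows and 11200 LI rows per frame over 10 frames,
            # the expected shapes are (10, 14, 3) and (10, 11200, 3).
            print(pos_grp['protein'].shape)
            print(pos_grp['lipid'].shape)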