Search code examples
python-3.xpandasdataframehdf5

Deleting specific indices in an hdf5 file corresponding to certain criteria


I have an HDF5 file of about 735 MB with the structure shown below. I need to filter my data according to certain criteria, but I am facing an operational problem. The tracks_from_jet dataset contains the variables I want to place my selection cuts on (let's say I want to require one variable to be >= 500), and I have to remove from the jets dataset, by index, the records whose corresponding tracks_from_jet entries do not satisfy that criterion.

The tracks_from_jet dataset's first index has a 1-1 correspondence with the jets index. How do I remove a record from the jets dataset corresponding to my selection criteria in the tracks_from_jet dataset?

The h5 file structure is:

jets                     Dataset {679015/Inf}
    Location:  1:800
    Links:     1
    Chunks:    {2048} 671744 bytes
    Storage:   222716920 logical bytes, 110070578 allocated bytes, 202.34% utilization
    Filter-0:  deflate-1 OPT {7}
    Type:      struct {
                   "pt_btagJes"       +0    native float
                   "eta_btagJes"      +4    native float
                   "absEta_btagJes"   +8    native float
                   "JetFitter_energyFraction" +12   native float
                   "JetFitter_mass"   +16   native float
                   "JetFitter_significance3d" +20   native float
                   "JetFitter_deltaphi" +24   native float
                   "JetFitter_deltaeta" +28   native float
                   "JetFitter_massUncorr" +32   native float
                   "JetFitter_dRFlightDir" +36   native float
                   "SV1_masssvx"      +40   native float
                   "SV1_efracsvx"     +44   native float
                   "SV1_significance3d" +48   native float
                   "SV1_correctSignificance3d" +52   native float
                   "SV1_dstToMatLay"  +56   native float
                   "SV1_deltaR"       +60   native float
                   "SV1_Lxy"          +64   native float
                   "SV1_L3d"          +68   native float
                   "JetFitter_deltaR" +72   native float
                   "JetFitterSecondaryVertex_displacement3d" +76   native float
                   "JetFitterSecondaryVertex_displacement2d" +80   native float
                   "JetFitterSecondaryVertex_mass" +84   native float
                   "JetFitterSecondaryVertex_energy" +88   native float
                   "JetFitterSecondaryVertex_energyFraction" +92   native float
                   "JetFitterSecondaryVertex_minimumTrackRelativeEta" +96   native float
                   "JetFitterSecondaryVertex_maximumTrackRelativeEta" +100  native float
                   "JetFitterSecondaryVertex_averageTrackRelativeEta" +104  native float
                   "JetFitterSecondaryVertex_maximumAllJetTrackRelativeEta" +108  native float
                   "JetFitterSecondaryVertex_minimumAllJetTrackRelativeEta" +112  native float
                   "JetFitterSecondaryVertex_averageAllJetTrackRelativeEta" +116  native float
                   "IP2D_pu"          +120  native float
                   "IP2D_pc"          +124  native float
                   "IP2D_pb"          +128  native float
                   "IP3D_pu"          +132  native float
                   "IP3D_pc"          +136  native float
                   "IP3D_pb"          +140  native float
                   "IP2D_cu"          +144  native float
                   "IP2D_bu"          +148  native float
                   "IP2D_bc"          +152  native float
                   "IP3D_cu"          +156  native float
                   "IP3D_bu"          +160  native float
                   "IP3D_bc"          +164  native float
                   "rnnip_pu"         +168  native float
                   "rnnip_pc"         +172  native float
                   "rnnip_pb"         +176  native float
                   "DL1r_pu"          +180  native float
                   "DL1r_pc"          +184  native float
                   "DL1r_pb"          +188  native float
                   "IP2D_isDefaults"  +192  native int
                   "IP3D_isDefaults"  +196  native int
                   "JetFitter_isDefaults" +200  native int
                   "SV1_isDefaults"   +204  native int
                   "JetFitterSecondaryVertex_isDefaults" +208  native int
                   "rnnip_isDefaults" +212  native int
                   "JetFitter_nVTX"   +216  native float
                   "JetFitter_nSingleTracks" +220  native float
                   "JetFitter_nTracksAtVtx" +224  native float
                   "JetFitter_N2Tpair" +228  native float
                   "SV1_N2Tpair"      +232  native float
                   "SV1_NGTinSvx"     +236  native float
                   "JetFitterSecondaryVertex_nTracks" +240  native float
                   "IP2D_nTrks"       +244  native float
                   "IP3D_nTrks"       +248  native float
                   "pt"               +252  native float
                   "eta"              +256  native float
                   "energy"           +260  native float
                   "mass"             +264  native float
                   "GhostBHadronsFinalPt" +268  native float
                   "bTagJVT"          +272  native float
                   "GhostBHadronsFinalCount" +276  native int
                   "GhostCHadronsFinalCount" +280  native int
                   "HadronConeExclTruthLabelID" +284  native int
                   "HadronConeExclExtendedTruthLabelID" +288  native int
                   "PartonTruthLabelID" +292  native int
                   "jetPtRank"        +296  native int
                   "mcEventWeight"    +300  native float
                   "eventNumber"      +304  native long
                   "averageInteractionsPerCrossing" +312  native float
                   "actualInteractionsPerCrossing" +316  native float
                   "nPrimaryVertices" +320  native int
                   "beamSpotWeight"   +324  native float
               } 328 bytes
tracks_from_jet          Dataset {679015/Inf, 40/40}
    Location:  1:7832
    Links:     1
    Chunks:    {2048, 40} 9338880 bytes
    Storage:   3096308400 logical bytes, 661050378 allocated bytes, 468.39% utilization
    Filter-0:  deflate-1 OPT {7}
    Type:      struct {
                   "chiSquared"       +0    native float
                   "numberDoF"        +4    native float
                   "radiusOfFirstHit" +8    native float
                   "IP3D_signed_d0"   +12   native float
                   "IP2D_signed_d0"   +16   native float
                   "IP3D_signed_z0"   +20   native float
                   "theta"            +24   native float
                   "qOverP"           +28   native float
                   "numberOfInnermostPixelLayerHits" +32   native unsigned char
                   "numberOfNextToInnermostPixelLayerHits" +33   native unsigned char
                   "numberOfInnermostPixelLayerSharedHits" +34   native unsigned char
                   "numberOfInnermostPixelLayerSplitHits" +35   native unsigned char
                   "numberOfPixelHits" +36   native unsigned char
                   "numberOfPixelHoles" +37   native unsigned char
                   "numberOfPixelSharedHits" +38   native unsigned char
                   "numberOfPixelSplitHits" +39   native unsigned char
                   "numberOfSCTHits"  +40   native unsigned char
                   "numberOfSCTHoles" +41   native unsigned char
                   "numberOfSCTSharedHits" +42   native unsigned char
                   "expectNextToInnermostPixelLayerHit" +43   native unsigned char
                   "expectInnermostPixelLayerHit" +44   native unsigned char
                   "d0"               +45   native float
                   "z0SinTheta"       +49   native float
                   "d0Uncertainty"    +53   native float
                   "z0SinThetaUncertainty" +57   native float
                   "IP3D_signed_d0_significance" +61   native float
                   "IP3D_signed_z0_significance" +65   native float
                   "pt"               +69   native float
                   "eta"              +73   native float
                   "phiUncertainty"   +77   native float
                   "thetaUncertainty" +81   native float
                   "qOverPUncertainty" +85   native float
                   "deta"             +89   native float
                   "dphi"             +93   native float
                   "dr"               +97   native float
                   "ptfrac"           +101  native float
                   "z0RelativeToBeamspot" +105  native float
                   "z0RelativeToBeamspotUncertainty" +109  native float
                   "valid"            +113  enum native signed char {
                       TRUE             = 1
                       FALSE            = 0
                   }
               } 114 bytes

Solution

  • Here is a prototype that works. It is not memory-efficient: on a well-equipped system (say an i9 at 5.3 GHz with 32 GB of RAM and an NVIDIA RTX 3090 with 24 GB of GPU memory), each iteration takes about 30 seconds, but it crashes on a machine with typical laptop resources.

    import h5py
    import numpy as np
    import os
    import glob
    
    # Input files the cuts are run over. To process the lrt sample instead,
    # change the pattern to 'lrt/*output.h5'.
    input_pattern = 'std/*output.h5'
    results = glob.glob(input_pattern, recursive=True)

    # Directories that receive the refined output files.
    path_std = '../ttbar/std/'
    path_lrt = '../ttbar/lrt/'

    # Check each output directory in turn (std first, then lrt); create the
    # ones that are missing and print a notice for every directory created.
    already_present = []
    for out_dir, notice in (
        (path_std, "The new std-directory is created!\n"),
        (path_lrt, "The new lrt-directory is created!\n"),
    ):
        present = os.path.exists(out_dir)
        already_present.append(present)
        if present:
            continue
        os.makedirs(out_dir)
        print(notice)

    # Existence flags as observed *before* any directory was created.
    isExist_std, isExist_lrt = already_present
    
    # For every input file: load the 'jets' and 'tracks_from_jet' datasets,
    # find the jets that own at least one track failing the impact-parameter
    # cuts (|d0| > 1 or |z0SinTheta| > 1.5), drop those rows from BOTH datasets
    # so they stay aligned, and write the survivors to a new
    # 'refined<original name>' file in path_std.
    #
    # Both datasets are loaded fully into memory, so peak usage is roughly
    # twice the uncompressed size of 'tracks_from_jet'.
    for f in results:
        print("Running cuts on:"+str(f)+"\n")

        # Read everything up front and close the input file before the heavy
        # work; nothing below needs the open handle.
        with h5py.File(f, "r") as h5r:
            jets = h5r['jets'][...]
            tracks = h5r['tracks_from_jet'][...]

        print(jets.shape)
        print(tracks.shape)
        print(tracks.shape[0])
        print(tracks.shape[1])

        # Row i of 'tracks_from_jet' must describe row i of 'jets'; dropping
        # rows by index from misaligned datasets would silently corrupt them.
        if jets.shape[0] != tracks.shape[0]:
            raise ValueError(
                "jets and tracks_from_jet have different lengths in "
                + str(f) + ": " + str(jets.shape[0]) + " vs " + str(tracks.shape[0])
            )

        # Take the cut variables from the array already in memory instead of
        # reading the two fields from disk a second time.
        # NOTE(review): comparisons with NaN are False, so NaN-padded track
        # slots never trigger a cut -- confirm how empty slots are filled.
        fails_cut = (np.abs(tracks['d0']) > 1) | (np.abs(tracks['z0SinTheta']) > 1.5)

        # tracks is laid out as (jet, track slot): a jet is rejected as soon
        # as ANY of its track slots fails a cut. Reducing over axis 1 yields
        # one flag per jet; the flagged positions are the jet indices to drop.
        reject_jet = fails_cut.any(axis=1)
        del fails_cut
        indices_to_remove = np.flatnonzero(reject_jet)

        # Nothing to remove: skip this file (no refined copy is written).
        if indices_to_remove.size == 0:
            continue
        print(indices_to_remove)

        # Apply the same row selection to both datasets to keep the 1-1
        # correspondence between jets and their tracks.
        keep_jet = ~reject_jet
        jets = jets[keep_jet]
        tracks = tracks[keep_jet]

        newfile = 'refined' + os.path.basename(f)
        # Use path_lrt here instead when processing the lrt sample.
        completeName = os.path.join(path_std, newfile)
        with h5py.File(completeName, 'w') as fwrite:
            fwrite.create_dataset('jets', data=jets, compression='gzip', compression_opts=7)
            fwrite.create_dataset('tracks_from_jet', data=tracks, compression='gzip', compression_opts=7)