I have an HDF5 file of about 735 MB with the structure shown below. I need to filter my data according to a selection criterion, but I am facing an operational problem. The dataset tracks_from_jet has certain variables that I want to place my selection cuts on (let's say I want to keep only entries where one variable is >= 500), and I have to remove, by index, the corresponding records in the jets dataset that do not satisfy that criterion.
The first index of the tracks_from_jet dataset has a 1-1 correspondence with the index of the jets dataset. How do I remove a record from the jets dataset based on my selection criterion applied to the tracks_from_jet dataset?
The h5 file structure is:
jets Dataset {679015/Inf}
Location: 1:800
Links: 1
Chunks: {2048} 671744 bytes
Storage: 222716920 logical bytes, 110070578 allocated bytes, 202.34% utilization
Filter-0: deflate-1 OPT {7}
Type: struct {
"pt_btagJes" +0 native float
"eta_btagJes" +4 native float
"absEta_btagJes" +8 native float
"JetFitter_energyFraction" +12 native float
"JetFitter_mass" +16 native float
"JetFitter_significance3d" +20 native float
"JetFitter_deltaphi" +24 native float
"JetFitter_deltaeta" +28 native float
"JetFitter_massUncorr" +32 native float
"JetFitter_dRFlightDir" +36 native float
"SV1_masssvx" +40 native float
"SV1_efracsvx" +44 native float
"SV1_significance3d" +48 native float
"SV1_correctSignificance3d" +52 native float
"SV1_dstToMatLay" +56 native float
"SV1_deltaR" +60 native float
"SV1_Lxy" +64 native float
"SV1_L3d" +68 native float
"JetFitter_deltaR" +72 native float
"JetFitterSecondaryVertex_displacement3d" +76 native float
"JetFitterSecondaryVertex_displacement2d" +80 native float
"JetFitterSecondaryVertex_mass" +84 native float
"JetFitterSecondaryVertex_energy" +88 native float
"JetFitterSecondaryVertex_energyFraction" +92 native float
"JetFitterSecondaryVertex_minimumTrackRelativeEta" +96 native float
"JetFitterSecondaryVertex_maximumTrackRelativeEta" +100 native float
"JetFitterSecondaryVertex_averageTrackRelativeEta" +104 native float
"JetFitterSecondaryVertex_maximumAllJetTrackRelativeEta" +108 native float
"JetFitterSecondaryVertex_minimumAllJetTrackRelativeEta" +112 native float
"JetFitterSecondaryVertex_averageAllJetTrackRelativeEta" +116 native float
"IP2D_pu" +120 native float
"IP2D_pc" +124 native float
"IP2D_pb" +128 native float
"IP3D_pu" +132 native float
"IP3D_pc" +136 native float
"IP3D_pb" +140 native float
"IP2D_cu" +144 native float
"IP2D_bu" +148 native float
"IP2D_bc" +152 native float
"IP3D_cu" +156 native float
"IP3D_bu" +160 native float
"IP3D_bc" +164 native float
"rnnip_pu" +168 native float
"rnnip_pc" +172 native float
"rnnip_pb" +176 native float
"DL1r_pu" +180 native float
"DL1r_pc" +184 native float
"DL1r_pb" +188 native float
"IP2D_isDefaults" +192 native int
"IP3D_isDefaults" +196 native int
"JetFitter_isDefaults" +200 native int
"SV1_isDefaults" +204 native int
"JetFitterSecondaryVertex_isDefaults" +208 native int
"rnnip_isDefaults" +212 native int
"JetFitter_nVTX" +216 native float
"JetFitter_nSingleTracks" +220 native float
"JetFitter_nTracksAtVtx" +224 native float
"JetFitter_N2Tpair" +228 native float
"SV1_N2Tpair" +232 native float
"SV1_NGTinSvx" +236 native float
"JetFitterSecondaryVertex_nTracks" +240 native float
"IP2D_nTrks" +244 native float
"IP3D_nTrks" +248 native float
"pt" +252 native float
"eta" +256 native float
"energy" +260 native float
"mass" +264 native float
"GhostBHadronsFinalPt" +268 native float
"bTagJVT" +272 native float
"GhostBHadronsFinalCount" +276 native int
"GhostCHadronsFinalCount" +280 native int
"HadronConeExclTruthLabelID" +284 native int
"HadronConeExclExtendedTruthLabelID" +288 native int
"PartonTruthLabelID" +292 native int
"jetPtRank" +296 native int
"mcEventWeight" +300 native float
"eventNumber" +304 native long
"averageInteractionsPerCrossing" +312 native float
"actualInteractionsPerCrossing" +316 native float
"nPrimaryVertices" +320 native int
"beamSpotWeight" +324 native float
} 328 bytes
tracks_from_jet Dataset {679015/Inf, 40/40}
Location: 1:7832
Links: 1
Chunks: {2048, 40} 9338880 bytes
Storage: 3096308400 logical bytes, 661050378 allocated bytes, 468.39% utilization
Filter-0: deflate-1 OPT {7}
Type: struct {
"chiSquared" +0 native float
"numberDoF" +4 native float
"radiusOfFirstHit" +8 native float
"IP3D_signed_d0" +12 native float
"IP2D_signed_d0" +16 native float
"IP3D_signed_z0" +20 native float
"theta" +24 native float
"qOverP" +28 native float
"numberOfInnermostPixelLayerHits" +32 native unsigned char
"numberOfNextToInnermostPixelLayerHits" +33 native unsigned char
"numberOfInnermostPixelLayerSharedHits" +34 native unsigned char
"numberOfInnermostPixelLayerSplitHits" +35 native unsigned char
"numberOfPixelHits" +36 native unsigned char
"numberOfPixelHoles" +37 native unsigned char
"numberOfPixelSharedHits" +38 native unsigned char
"numberOfPixelSplitHits" +39 native unsigned char
"numberOfSCTHits" +40 native unsigned char
"numberOfSCTHoles" +41 native unsigned char
"numberOfSCTSharedHits" +42 native unsigned char
"expectNextToInnermostPixelLayerHit" +43 native unsigned char
"expectInnermostPixelLayerHit" +44 native unsigned char
"d0" +45 native float
"z0SinTheta" +49 native float
"d0Uncertainty" +53 native float
"z0SinThetaUncertainty" +57 native float
"IP3D_signed_d0_significance" +61 native float
"IP3D_signed_z0_significance" +65 native float
"pt" +69 native float
"eta" +73 native float
"phiUncertainty" +77 native float
"thetaUncertainty" +81 native float
"qOverPUncertainty" +85 native float
"deta" +89 native float
"dphi" +93 native float
"dr" +97 native float
"ptfrac" +101 native float
"z0RelativeToBeamspot" +105 native float
"z0RelativeToBeamspotUncertainty" +109 native float
"valid" +113 enum native signed char {
TRUE = 1
FALSE = 0
}
} 114 bytes
So here's a prototype that works. It is not memory efficient, because it loads both datasets fully into memory, but on a well-equipped system (say an i9 at 5.3 GHz with 32 GB RAM and a 24 GB NVIDIA RTX 3090) it takes about 30 seconds to process each file. It does, however, crash on regular laptop resources.
import h5py
import numpy as np
import os
import glob
def _ensure_directory(path, created_message):
    """Create *path* if it is missing and report whether it already existed.

    ``created_message`` is printed only when the directory had to be created.
    """
    already_there = os.path.exists(path)
    if not already_there:
        os.makedirs(path)
        print(created_message)
    return already_there


# Input files: every "*output.h5" directly inside ./std (relative to the
# current working directory). The pattern contains no "**", so
# recursive=True does not make the search descend into sub-directories.
# Swap in the commented line below to process the lrt sample instead.
results = glob.glob('std/*output.h5', recursive=True)
#results = glob.glob('lrt/*output.h5',recursive=True)

# Output directories for the refined files; both are created up front.
path_std = '../ttbar/std/'
path_lrt = '../ttbar/lrt/'

isExist_std = _ensure_directory(path_std, "The new std-directory is created!\n")
isExist_lrt = _ensure_directory(path_lrt, "The new lrt-directory is created!\n")
# Selection cuts applied to every track of a jet. A jet is removed as soon as
# ANY of its tracks exceeds one of these limits.
MAX_ABS_D0 = 1
MAX_ABS_Z0_SIN_THETA = 1.5

for f in results:
    print("Running cuts on:"+str(f)+"\n")

    # Read both datasets fully into memory; the input file can be closed as
    # soon as the arrays exist. 'jets' is indexed by jet, 'tracks_from_jet'
    # by (jet, track slot), and the first axis of both refers to the same jet.
    with h5py.File(f, "r") as h5r:
        jets = h5r['jets'][...]
        tracks = h5r['tracks_from_jet'][...]
    print(jets.shape)
    print(tracks.shape)
    print(tracks.shape[0])
    print(tracks.shape[1])

    # Field access on the in-memory structured array is a view, so the two
    # cut variables do not have to be read from disk a second time.
    fails_d0 = np.abs(tracks['d0']) > MAX_ABS_D0
    fails_z0 = np.abs(tracks['z0SinTheta']) > MAX_ABS_Z0_SIN_THETA

    # Collapse the track axis: one boolean per jet, True if any track fails.
    # (np.argwhere on the 2-D mask yields (jet, track) index PAIRS; taking
    # row [0] of that result only gives the first pair and raises IndexError
    # when nothing fails, which is why a per-jet mask is used instead.)
    jet_fails = np.any(fails_d0 | fails_z0, axis=1)
    indices_to_remove = np.flatnonzero(jet_fails)
    if indices_to_remove.size == 0:
        # Nothing to cut in this file: no refined copy is written.
        continue
    print(indices_to_remove)

    # Apply the same row selection to both datasets so the 1-1
    # correspondence between jets and their tracks is preserved.
    keep = ~jet_fails
    jets = jets[keep]
    tracks = tracks[keep]

    newfile = 'refined' + os.path.basename(f)
    completeName = os.path.join(path_std, newfile)
    #completeName = os.path.join(path_lrt, newfile)
    with h5py.File(completeName, 'w') as fwrite:
        fwrite.create_dataset('jets', data=jets, compression='gzip', compression_opts=7)
        fwrite.create_dataset('tracks_from_jet', data=tracks, compression='gzip', compression_opts=7)