diff --git a/DataCollection.py b/DataCollection.py index e0098d6..add9c2f 100644 --- a/DataCollection.py +++ b/DataCollection.py @@ -286,7 +286,8 @@ def readRootListFromFile(self,file): lines = [line.rstrip('\n') for line in open(file)] for line in lines: if len(line) < 1: continue - if self.useRelativePaths: + #if self.useRelativePaths: + if not line.startswith('/'): self.originRoots.append(fdir+'/'+line) else: self.originRoots.append(line) @@ -587,13 +588,13 @@ def __collectWriteInfo(successful,samplename,sampleentries,outputDir): lastindex=startindex-1 alldone=False results=[] + import time try: while not alldone: nrunning=0 for runs in processrunning: if runs: nrunning+=1 - for i in range(len(processes)): if nrunning>=nchilds: break @@ -604,8 +605,6 @@ def __collectWriteInfo(successful,samplename,sampleentries,outputDir): processes[i].start() processrunning[i]=True nrunning+=1 - - if not wo_queue.empty(): res=wo_queue.get() @@ -622,11 +621,12 @@ def __collectWriteInfo(successful,samplename,sampleentries,outputDir): for r in results: thisidx=r[0] if thisidx==lastindex+1: - logging.info('>>>> collected result %d of %d' % (thisidx,len(self.originRoots))) + logging.info('>>>> collected result %d of %d' % (thisidx+1,len(self.originRoots))) __collectWriteInfo(r[1][0],r[1][1],r[1][2],outputDir) - lastindex=thisidx + lastindex=thisidx + - if nrunning==0: + if nrunning==0 and lastindex+1 == len(self.originRoots): alldone=True continue time.sleep(0.1) @@ -657,6 +657,9 @@ def getAllLabels(self): def getAllFeatures(self): return self.__stackData(self.dataclass,'x') + + def getAllSpectators(self): + return self.__stackData(self.dataclass,'z') def getAllWeights(self): return self.__stackData(self.dataclass,'w') @@ -681,6 +684,8 @@ def __stackData(self, dataclass, selector): thislist=td.x if selector == 'y': thislist=td.y + if selector == 'z': + thislist=td.z if selector == 'w': thislist=td.w @@ -1019,3 +1024,4 @@ def get(self): + diff --git a/TrainData.py b/TrainData.py index 934b66c..efdbd37 100644 --- a/TrainData.py +++ b/TrainData.py @@ -41,16 +41,16 @@ def fileTimeOut(fileName, timeOut): time.sleep(1) -def _read_arrs_(arrwl,arrxl,arryl,doneVal,fileprefix,tdref=None,randomSeed=None): +def _read_arrs_(arrwl,arrxl,arryl,arrzl,doneVal,fileprefix,tdref=None,randomSeed=None): import gc gc.collect() import h5py from sklearn.utils import shuffle try: - idstrs=['w','x','y'] + idstrs=['w','x','y','z'] h5f = h5py.File(fileprefix,'r') - alllists=[arrwl,arrxl,arryl] + alllists=[arrwl,arrxl,arryl,arrzl] for j in range(len(idstrs)): fidstr=idstrs[j] arl=alllists[j] @@ -149,14 +149,17 @@ def clear(self): if hasattr(self, 'x'): del self.x del self.y + del self.z del self.w if hasattr(self, 'w_list'): del self.w_list del self.x_list del self.y_list + del self.z_list self.x=[numpy.array([])] self.y=[numpy.array([])] + self.z=[numpy.array([])] self.w=[numpy.array([])] self.nsamples=None @@ -252,9 +255,11 @@ def _writeoutArrays(arrlist,fidstr,h5F): _writeoutListinfo(self.w,'w',h5f) _writeoutListinfo(self.x,'x',h5f) + _writeoutListinfo(self.z,'z',h5f) _writeoutListinfo(self.y,'y',h5f) _writeoutArrays(self.w,'w',h5f) + _writeoutArrays(self.z,'z',h5f) _writeoutArrays(self.x,'x',h5f) _writeoutArrays(self.y,'y',h5f) @@ -320,6 +325,7 @@ def _readListInfo_(idstr): sharedlist.append(numpy.array([])) iidstr=idstr+str(i) shapeinfo=numpy.array(self.h5f[iidstr+'_shape']) + #print(shapeinfo) shapeinfos.append(shapeinfo) return sharedlist, shapeinfos @@ -332,14 +338,17 @@ def _readListInfo_(idstr): self.nsamples=self.h5f['n'] self.nsamples=self.nsamples[0] if True or not hasattr(self, 'w_shapes'): + #print("READING LISTS SHAPES") self.w_list,self.w_shapes=_readListInfo_('w') self.x_list,self.x_shapes=_readListInfo_('x') self.y_list,self.y_shapes=_readListInfo_('y') + self.z_list,self.z_shapes=_readListInfo_('z') else: print('\nshape known\n') self.w_list,_=_readListInfo_('w') self.x_list,_=_readListInfo_('x') self.y_list,_=_readListInfo_('y') + self.z_list,_=_readListInfo_('z') self.h5f.close() del self.h5f @@ -368,6 +377,8 @@ def _readListInfo_(idstr): shutil.copyfile(filebase+'x.'+str(i),unique_filename+'.x.'+str(i)) for i in range(len(self.y_list)): shutil.copyfile(filebase+'y.'+str(i),unique_filename+'.y.'+str(i)) + for i in range(len(self.z_list)): + shutil.copyfile(filebase+'z.'+str(i),unique_filename+'.z.'+str(i)) unique_filename+='.meta' else: @@ -379,12 +390,12 @@ def _readListInfo_(idstr): #create shared mem in sync mode for i in range(len(self.w_list)): self.w_list[i]=self.__createArr(self.w_shapes[i]) - for i in range(len(self.x_list)): self.x_list[i]=self.__createArr(self.x_shapes[i]) - for i in range(len(self.y_list)): self.y_list[i]=self.__createArr(self.y_shapes[i]) + for i in range(len(self.z_list)): + self.z_list[i]=self.__createArr(self.z_shapes[i]) if read_async: self.readdone=multiprocessing.Value('b',False) @@ -410,13 +421,18 @@ def _readListInfo_(idstr): filebase+'y.'+str(i), list(self.y_list[i].shape), isRamDisk)) - + for i in range(len(self.z_list)): + self.readthreadids.append(startReading(self.z_list[i].ctypes.data, + filebase+'z.'+str(i), + list(self.z_list[i].shape), + isRamDisk)) else: self.readthread=multiprocessing.Process(target=_read_arrs_, args=(self.w_list, self.x_list, self.y_list, + self.z_list, self.readdone, readfile, self,randomseed)) @@ -441,10 +457,15 @@ def _readListInfo_(idstr): filebase+'y.'+str(i), list(self.y_list[i].shape), isRamDisk)) + for i in range(len(self.z_list)): + (readBlocking(self.z_list[i].ctypes.data, + filebase+'z.'+str(i), + list(self.z_list[i].shape), + isRamDisk)) else: self.readdone=multiprocessing.Value('b',False) - _read_arrs_(self.w_list,self.x_list,self.y_list,self.readdone,readfile,self,randomseed) + _read_arrs_(self.w_list,self.x_list,self.y_list,self.z_list,self.readdone,readfile,self,randomseed) @@ -510,22 +531,25 @@ def readIn_join(self,wasasync=True,waitforStart=True): import copy #move away from shared memory #this costs performance but seems necessary + direct=False with threadingfileandmem_lock: if direct: self.w=self.w_list self.x=self.x_list self.y=self.y_list + self.z=self.z_list else: self.w=copy.deepcopy(self.w_list) self.x=copy.deepcopy(self.x_list) self.y=copy.deepcopy(self.y_list) - + self.z=copy.deepcopy(self.z_list) del self.w_list del self.x_list del self.y_list + del self.z_list + #in case of some errors during read-in - except Exception as d: raise d finally: @@ -538,17 +562,19 @@ def reshape_fast(arr,shapeinfo): arr=arr.reshape(shapeinfo) return arr - for i in range(len(self.w)): self.w[i]=reshape_fast(self.w[i],self.w_shapes[i]) for i in range(len(self.x)): self.x[i]=reshape_fast(self.x[i],self.x_shapes[i]) for i in range(len(self.y)): self.y[i]=reshape_fast(self.y[i],self.y_shapes[i]) - + for i in range(len(self.z)): + self.z[i]=reshape_fast(self.z[i],self.z_shapes[i]) + self.w_list=None self.x_list=None self.y_list=None + self.z_list=None if wasasync and self.readthread: self.readthread.terminate() self.readthread=None @@ -561,6 +587,7 @@ def readIn(self,fileprefix,shapesOnly=False): self.w=self.w_list self.x=self.x_list self.y=self.y_list + self.z=self.z_list else: import copy self.w=copy.deepcopy(self.w_list) @@ -569,6 +596,8 @@ def readIn(self,fileprefix,shapesOnly=False): del self.x_list self.y=copy.deepcopy(self.y_list) del self.y_list + self.z=copy.deepcopy(self.z_list) + del self.z_list def reshape_fast(arr,shapeinfo): if len(shapeinfo)<2: @@ -588,10 +617,12 @@ def reshape_fast(arr,shapeinfo): self.x[i]=reshape_fast(self.x[i],self.x_shapes[i]) for i in range(len(self.y)): self.y[i]=reshape_fast(self.y[i],self.y_shapes[i]) - + for i in range(len(self.z)): + self.z[i]=reshape_fast(self.z[i],self.z_shapes[i]) self.w_list=None self.x_list=None self.y_list=None + self.z_list=None self.readthread=None @@ -700,6 +731,7 @@ def _normalize_input_(self, weighter, npy_array): print('remove') self.x = [x[notremoves > 0] for x in self.x] self.y = [y[notremoves > 0] for y in self.y] + self.z = [z[notremoves > 0] for z in self.z] weights=weights[notremoves > 0] self.w = [weights for _ in self.y] newnsamp=self.x[0].shape[0] diff --git a/Weighter.py b/Weighter.py index f47d5fc..5ad29b6 100644 --- a/Weighter.py +++ b/Weighter.py @@ -221,8 +221,7 @@ def createNotRemoveIndices(self,Tuple): xaverage[index]+=jet[self.nameX] yaverage[index]+=jet[self.nameY] norm[index]+=1 - - counter=counter+1 + counter=counter+1 if not len(notremove) == counter: diff --git a/preprocessing/preprocessing.py b/preprocessing/preprocessing.py index 56fc0de..8338ff6 100644 --- a/preprocessing/preprocessing.py +++ b/preprocessing/preprocessing.py @@ -486,6 +486,22 @@ def MeanNormZeroPadParticles(Filename_in,MeanNormTuple,inbranches,nMax,nevents): return array +def ZeroPadParticles(Filename_in,MeanNormTuple,inbranches,nMax,nevents): + from DeepJetCore.compiled import c_meanNormZeroPad + + array = numpy.zeros((nevents,nMax,len(inbranches)) , dtype='float32') + + means=[] + norms=[] + for b in inbranches: + means.append(0.) + norms.append(1.) + + c_meanNormZeroPad.particlecluster(array,[norms],[means],[inbranches],[nMax],Filename_in) + + return array + + def MeanNormZeroPad(Filename_in,MeanNormTuple,inbranches_listlist,nMaxslist,nevents): """ diff --git a/training/training_base.py b/training/training_base.py index db36be8..5aad2b0 100644 --- a/training/training_base.py +++ b/training/training_base.py @@ -52,14 +52,17 @@ def __init__( parser=None ): - if parser is None: parser = ArgumentParser('Run the training') - parser.add_argument('inputDataCollection') - parser.add_argument('outputDir') - parser.add_argument('--modelMethod', help='Method to be used to instantiate model in derived training class', metavar='OPT', default=None) - parser.add_argument("--gpu", help="select specific GPU", type=int, metavar="OPT", default=-1) - parser.add_argument("--gpufraction", help="select memory fraction for GPU", type=float, metavar="OPT", default=-1) - - args = parser.parse_args() + if parser is None: + parser = ArgumentParser('Run the training') + parser.add_argument('inputDataCollection') + parser.add_argument('outputDir') + parser.add_argument('--modelMethod', help='Method to be used to instantiate model in derived training class', metavar='OPT', default=None) + parser.add_argument("--gpu", help="select specific GPU", type=int, metavar="OPT", default=-1) + parser.add_argument("--gpufraction", help="select memory fraction for GPU", type=float, metavar="OPT", default=-1) + + args = parser.parse_args() + else: + args=parser self.args = args import os @@ -117,10 +120,14 @@ def __init__( isNewTraining=True if os.path.isdir(self.outputDir): if not resumeSilently: - var = raw_input('output dir exists. To recover a training, please type "yes"\n') - if not var == 'yes': + var = raw_input('Output dir exists. To recover a training, please type "yes"\n To overwrite a training, please type "continue"\n') + if var == 'yes': + isNewTraining=False + elif var == 'continue': + shutil.rmtree(self.outputDir) + else: raise Exception('output directory must not exists yet') - isNewTraining=False + #isNewTraining=False else: os.mkdir(self.outputDir) self.outputDir = os.path.abspath(self.outputDir)