diff --git a/CMakeLists.txt b/CMakeLists.txt index bf1d83fbb6644..feabbddfcf0f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,9 @@ project(O2 LANGUAGES C CXX VERSION 1.2.0) include(CTest) +set(CUDA_COMPUTETARGET 75) +set(GPUCA_NO_FAST_MATH 1) + # Project wide setup # Would better fit inside GPU/CMakeLists.txt, but include GPU/Common directly diff --git a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h index 49d61007092a8..c5e103362f0f4 100644 --- a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h +++ b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h @@ -156,6 +156,17 @@ struct ClusterNative { return (this->getFlags() < rhs.getFlags()); } } + + GPUd() bool operator==(const ClusterNative& rhs) const + { + return this->getTimePacked() == rhs.getTimePacked() && + this->padPacked == rhs.padPacked && + this->sigmaTimePacked == rhs.sigmaTimePacked && + this->sigmaPadPacked == rhs.sigmaPadPacked && + this->qMax == rhs.qMax && + this->qTot == rhs.qTot && + this->getFlags() == rhs.getFlags(); + } }; // This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 6e08e9582761a..a3232b454d5ba 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -18,8 +18,10 @@ # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers # for installation only +#set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... 
+ if(NOT DEFINED GPUCA_NO_FAST_MATH) - set(GPUCA_NO_FAST_MATH 0) + set(GPUCA_NO_FAST_MATH 1) endif() if(${GPUCA_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math -ffp-contract=off") diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index b9b93100d5bfd..d18e7863e2001 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -46,6 +46,7 @@ class GPUTRDTracker_t #if defined(GPUCA_NOCOMPAT_ALLCINT) && (!defined(GPUCA_GPULIBRARY) || !defined(GPUCA_ALIROOT_LIB)) && defined(GPUCA_HAVE_O2HEADERS) #include "GPUTPCConvert.h" #include "GPUTPCCompression.h" +#include "GPUTPCDecompression.h" #include "GPUITSFitter.h" #include "GPUTPCClusterFinder.h" #include "GPUTrackingRefit.h" @@ -69,6 +70,7 @@ struct GPUConstantMem { tpcTrackers[GPUCA_NSLICES]; GPUTPCConvert tpcConverter; GPUTPCCompression tpcCompressor; + GPUTPCDecompression tpcDecompressor; GPUTPCGMMerger tpcMerger; GPUTRDTrackerGPU trdTrackerGPU; #ifdef GPUCA_HAVE_O2HEADERS diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 8b56e46888e9b..518f0af43aae4 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -511,7 +511,7 @@ inline void GPUReconstruction::RegisterGPUProcessor(T* proc, bool deviceSlave) template inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate) { - static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derrived class"); + static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class"); if (allocate) { proc->SetMaxData(mHostConstantMem->ioPtrs); } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 5b176218984e5..97642734a60b7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -39,6 +39,7 @@ #include "GPUITSFitterKernels.h" #include 
"GPUTPCConvertKernel.h" #include "GPUTPCCompressionKernels.h" +#include "GPUTPCDecompressionKernels.h" #include "GPUTPCClusterFinderKernels.h" #include "GPUTrackingRefitKernel.h" #include "GPUTPCGMO2Output.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h index 0297a1a6cb9d1..81fde1dbfb996 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h +++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h @@ -75,6 +75,9 @@ using namespace GPUCA_NAMESPACE::gpu; #include "GPUTPCCompressionKernels.cxx" #include "GPUTPCCompressionTrackModel.cxx" +//Files for TPC Decompression +#include "GPUTPCDecompressionKernels.cxx" + // Files for TPC Cluster Finder #include "ClusterAccumulator.cxx" #include "GPUTPCCFStreamCompaction.cxx" diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 0811bb6d1ad79..9973720250ad8 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -11,7 +11,7 @@ set(MODULE GPUTracking) -# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -O0") # to uncomment if needed, tired of typing this... +# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... 
include(cmake/helpers.cmake) @@ -176,6 +176,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) DataCompression/GPUTPCCompression.cxx DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx + DataCompression/GPUTPCDecompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx DataCompression/GPUTPCClusterStatistics.cxx TPCClusterFinder/GPUTPCClusterFinder.cxx @@ -469,6 +471,7 @@ endif() set_source_files_properties(DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCDecompressionKernels.cxx TARGET_DIRECTORY ${targetName} PROPERTIES COMPILE_FLAGS "-fno-fast-math -ffp-contract=off") diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx new file mode 100644 index 0000000000000..c70422a206bf2 --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -0,0 +1,94 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCDecompression.cxx +/// \author Gabriele Cimador + +#include "GPUTPCDecompression.h" +#include "GPUTPCCompression.h" +#include "GPUReconstruction.h" +#include "GPUO2DataTypes.h" +#include "GPUMemorySizeScalers.h" +#include "GPULogging.h" + +using namespace GPUCA_NAMESPACE::gpu; + +void GPUTPCDecompression::InitializeProcessor() {} + +void* GPUTPCDecompression::SetPointersInputGPU(void* mem) +{ + SetPointersCompressedClusters(mem, mInputGPU, mInputGPU.nAttachedClusters, mInputGPU.nTracks, mInputGPU.nUnattachedClusters, true); + return mem; +} + +template +void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA) +{ + computePointerWithAlignment(mem, c.qTotU, nClU); // Do not reorder, qTotU ist used as first address in GPUChainTracking::RunTPCCompression + computePointerWithAlignment(mem, c.qMaxU, nClU); + computePointerWithAlignment(mem, c.flagsU, nClU); + computePointerWithAlignment(mem, c.padDiffU, nClU); + computePointerWithAlignment(mem, c.timeDiffU, nClU); + computePointerWithAlignment(mem, c.sigmaPadU, nClU); + computePointerWithAlignment(mem, c.sigmaTimeU, nClU); + computePointerWithAlignment(mem, c.nSliceRowClusters, GPUCA_ROW_COUNT * NSLICES); + + unsigned int nClAreduced = reducedClA ? 
nClA - nTr : nClA; + + if (!(mRec->GetParam().rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) { + return; // Track model disabled, do not allocate memory + } + computePointerWithAlignment(mem, c.qTotA, nClA); + computePointerWithAlignment(mem, c.qMaxA, nClA); + computePointerWithAlignment(mem, c.flagsA, nClA); + computePointerWithAlignment(mem, c.rowDiffA, nClAreduced); + computePointerWithAlignment(mem, c.sliceLegDiffA, nClAreduced); + computePointerWithAlignment(mem, c.padResA, nClAreduced); + computePointerWithAlignment(mem, c.timeResA, nClAreduced); + computePointerWithAlignment(mem, c.sigmaPadA, nClA); + computePointerWithAlignment(mem, c.sigmaTimeA, nClA); + + computePointerWithAlignment(mem, c.qPtA, nTr); + computePointerWithAlignment(mem, c.rowA, nTr); + computePointerWithAlignment(mem, c.sliceA, nTr); + computePointerWithAlignment(mem, c.timeA, nTr); + computePointerWithAlignment(mem, c.padA, nTr); + + computePointerWithAlignment(mem, c.nTrackClusters, nTr); +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){ + computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer); + return mem; +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem){ + computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT); + return mem; +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem){ + computePointerWithAlignment(mem,mUnattachedClustersOffsets,NSLICES * GPUCA_ROW_COUNT); + return mem; +} + +void GPUTPCDecompression::RegisterMemoryAllocation() { + AllocateAndInitializeLate(); + mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput"); + 
mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersGPU,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffersGPU"); + mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersOutput,GPUMemoryResource::MEMORY_OUTPUT,"TPCDecompressionTmpBuffersOutput"); + mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersInput,GPUMemoryResource::MEMORY_INPUT,"TPCDecompressionTmpBuffersInput"); +} + +void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ + //mMaxNativeClustersPerBuffer = 81760; + mMaxNativeClustersPerBuffer = 12000; +} \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h new file mode 100644 index 0000000000000..d2cb749dd79c3 --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -0,0 +1,79 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCDecompression.h +/// \author Gabriele Cimador + +#ifndef GPUTPCDECOMPRESSION_H +#define GPUTPCDECOMPRESSION_H + +#include "GPUDef.h" +#include "GPUProcessor.h" +#include "GPUCommonMath.h" +#include "GPUParam.h" +#include "GPUO2DataTypes.h" + +#ifdef GPUCA_HAVE_O2HEADERS +#include "DataFormatsTPC/CompressedClusters.h" +#else +namespace o2::tpc +{ +struct CompressedClustersPtrs { +}; +struct CompressedClusters { +}; +struct CompressedClustersFlat { +}; +} // namespace o2::tpc +#endif + +namespace GPUCA_NAMESPACE::gpu +{ + +class GPUTPCDecompression : public GPUProcessor +{ + friend class GPUTPCDecompressionKernels; + friend class GPUChainTracking; + + public: +#ifndef GPUCA_GPUCODE + void InitializeProcessor(); + void RegisterMemoryAllocation(); + void SetMaxData(const GPUTrackingInOutPointers& io); + + void* SetPointersInputGPU(void* mem); + void* SetPointersTmpNativeBuffersGPU(void* mem); + void* SetPointersTmpNativeBuffersOutput(void* mem); + void* SetPointersTmpNativeBuffersInput(void* mem); + +#endif + + protected: + constexpr static unsigned int NSLICES = GPUCA_NSLICES; + o2::tpc::CompressedClusters mInputGPU; + + unsigned int mMaxNativeClustersPerBuffer; + unsigned int* mNativeClustersIndex; + unsigned int* mUnattachedClustersOffsets; + o2::tpc::ClusterNative* mTmpNativeClusters; + o2::tpc::ClusterNativeAccess* mClusterNativeAccess; + o2::tpc::ClusterNative* mNativeClustersBuffer; + + template + void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); + + short mMemoryResInputGPU = -1; + short mResourceTmpIndexes = -1; + short mResourceTmpClustersOffsets = -1; + +}; +} // namespace GPUCA_NAMESPACE::gpu +#endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx new file mode 100644 index 0000000000000..1a726d00ebcd2 --- /dev/null +++ 
b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -0,0 +1,201 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCDecompressionKernels.cxx +/// \author Gabriele Cimador + +#include "GPUTPCDecompressionKernels.h" +#include "GPULogging.h" +#include "GPUConstantMem.h" +#include "GPUTPCCompressionTrackModel.h" +#include "GPUCommonAlgorithm.h" +#include + +using namespace GPUCA_NAMESPACE::gpu; +using namespace o2::tpc; + +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + const GPUParam& GPUrestrict() param = processors.param; + + unsigned int offset = 0, lasti = 0; + const unsigned int maxTime = (param.par.continuousMaxTimeBin + 1) * ClusterNative::scaleTimePacked - 1; + + for (unsigned int i = get_global_id(0); i < cmprClusters.nTracks; i += get_global_size(0)) { + while (lasti < i) { + offset += cmprClusters.nTrackClusters[lasti++]; + } + lasti++; + decompressTrack(cmprClusters, param, maxTime, i, offset, decompressor); + } +} + +GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& 
decompressor){ + float zOffset = 0; + unsigned int slice = cmprClusters.sliceA[trackIndex]; + unsigned int row = cmprClusters.rowA[trackIndex]; + GPUTPCCompressionTrackModel track; + unsigned int clusterIndex; + for(clusterIndex = 0; clusterIndex < cmprClusters.nTrackClusters[trackIndex]; clusterIndex++){ + unsigned int pad = 0, time = 0; + if(clusterIndex != 0){ + unsigned char tmpSlice = cmprClusters.sliceLegDiffA[clusterOffset - trackIndex -1]; + bool changeLeg = (tmpSlice >= GPUCA_NSLICES); + if(changeLeg){ + tmpSlice -= GPUCA_NSLICES; + } + if(cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences){ + slice += tmpSlice; + if(slice >= GPUCA_NSLICES){ + slice -= GPUCA_NSLICES; + } + row += cmprClusters.rowDiffA[clusterOffset -trackIndex -1]; + if(row >= GPUCA_ROW_COUNT){ + row -= GPUCA_ROW_COUNT; + } + } else { + slice = tmpSlice; + row = cmprClusters.rowDiffA[clusterOffset -trackIndex -1]; + } + if (changeLeg && track.Mirror()) { + break; + } + if (track.Propagate(param.tpcGeometry.Row2X(row),param.SliceParam[slice].Alpha)){ + break; + } + unsigned int timeTmp = cmprClusters.timeResA[clusterOffset -trackIndex -1]; + if (timeTmp & 0x800000) { + timeTmp |= 0xFF000000; + } + time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f,param.tpcGeometry.LinearZ2Time(slice,track.Z() + zOffset))); + float tmpPad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), param.tpcGeometry.LinearY2Pad(slice, row, track.Y()))); + pad = cmprClusters.padResA[clusterOffset -trackIndex - 1] + ClusterNative::packPad(tmpPad); + time = time & 0xFFFFFF; + pad = (unsigned short)pad; + if (pad >= param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked) { + if (pad >= 0xFFFF - 11968) { // Constant 11968 = (2^15 - MAX_PADS(138) * scalePadPacked(64)) / 2 + pad = 0; + } else { + pad = param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked - 1; + } + } + if (param.par.continuousMaxTimeBin > 0 && time >= maxTime) { + if (time >= 0xFFFFFF - 
544768) { // Constant 544768 = (2^23 - LHCMAXBUNCHES(3564) * MAXORBITS(256) * scaleTimePacked(64) / BCPERTIMEBIN(8)) / 2) + time = 0; + } else { + time = maxTime; + } + } + } else { + time = cmprClusters.timeA[trackIndex]; + pad = cmprClusters.padA[trackIndex]; + } + bool stored; + const auto cluster = decompressTrackStore(cmprClusters, clusterOffset, slice, row, pad, time, decompressor, stored); + float y = param.tpcGeometry.LinearPad2Y(slice, row, cluster.getPad()); + float z = param.tpcGeometry.LinearTime2Z(slice, cluster.getTime()); + if(clusterIndex == 0){ + zOffset = z; + track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex],param); + } + if(clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y,z-zOffset,row)){ + break; + } + clusterOffset++; + } + clusterOffset += cmprClusters.nTrackClusters[trackIndex] - clusterIndex; +} + +GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored){ + unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); + unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row),1u); + const ClusterNative c(time, cmprClusters.flagsA[clusterOffset], pad, cmprClusters.sigmaTimeA[clusterOffset], cmprClusters.sigmaPadA[clusterOffset], cmprClusters.qMaxA[clusterOffset], cmprClusters.qTotA[clusterOffset]); + stored = currentClusterIndex < decompressor.mMaxNativeClustersPerBuffer; + if(stored){ + decompressor.mTmpNativeClusters[tmpBufferIndex + currentClusterIndex] = c; + } + return c; +} + +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() 
GPUSharedMemory& smem, processorType& processors){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer; + const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative; + + unsigned int* offsets = decompressor.mUnattachedClustersOffsets; + for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ + unsigned int slice = i / GPUCA_ROW_COUNT; + unsigned int row = i % GPUCA_ROW_COUNT; + unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); + ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row]; + if (decompressor.mNativeClustersIndex[i] != 0) { + memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0])); + } + ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i]; + unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 
0 : decompressor.mInputGPU.nSliceRowClusters[i]); + decompressHits(cmprClusters, offsets[i], end, clout); + if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) { + for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) { + auto& cl = buffer[k]; + float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins; + if (t < 0) { + t = 0; + } + if (processors.param.par.continuousMaxTimeBin > 0 && t > processors.param.par.continuousMaxTimeBin) { + t = processors.param.par.continuousMaxTimeBin; + } + cl.setTime(t); + } + } + GPUCommonAlgorithm::sort(buffer, buffer + outputAccess->nClusters[slice][row]); + } + +} + +GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer){ + unsigned int time = 0; + unsigned short pad = 0; + for (unsigned int k = start; k < end; k++) { + if (cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences) { + unsigned int timeTmp = cmprClusters.timeDiffU[k]; + if (timeTmp & 0x800000) { + timeTmp |= 0xFF000000; + } + time += timeTmp; + pad += cmprClusters.padDiffU[k]; + } else { + time = cmprClusters.timeDiffU[k]; + pad = cmprClusters.padDiffU[k]; + } + *(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]); + } +} + +/* +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ + for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + o2::tpc::ClusterNativeAccess& clustersNative = *decompressor.mClusterNativeAccess; + unsigned int 
slice = i / GPUCA_ROW_COUNT; + unsigned int row = i % GPUCA_ROW_COUNT; + unsigned int unattachedOffset = (i >= cmprClusters.nSliceRows) ? 0 : cmprClusters.nSliceRowClusters[i]; + (clustersNative.nClusters)[slice][row] = decompressor.mNativeClustersIndex[i] + unattachedOffset; + for(unsigned int j = i+1; j < GPUCA_NSLICES * GPUCA_ROW_COUNT; j++){ + CAMath::AtomicAdd(decompressor.mUnattachedClustersOffsets + j,unattachedOffset); + } + } +}*/ diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h new file mode 100644 index 0000000000000..82041d05fcb7c --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -0,0 +1,59 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCDecompressionKernels.h +/// \author Gabriele Cimador + +#ifndef GPUTPCDECOMPRESSIONKERNELS_H +#define GPUTPCDECOMPRESSIONKERNELS_H + +#include "GPUGeneralKernels.h" +#include "GPUO2DataTypes.h" +#include "GPUParam.h" +#include "GPUConstantMem.h" + +#ifdef GPUCA_HAVE_O2HEADERS +#include "DataFormatsTPC/CompressedClusters.h" +#else +namespace o2::tpc +{ +struct CompressedClusters { +}; +} // namespace o2::tpc +#endif + +namespace GPUCA_NAMESPACE::gpu +{ + +class GPUTPCDecompressionKernels : public GPUKernelTemplate +{ + public: + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCDecompression; } + + enum K : int { + step0attached = 0, + step1unattached = 1, + //prepareAccess = 2 + }; + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors); + GPUd() static void decompressTrack(o2::tpc::CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor); + GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored); + GPUdi() static void decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, o2::tpc::ClusterNative* clusterNativeBuffer); + + GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){ + return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer; + } +}; + +} +#endif // GPUTPCDECOMPRESSIONKERNELS_H diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h 
index 07f9035329847..395bdec043331 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -71,6 +71,8 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -135,6 +137,8 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 192, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -198,7 +202,9 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 3 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448 @@ -262,7 +268,9 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define 
GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -322,6 +330,12 @@ #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256 #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 256 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step1unattached + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 256 + #endif #ifndef GPUCA_LB_GPUTPCCFDecodeZS #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 403aa0dae2800..1a0aba3533b4f 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -105,6 +105,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors() if (GetRecoSteps() & RecoStep::TPCCompression) { mRec->RegisterGPUProcessor(&processors()->tpcCompressor, GetRecoStepsGPU() & RecoStep::TPCCompression); } + if (GetRecoSteps() & RecoStep::TPCDecompression) { + mRec->RegisterGPUProcessor(&processors()->tpcDecompressor, GetRecoStepsGPU() & RecoStep::TPCDecompression); + } if (GetRecoSteps() & RecoStep::TPCClusterFinding) { for (unsigned int i = 0; i < NSLICES; i++) { mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); @@ -149,6 +152,9 @@ void GPUChainTracking::RegisterGPUProcessors() if (GetRecoStepsGPU() & RecoStep::TPCCompression) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcCompressor, &processors()->tpcCompressor); } + if (GetRecoStepsGPU() & RecoStep::TPCDecompression) { + mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcDecompressor, &processors()->tpcDecompressor); + } if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) { for 
(unsigned int i = 0; i < NSLICES; i++) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 3c622751acad1..e0a4ba3ccd8a2 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -44,7 +44,6 @@ int GPUChainTracking::RunTPCCompression() } SetupGPUProcessor(&Compressor, true); new (Compressor.mMemory) GPUTPCCompression::memory; - WriteToConstantMemory(myStep, (char*)&processors()->tpcCompressor - (char*)processors(), &CompressorShadow, sizeof(CompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Compressor, 0); runKernel(GetGridAutoStep(0, RecoStep::TPCCompression), krnlRunRangeNone, krnlEventNone, CompressorShadow.mClusterStatus, Compressor.mMaxClusters * sizeof(CompressorShadow.mClusterStatus[0])); @@ -84,7 +83,6 @@ int GPUChainTracking::RunTPCCompression() SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes. 
outputStream = OutputStream(); } - if (ProcessingSettings().tpcCompressionGatherMode >= 2) { if (ProcessingSettings().tpcCompressionGatherMode == 2) { void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat); @@ -208,11 +206,125 @@ int GPUChainTracking::RunTPCCompression() int GPUChainTracking::RunTPCDecompression() { #ifdef GPUCA_HAVE_O2HEADERS + // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); + ClusterNativeAccess* original = new ClusterNativeAccess; + original->clustersLinear = new ClusterNative[mIOPtrs.clustersNative->nClustersTotal]; + memcpy((void*)original->clustersLinear,mIOPtrs.clustersNative->clustersLinear,mIOPtrs.clustersNative->nClustersTotal*sizeof(mIOPtrs.clustersNative->clustersLinear[0])); + memcpy((void*)original->nClusters,mIOPtrs.clustersNative->nClusters,NSLICES*GPUCA_ROW_COUNT*sizeof(mIOPtrs.clustersNative->nClusters[0][0])); + original->setOffsetPtrs(); + + RecoStep myStep = RecoStep::TPCDecompression; + bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true + GPUTPCDecompression& Decompressor = processors()->tpcDecompressor; + GPUTPCDecompression& DecompressorShadow = doGPU ? 
processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); + + CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters; + CompressedClusters& inputGPU = Decompressor.mInputGPU; + inputGPU.nAttachedClusters = cmprClsHost.nAttachedClusters; + inputGPU.nUnattachedClusters = cmprClsHost.nUnattachedClusters; + inputGPU.nTracks = cmprClsHost.nTracks; + inputGPU.nAttachedClustersReduced = inputGPU.nAttachedClusters - inputGPU.nTracks; + inputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT; + inputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask; + inputGPU.solenoidBz = param().bzkG; + inputGPU.maxTimeBin = param().par.continuousMaxTimeBin; + SetupGPUProcessor(&Decompressor, true); + + size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); + + int outputStream = 0; + bool toGPU = true; + CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; + SynchronizeStream(outputStream); + + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, cmprClsHost.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, toGPU); + + GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsU, cmprClsHost.flagsU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.flagsU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU, 
cmprClsHost.padDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.padDiffU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU, cmprClsHost.timeDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.timeDiffU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU); + + GPUMemCpyAlways(myStep, inputGPUShadow.qTotA, cmprClsHost.qTotA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qTotA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA, cmprClsHost.qMaxA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qMaxA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsA, cmprClsHost.flagsA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.flagsA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA, cmprClsHost.rowDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA, cmprClsHost.sliceLegDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padResA, cmprClsHost.padResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.padResA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeResA, cmprClsHost.timeResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.timeResA[0]), outputStream, toGPU); + 
GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA, cmprClsHost.sigmaPadA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA, cmprClsHost.sigmaTimeA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, toGPU); + + GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU); + + runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0])); + runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); + + mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; + + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, 0); + SynchronizeStream(0); + + unsigned int offset = 0; + unsigned int decodedAttachedClusters = 0; + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + unsigned int linearIndex = i * GPUCA_ROW_COUNT + j; + unsigned int unattachedOffset = (linearIndex >= 
cmprClsHost.nSliceRows) ? 0 : cmprClsHost.nSliceRowClusters[linearIndex]; + (mClusterNativeAccess->nClusters)[i][j] = Decompressor.mNativeClustersIndex[linearIndex] + unattachedOffset; + Decompressor.mUnattachedClustersOffsets[linearIndex] = offset; + offset += unattachedOffset; + decodedAttachedClusters += Decompressor.mNativeClustersIndex[linearIndex]; + } + } + LOGP(info,"decoded = {}",decodedAttachedClusters); + TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression,Decompressor.mResourceTmpClustersOffsets,0); + if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) { + GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters); + } else { + GPUInfo("My version: all attached clusters have been decoded"); + } + + Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput; + DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer; + mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer; + mClusterNativeAccess->setOffsetPtrs(); + mIOPtrs.clustersNative = mClusterNativeAccess.get(); + *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; + processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; + WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); + TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); + mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput; + mClusterNativeAccess->setOffsetPtrs(); + 
*mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; + processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess; + + runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone); + + ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative]; + ClusterNativeAccess gpuBuffer = *mInputsHost->mPclusterNativeAccess; + gpuBuffer.clustersLinear = tmpBuffer; + //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); + GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); + gpuBuffer.setOffsetPtrs(); + TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; - this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); + //this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); return this->mInputsHost->mPclusterNativeOutput; }; auto& gatherTimer = getTimer("TPCDecompression", 0); @@ -224,7 +336,7 @@ int GPUChainTracking::RunTPCDecompression() gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); if (mRec->IsGPU()) { - AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, 
sizeof(processorsShadow()->ioPtrs), 0); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; @@ -234,6 +346,50 @@ int GPUChainTracking::RunTPCDecompression() TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); SynchronizeStream(0); } + + const ClusterNativeAccess* decoded = &gpuBuffer; //mIOPtrs.clustersNative; + // original = (ClusterNativeAccess*)mIOPtrs.clustersNative; + unsigned int decodingErrors = 0; + std::vector tmpClusters; + if (param().rec.tpc.rejectionStrategy == GPUSettings::RejectionNone) { // verification does not make sense if we reject clusters during compression + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + if (original->nClusters[i][j] != decoded->nClusters[i][j]) { + GPUError("Number of clusters mismatch slice %u row %u: expected %d v.s. decoded %d", i, j, original->nClusters[i][j], decoded->nClusters[i][j]); + decodingErrors++; + continue; + } + tmpClusters.resize(original->nClusters[i][j]); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + tmpClusters[k] = original->clusters[i][j][k]; + if (param().rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) { + GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param()); + GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param()); + } + } + std::sort(tmpClusters.begin(), tmpClusters.end()); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + const o2::tpc::ClusterNative& c1 = tmpClusters[k]; + const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k]; + if (!(c1 == c2)) { + if (decodingErrors++ < 100) { + //GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, 
(int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot); + //GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot); + } + } + } + } + } + if (decodingErrors) { + GPUWarning("Errors during cluster decoding %u\n", decodingErrors); + } else { + GPUInfo("Cluster decoding verification: PASSED"); + } + } + delete[] tmpBuffer; + // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); #endif return 0; } diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 8df8094fe44d3..5805bc386ab2e 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -667,7 +667,7 @@ int RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingUse, chainTrackingAsync->mIOPtrs.rawClusters[i] = nullptr; chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } - chainTrackingAsync->mIOPtrs.clustersNative = nullptr; + chainTrackingAsync->mIOPtrs.clustersNative = chainTrackingUse->mIOPtrs.clustersNative; // TODO: revert to nullptr before merging recAsync->SetResetTimers(iRun < configStandalone.runsInit); tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { @@ -948,6 +948,7 @@ int main(int argc, char** argv) printf("Error unregistering memory\n"); } } + //exit(0); rec->Exit(); if (!configStandalone.noprompt) { diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index b06749010d2d9..d7302f17271d2 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -73,6 +73,8 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" LB simple)
o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" LB simple) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" LB simple) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" LB simple) o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" LB single) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" LB single) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" LB single)