From 78b07b8949dd33cfc2a6a513b6466f6b97f6a602 Mon Sep 17 00:00:00 2001 From: Gabriele Cimador Date: Thu, 16 Nov 2023 09:53:14 +0100 Subject: [PATCH 01/25] test --- .../Global/GPUChainTrackingCompression.cxx | 2 +- .../.cmake/api/v1/query/cache-v2 | 0 .../.cmake/api/v1/query/cmakeFiles-v1 | 0 .../.cmake/api/v1/query/codemodel-v2 | 0 .../.cmake/api/v1/query/toolchains-v1 | 0 cmake-build-debug/DartConfiguration.tcl | 106 ++++++++++++++++++ 6 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 cmake-build-debug/.cmake/api/v1/query/cache-v2 create mode 100644 cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 create mode 100644 cmake-build-debug/.cmake/api/v1/query/codemodel-v2 create mode 100644 cmake-build-debug/.cmake/api/v1/query/toolchains-v1 create mode 100644 cmake-build-debug/DartConfiguration.tcl diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 3c622751acad1..edbbf28be8929 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -22,7 +22,7 @@ #include "TPCClusterDecompressor.h" #endif #include "utils/strtag.h" - +//prova using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; diff --git a/cmake-build-debug/.cmake/api/v1/query/cache-v2 b/cmake-build-debug/.cmake/api/v1/query/cache-v2 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 b/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 b/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 b/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl new file mode 100644 index 0000000000000..2077a80fa68de --- /dev/null +++ b/cmake-build-debug/DartConfiguration.tcl @@ -0,0 +1,106 @@ +# This file is configured by CMake automatically as DartConfiguration.tcl +# If you choose not to use CMake, this file may be hand configured, by +# filling in the required variables. + + +# Configuration directories and files +SourceDirectory: /experiments/alice/cimag/o2/O2 +BuildDirectory: /experiments/alice/cimag/o2/O2/cmake-build-debug + +# Where to place the cost data store +CostDataFile: + +# Site is something like machine.domain, i.e. pragmatic.crd +Site: gr3srv.ts.infn.it + +# Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++ +BuildName: Linux-c++ + +# Subprojects +LabelsForSubprojects: + +# Submission information +SubmitURL: http:// +SubmitInactivityTimeout: + +# Dashboard start time +NightlyStartTime: 00:00:00 EDT + +# Commands for the build/test/submit cycle +ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2" +MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}" +DefaultCTestConfigurationType: Release + +# version control +UpdateVersionOnly: + +# CVS options +# Default is "-d -P -A" +CVSCommand: +CVSUpdateOptions: + +# Subversion options +SVNCommand: +SVNOptions: +SVNUpdateOptions: + +# Git options +GITCommand: /usr/bin/git +GITInitSubmodules: +GITUpdateOptions: +GITUpdateCustom: + +# Perforce options +P4Command: +P4Client: +P4Options: +P4UpdateOptions: +P4UpdateCustom: + +# Generic update command +UpdateCommand: /usr/bin/git +UpdateOptions: +UpdateType: git + +# Compiler info +Compiler: /usr/bin/c++ +CompilerVersion: 4.8.5 + +# Dynamic analysis (MemCheck) +PurifyCommand: +ValgrindCommand: +ValgrindCommandOptions: +DrMemoryCommand: +DrMemoryCommandOptions: +CudaSanitizerCommand: +CudaSanitizerCommandOptions: +MemoryCheckType: +MemoryCheckSanitizerOptions: +MemoryCheckCommand: /usr/bin/valgrind +MemoryCheckCommandOptions: +MemoryCheckSuppressionFile: + +# Coverage +CoverageCommand: /usr/bin/gcov +CoverageExtraFlags: -l + +# Testing options +# TimeOut is the amount of time in seconds to wait for processes +# to complete during testing. After TimeOut seconds, the +# process will be summarily terminated. +# Currently set to 25 minutes +TimeOut: 1500 + +# During parallel testing CTest will not start a new test if doing +# so would cause the system load to exceed this value. +TestLoad: + +UseLaunchers: +CurlOptions: +# warning, if you add new options here that have to do with submit, +# you have to update cmCTestSubmitCommand.cxx + +# For CTest submissions that timeout, these options +# specify behavior for retrying the submission +CTestSubmitRetryDelay: 5 +CTestSubmitRetryCount: 3 From 7d4340dac4e6b51f9b01940d2c4f73be92e6b080 Mon Sep 17 00:00:00 2001 From: cima22 Date: Fri, 17 Nov 2023 09:18:58 +0100 Subject: [PATCH 02/25] test --- CMakeLists.txt | 3 +++ GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf1d83fbb6644..feabbddfcf0f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,9 @@ project(O2 LANGUAGES C CXX VERSION 1.2.0) include(CTest) +set(CUDA_COMPUTETARGET 75) +set(GPUCA_NO_FAST_MATH 1) + # Project wide setup # Would better fit inside GPU/CMakeLists.txt, but include GPU/Common directly diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index b8f491a6f5767..8ca42f8e54af6 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -24,7 +24,7 @@ using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; - +//prova int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function allocator, const GPUParam& param) { CompressedClusters c; From 31e10d02e2f2db2288d2e50c4a2305f6ee236b2a Mon Sep 17 00:00:00 2001 From: cima22 Date: Fri, 17 Nov 2023 09:25:54 +0100 Subject: [PATCH 03/25] test2 --- GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index 8ca42f8e54af6..b8f491a6f5767 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -24,7 +24,7 @@ using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; -//prova + int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function allocator, const GPUParam& param) { CompressedClusters c; From ac8042f7a08baa4ca3ef75836f8e97d70a84f1c7 Mon Sep 17 00:00:00 2001 From: cima22 Date: Fri, 17 Nov 2023 15:16:42 +0100 Subject: [PATCH 04/25] Dummy Kernel Implementation --- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 1 + .../Base/GPUReconstructionIncludesDevice.h | 3 ++ GPU/GPUTracking/CMakeLists.txt | 1 + .../GPUTPCDecompressionKernels.cxx | 25 +++++++++++++ .../GPUTPCDecompressionKernels.h | 35 +++++++++++++++++++ .../Definitions/GPUDefGPUParameters.h | 7 ++++ .../Global/GPUChainTrackingCompression.cxx | 4 ++- 7 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 5b176218984e5..97642734a60b7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -39,6 +39,7 @@ #include "GPUITSFitterKernels.h" #include "GPUTPCConvertKernel.h" #include "GPUTPCCompressionKernels.h" +#include "GPUTPCDecompressionKernels.h" #include "GPUTPCClusterFinderKernels.h" #include "GPUTrackingRefitKernel.h" #include "GPUTPCGMO2Output.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h index 0297a1a6cb9d1..81fde1dbfb996 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h +++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h @@ -75,6 +75,9 @@ using namespace GPUCA_NAMESPACE::gpu; #include "GPUTPCCompressionKernels.cxx" #include "GPUTPCCompressionTrackModel.cxx" +//Files for TPC Decompression +#include "GPUTPCDecompressionKernels.cxx" + // Files for TPC Cluster Finder #include "ClusterAccumulator.cxx" #include "GPUTPCCFStreamCompaction.cxx" diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 0811bb6d1ad79..390773b11a81e 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -176,6 +176,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) DataCompression/GPUTPCCompression.cxx DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx DataCompression/GPUTPCClusterStatistics.cxx TPCClusterFinder/GPUTPCClusterFinder.cxx diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx new file mode 100644 index 0000000000000..693d6663d69a0 --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -0,0 +1,25 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCDecompressionKernels.cxx +/// \author Gabriele Cimador + +#include "GPUTPCDecompressionKernels.h" +#include + +using namespace GPUCA_NAMESPACE::gpu; +using namespace o2::tpc; + +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors) +{ + printf("Hello world!"); +} \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h new file mode 100644 index 0000000000000..a9a8f70dcba1e --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -0,0 +1,35 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCDecompressionKernels.h +/// \author Gabriele Cimador + +#ifndef GPUTPCDECOMPRESSIONKERNELS_H +#define GPUTPCDECOMPRESSIONKERNELS_H + +#include "GPUGeneralKernels.h" + +namespace GPUCA_NAMESPACE::gpu +{ + +class GPUTPCDecompressionKernels : public GPUKernelTemplate +{ + public: + enum K : int { + test = 0, + }; + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors); +}; + +} +#endif // GPUTPCDECOMPRESSIONKERNELS_H diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 07f9035329847..d88fc055099fb 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -71,6 +71,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -135,6 +136,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_test 192, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -198,6 +200,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448 @@ -262,6 +265,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128 + #define GPUCA_LB_GPUTPCDecompressionKernels_test 128 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 @@ -322,6 +326,9 @@ #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256 #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_test + #define GPUCA_LB_GPUTPCDecompressionKernels_test 256 + #endif #ifndef GPUCA_LB_GPUTPCCFDecodeZS #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index edbbf28be8929..ecff409ae1522 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -22,7 +22,7 @@ #include "TPCClusterDecompressor.h" #endif #include "utils/strtag.h" -//prova + using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; @@ -223,7 +223,9 @@ int GPUChainTracking::RunTPCDecompression() } gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); + runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); if (mRec->IsGPU()) { + runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); From 64c05c4b9384fd18b4416aaee6368a5d056f9bdb Mon Sep 17 00:00:00 2001 From: cima22 Date: Sun, 19 Nov 2023 15:24:56 +0100 Subject: [PATCH 05/25] Working dummy kernel --- .../DataCompression/GPUTPCDecompressionKernels.cxx | 3 ++- GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h | 2 ++ GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx | 4 +++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index 693d6663d69a0..ccc289dd793c3 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -14,6 +14,7 @@ #include "GPUTPCDecompressionKernels.h" #include +#include "GPULogging.h" using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; @@ -21,5 +22,5 @@ using namespace o2::tpc; template <> GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors) { - printf("Hello world!"); + //LOGP(info, "===== Hello world!"); } \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h index a9a8f70dcba1e..6be320026866a 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -23,6 +23,8 @@ namespace GPUCA_NAMESPACE::gpu class GPUTPCDecompressionKernels : public GPUKernelTemplate { public: + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCDecompression; } + enum K : int { test = 0, }; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index ecff409ae1522..0901ad0c4576e 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -28,6 +28,7 @@ using namespace o2::tpc; int GPUChainTracking::RunTPCCompression() { + LOGP(info, "====== Compression"); #ifdef GPUCA_HAVE_O2HEADERS mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR")); RecoStep myStep = RecoStep::TPCCompression; @@ -207,6 +208,7 @@ int GPUChainTracking::RunTPCCompression() int GPUChainTracking::RunTPCDecompression() { + LOGP(info, "====== Decompression"); #ifdef GPUCA_HAVE_O2HEADERS const auto& threadContext = GetThreadContext(); TPCClusterDecompressor decomp; @@ -223,7 +225,7 @@ int GPUChainTracking::RunTPCDecompression() } gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); - runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); + LOGP(info, "====== isGPU: {} ", mRec->IsGPU()); if (mRec->IsGPU()) { runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); From 5257993bc2cfebf4fe706d318a917f0ba5400ce1 Mon Sep 17 00:00:00 2001 From: cima22 Date: Mon, 20 Nov 2023 15:28:17 +0100 Subject: [PATCH 06/25] Created GPUTPCDecompressor context and added to constant memory model --- GPU/GPUTracking/Base/GPUConstantMem.h | 2 ++ GPU/GPUTracking/CMakeLists.txt | 1 + .../DataCompression/GPUTPCDecompression.cxx | 15 ++++++++ .../DataCompression/GPUTPCDecompression.h | 34 +++++++++++++++++++ .../Global/GPUChainTrackingCompression.cxx | 5 +++ 5 files changed, 57 insertions(+) create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompression.h diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index b9b93100d5bfd..d18e7863e2001 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -46,6 +46,7 @@ class GPUTRDTracker_t #if defined(GPUCA_NOCOMPAT_ALLCINT) && (!defined(GPUCA_GPULIBRARY) || !defined(GPUCA_ALIROOT_LIB)) && defined(GPUCA_HAVE_O2HEADERS) #include "GPUTPCConvert.h" #include "GPUTPCCompression.h" +#include "GPUTPCDecompression.h" #include "GPUITSFitter.h" #include "GPUTPCClusterFinder.h" #include "GPUTrackingRefit.h" @@ -69,6 +70,7 @@ struct GPUConstantMem { tpcTrackers[GPUCA_NSLICES]; GPUTPCConvert tpcConverter; GPUTPCCompression tpcCompressor; + GPUTPCDecompression tpcDecompressor; GPUTPCGMMerger tpcMerger; GPUTRDTrackerGPU trdTrackerGPU; #ifdef GPUCA_HAVE_O2HEADERS diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 390773b11a81e..db2d3f7c0accd 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -176,6 +176,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) DataCompression/GPUTPCCompression.cxx DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx DataCompression/GPUTPCDecompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx DataCompression/GPUTPCClusterStatistics.cxx diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx new file mode 100644 index 0000000000000..34a790703fdfc --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -0,0 +1,15 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCDecompression.cxx +/// \author Gabriele Cimador + +#include "GPUTPCDecompression.h" diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h new file mode 100644 index 0000000000000..c7e3ccb1eb762 --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -0,0 +1,34 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCDecompression.h +/// \author Gabriele Cimador + +#ifndef GPUTPCDECOMPRESSION_H +#define GPUTPCDECOMPRESSION_H + +#include "GPUDef.h" +#include "GPUProcessor.h" +#include "GPUCommonMath.h" +#include "GPUParam.h" + +namespace GPUCA_NAMESPACE::gpu +{ +class GPUTPCGMMerger; + +class GPUTPCDecompression : public GPUProcessor +{ + friend class GPUTPCDecmpressionKernels; + friend class GPUChainTracking; + +}; +} +#endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 0901ad0c4576e..f75f79f3cc271 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -209,7 +209,10 @@ int GPUChainTracking::RunTPCCompression() int GPUChainTracking::RunTPCDecompression() { LOGP(info, "====== Decompression"); + #ifdef GPUCA_HAVE_O2HEADERS + RecoStep myStep = RecoStep::TPCDecompression; + bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true const auto& threadContext = GetThreadContext(); TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { @@ -241,3 +244,5 @@ int GPUChainTracking::RunTPCDecompression() #endif return 0; } + + From 7405a42e3b647217d86e66fc6a3388d2cce60328 Mon Sep 17 00:00:00 2001 From: cima22 Date: Wed, 22 Nov 2023 12:04:56 +0100 Subject: [PATCH 07/25] GPUTPCDecompressor now able to move variables to GPU version --- .../DataCompression/GPUTPCDecompression.cxx | 16 ++++++++++++++++ .../DataCompression/GPUTPCDecompression.h | 9 ++++++++- .../GPUTPCDecompressionKernels.cxx | 12 ++++++++++-- GPU/GPUTracking/Global/GPUChainTracking.cxx | 6 ++++++ .../Global/GPUChainTrackingCompression.cxx | 8 +++++++- 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 34a790703fdfc..5a1d3aa84e3ef 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -13,3 +13,19 @@ /// \author Gabriele Cimador #include "GPUTPCDecompression.h" +#include "GPUReconstruction.h" +#include "GPUO2DataTypes.h" +#include "GPUMemorySizeScalers.h" + +using namespace GPUCA_NAMESPACE::gpu; + +void GPUTPCDecompression::InitializeProcessor() {} + +void GPUTPCDecompression::RegisterMemoryAllocation() +{ +} + +void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io) +{ + +} \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index c7e3ccb1eb762..9e4def065121f 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -22,13 +22,20 @@ namespace GPUCA_NAMESPACE::gpu { -class GPUTPCGMMerger; class GPUTPCDecompression : public GPUProcessor { friend class GPUTPCDecmpressionKernels; friend class GPUChainTracking; + public: + unsigned int test = 42; + unsigned int* testP; +#ifndef GPUCA_GPUCODE + void InitializeProcessor(); + void RegisterMemoryAllocation(); + void SetMaxData(const GPUTrackingInOutPointers& io); +#endif }; } #endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index ccc289dd793c3..9cd1ef7484473 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -13,8 +13,9 @@ /// \author Gabriele Cimador #include "GPUTPCDecompressionKernels.h" -#include #include "GPULogging.h" +#include "GPUConstantMem.h" + using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; @@ -22,5 +23,12 @@ using namespace o2::tpc; template <> GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors) { - //LOGP(info, "===== Hello world!"); + GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor; + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + unsigned int x = decompressor.test; + unsigned int y = compressor.mTest; + if (!iThread && !iBlock) { + GPUInfo("==== Test: X={%d} Y={%d} \n", x, y); + } + } \ No newline at end of file diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 403aa0dae2800..1a0aba3533b4f 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -105,6 +105,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors() if (GetRecoSteps() & RecoStep::TPCCompression) { mRec->RegisterGPUProcessor(&processors()->tpcCompressor, GetRecoStepsGPU() & RecoStep::TPCCompression); } + if (GetRecoSteps() & RecoStep::TPCDecompression) { + mRec->RegisterGPUProcessor(&processors()->tpcDecompressor, GetRecoStepsGPU() & RecoStep::TPCDecompression); + } if (GetRecoSteps() & RecoStep::TPCClusterFinding) { for (unsigned int i = 0; i < NSLICES; i++) { mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); @@ -149,6 +152,9 @@ void GPUChainTracking::RegisterGPUProcessors() if (GetRecoStepsGPU() & RecoStep::TPCCompression) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcCompressor, &processors()->tpcCompressor); } + if (GetRecoStepsGPU() & RecoStep::TPCDecompression) { + mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcDecompressor, &processors()->tpcDecompressor); + } if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) { for (unsigned int i = 0; i < NSLICES; i++) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index f75f79f3cc271..56c8209ae0d81 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -45,7 +45,6 @@ int GPUChainTracking::RunTPCCompression() } SetupGPUProcessor(&Compressor, true); new (Compressor.mMemory) GPUTPCCompression::memory; - WriteToConstantMemory(myStep, (char*)&processors()->tpcCompressor - (char*)processors(), &CompressorShadow, sizeof(CompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Compressor, 0); runKernel(GetGridAutoStep(0, RecoStep::TPCCompression), krnlRunRangeNone, krnlEventNone, CompressorShadow.mClusterStatus, Compressor.mMaxClusters * sizeof(CompressorShadow.mClusterStatus[0])); @@ -211,9 +210,16 @@ int GPUChainTracking::RunTPCDecompression() LOGP(info, "====== Decompression"); #ifdef GPUCA_HAVE_O2HEADERS + mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); RecoStep myStep = RecoStep::TPCDecompression; bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true + GPUTPCDecompression& Decompressor = processors()->tpcDecompressor; + GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); + SetupGPUProcessor(&Decompressor, false); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); + TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); + TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; From ceb00a6cdfc6cbe65f712d2b2fbac8149705cee7 Mon Sep 17 00:00:00 2001 From: cima22 Date: Thu, 23 Nov 2023 15:34:42 +0100 Subject: [PATCH 08/25] Test memcopy from host to gpu working and fixed typo --- GPU/GPUTracking/Base/GPUReconstruction.h | 2 +- GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx | 9 ++++++++- GPU/GPUTracking/DataCompression/GPUTPCDecompression.h | 5 ++++- .../DataCompression/GPUTPCDecompressionKernels.cxx | 3 +-- GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx | 8 +++++--- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 8b56e46888e9b..518f0af43aae4 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -511,7 +511,7 @@ inline void GPUReconstruction::RegisterGPUProcessor(T* proc, bool deviceSlave) template inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate) { - static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derrived class"); + static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class"); if (allocate) { proc->SetMaxData(mHostConstantMem->ioPtrs); } diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 5a1d3aa84e3ef..b0ed4fded0eb1 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -21,8 +21,15 @@ using namespace GPUCA_NAMESPACE::gpu; void GPUTPCDecompression::InitializeProcessor() {} -void GPUTPCDecompression::RegisterMemoryAllocation() +void* GPUTPCDecompression::SetPointersMemory(void* mem) { + computePointerWithAlignment(mem, testP); + return mem; +} + +void GPUTPCDecompression::RegisterMemoryAllocation() +{ AllocateAndInitializeLate(); + mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory"); } void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 9e4def065121f..86c851e015036 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -30,7 +30,10 @@ class GPUTPCDecompression : public GPUProcessor public: unsigned int test = 42; - unsigned int* testP; + unsigned int* testP = nullptr; + + void* SetPointersMemory(void* mem); + #ifndef GPUCA_GPUCODE void InitializeProcessor(); void RegisterMemoryAllocation(); diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index 9cd1ef7484473..3179cb6d5650b 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -26,9 +26,8 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadPushNonPersistentMemory(qStr2Tag("TPCDCMPR")); + // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); RecoStep myStep = RecoStep::TPCDecompression; bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true GPUTPCDecompression& Decompressor = processors()->tpcDecompressor; GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); SetupGPUProcessor(&Decompressor, false); + new (Decompressor.testP) unsigned int; + *Decompressor.testP = 145; WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); - + LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP); TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; @@ -234,7 +236,6 @@ int GPUChainTracking::RunTPCDecompression() } gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); - LOGP(info, "====== isGPU: {} ", mRec->IsGPU()); if (mRec->IsGPU()) { runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); @@ -247,6 +248,7 @@ int GPUChainTracking::RunTPCDecompression() TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); SynchronizeStream(0); } + // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); #endif return 0; } From ae2b6c44c2359ad25826d47c20bbac4cc5800335 Mon Sep 17 00:00:00 2001 From: cima22 Date: Sat, 2 Dec 2023 10:11:18 +0100 Subject: [PATCH 09/25] Created inputBuffer on GPU for holding compressed clusters for decompression --- .../DataCompression/GPUTPCDecompression.cxx | 52 +++++++++++++++++-- .../DataCompression/GPUTPCDecompression.h | 30 ++++++++++- .../Global/GPUChainTrackingCompression.cxx | 18 ++++--- cmake-build-debug/DartConfiguration.tcl | 4 +- 4 files changed, 89 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index b0ed4fded0eb1..760e5174bd827 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -13,6 +13,7 @@ /// \author Gabriele Cimador #include "GPUTPCDecompression.h" +#include "GPUTPCCompression.h" #include "GPUReconstruction.h" #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" @@ -21,18 +22,61 @@ using namespace GPUCA_NAMESPACE::gpu; void GPUTPCDecompression::InitializeProcessor() {} +void* GPUTPCDecompression::SetPointersInputGPU(void* mem) +{ + SetPointersCompressedClusters(mem, *mInputGPU, mInputGPU->nAttachedClusters, mInputGPU->nTracks, mInputGPU->nUnattachedClusters, true); + return mem; +} + +template +void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA) +{ + computePointerWithAlignment(mem, c.qTotU, nClU); // Do not reorder, qTotU ist used as first address in GPUChainTracking::RunTPCCompression + computePointerWithAlignment(mem, c.qMaxU, nClU); + computePointerWithAlignment(mem, c.flagsU, nClU); + computePointerWithAlignment(mem, c.padDiffU, nClU); + computePointerWithAlignment(mem, c.timeDiffU, nClU); + computePointerWithAlignment(mem, c.sigmaPadU, nClU); + computePointerWithAlignment(mem, c.sigmaTimeU, nClU); + computePointerWithAlignment(mem, c.nSliceRowClusters, GPUCA_ROW_COUNT * NSLICES); + + unsigned int nClAreduced = reducedClA ? nClA - nTr : nClA; + + if (!(mRec->GetParam().rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) { + return; // Track model disabled, do not allocate memory + } + computePointerWithAlignment(mem, c.qTotA, nClA); + computePointerWithAlignment(mem, c.qMaxA, nClA); + computePointerWithAlignment(mem, c.flagsA, nClA); + computePointerWithAlignment(mem, c.rowDiffA, nClAreduced); + computePointerWithAlignment(mem, c.sliceLegDiffA, nClAreduced); + computePointerWithAlignment(mem, c.padResA, nClAreduced); + computePointerWithAlignment(mem, c.timeResA, nClAreduced); + computePointerWithAlignment(mem, c.sigmaPadA, nClA); + computePointerWithAlignment(mem, c.sigmaTimeA, nClA); + + computePointerWithAlignment(mem, c.qPtA, nTr); + computePointerWithAlignment(mem, c.rowA, nTr); + computePointerWithAlignment(mem, c.sliceA, nTr); + computePointerWithAlignment(mem, c.timeA, nTr); + computePointerWithAlignment(mem, c.padA, nTr); + + computePointerWithAlignment(mem, c.nTrackClusters, nTr); +} + void* GPUTPCDecompression::SetPointersMemory(void* mem) { + computePointerWithAlignment(mem, mInputGPU); computePointerWithAlignment(mem, testP); return mem; } -void GPUTPCDecompression::RegisterMemoryAllocation() -{ AllocateAndInitializeLate(); +void GPUTPCDecompression::RegisterMemoryAllocation() { + AllocateAndInitializeLate(); + mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT, "TPCDecompressionInput"); mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory"); } -void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io) -{ +void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ } \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 86c851e015036..37cc883e1db9c 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -20,6 +20,20 @@ #include "GPUCommonMath.h" #include "GPUParam.h" +#ifdef GPUCA_HAVE_O2HEADERS +#include "DataFormatsTPC/CompressedClusters.h" +#else +namespace o2::tpc +{ +struct CompressedClustersPtrs { +}; +struct CompressedClusters { +}; +struct CompressedClustersFlat { +}; +} // namespace o2::tpc +#endif + namespace GPUCA_NAMESPACE::gpu { @@ -32,13 +46,25 @@ class GPUTPCDecompression : public GPUProcessor unsigned int test = 42; unsigned int* testP = nullptr; - void* SetPointersMemory(void* mem); - #ifndef GPUCA_GPUCODE void InitializeProcessor(); void RegisterMemoryAllocation(); void SetMaxData(const GPUTrackingInOutPointers& io); + + void* SetPointersInputGPU(void* mem); + void* SetPointersMemory(void* mem); #endif + + protected: + constexpr static unsigned int NSLICES = GPUCA_NSLICES; + + template + void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); + + o2::tpc::CompressedClusters* mInputGPU = nullptr; + + short mMemoryResInputGPU = -1; + }; } #endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index d6cd5a26c0028..2de1f8d08c649 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -29,7 +29,7 @@ using namespace o2::tpc; int GPUChainTracking::RunTPCCompression() { LOGP(info, "====== Compression"); -#ifdef GPUCA_HAVE_O2HEADERS +//#ifdef GPUCA_HAVE_O2HEADERS mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR")); RecoStep myStep = RecoStep::TPCCompression; bool doGPU = GetRecoStepsGPU() & RecoStep::TPCCompression; @@ -201,7 +201,7 @@ int GPUChainTracking::RunTPCCompression() ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec); } mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR")); -#endif +//#endif return 0; } @@ -209,7 +209,7 @@ int GPUChainTracking::RunTPCDecompression() { LOGP(info, "====== Decompression"); -#ifdef GPUCA_HAVE_O2HEADERS +//#ifdef GPUCA_HAVE_O2HEADERS // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); RecoStep myStep = RecoStep::TPCDecompression; bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true @@ -217,11 +217,16 @@ int GPUChainTracking::RunTPCDecompression() GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); SetupGPUProcessor(&Decompressor, false); + new (Decompressor.testP) unsigned int; *Decompressor.testP = 145; + LOGP(info,"==== mInputGPU pointer host: {} -- dev: {}", (void*)Decompressor.mInputGPU,(void*)DecompressorShadow.mInputGPU); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); + LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP); + TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; @@ -238,6 +243,7 @@ int GPUChainTracking::RunTPCDecompression() mIOPtrs.clustersNative = mClusterNativeAccess.get(); if (mRec->IsGPU()) { runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); @@ -249,8 +255,6 @@ int GPUChainTracking::RunTPCDecompression() SynchronizeStream(0); } // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); -#endif +//#endif return 0; -} - - +} \ No newline at end of file diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl index 2077a80fa68de..09b3136cfee5f 100644 --- a/cmake-build-debug/DartConfiguration.tcl +++ b/cmake-build-debug/DartConfiguration.tcl @@ -27,8 +27,8 @@ SubmitInactivityTimeout: NightlyStartTime: 00:00:00 EDT # Commands for the build/test/submit cycle -ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2" -MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}" +ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2" +MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}" DefaultCTestConfigurationType: Release # version control From 9fd661f7c0df9d62a5cefabe196c7b2e555189af Mon Sep 17 00:00:00 2001 From: cima22 Date: Mon, 4 Dec 2023 12:04:00 +0100 Subject: [PATCH 10/25] Correctly setted attributes on compressed clusters input for decompression --- .../DataCompression/GPUTPCDecompression.cxx | 1 + .../DataCompression/GPUTPCDecompression.h | 3 +- .../GPUTPCDecompressionKernels.cxx | 3 ++ .../Global/GPUChainTrackingCompression.cxx | 33 +++++++++++++++++-- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 760e5174bd827..bfd2cc94ccca8 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -17,6 +17,7 @@ #include "GPUReconstruction.h" #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" +#include "GPULogging.h" using namespace GPUCA_NAMESPACE::gpu; diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 37cc883e1db9c..1df3d352ae1ec 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -45,6 +45,8 @@ class GPUTPCDecompression : public GPUProcessor public: unsigned int test = 42; unsigned int* testP = nullptr; + o2::tpc::CompressedClusters* mInputGPU = nullptr; // TODO: move to protected once test is assessed + #ifndef GPUCA_GPUCODE void InitializeProcessor(); @@ -61,7 +63,6 @@ class GPUTPCDecompression : public GPUProcessor template void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); - o2::tpc::CompressedClusters* mInputGPU = nullptr; short mMemoryResInputGPU = -1; diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index 3179cb6d5650b..cc7557dd6e523 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -28,6 +28,9 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadnAttachedClusters); + GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU->nAttachedClusters,decompressor.mInputGPU->nUnattachedClusters,decompressor.mInputGPU->nTracks); + } } \ No newline at end of file diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 2de1f8d08c649..326531587f75b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -216,16 +216,40 @@ int GPUChainTracking::RunTPCDecompression() GPUTPCDecompression& Decompressor = processors()->tpcDecompressor; GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); - SetupGPUProcessor(&Decompressor, false); + + CompressedClusters c; + const CompressedClusters* p; + CompressedClusters* compressedClustersInputGPU = Decompressor.mInputGPU; + memset((void*)compressedClustersInputGPU, 0, sizeof(*compressedClustersInputGPU)); + c = *mIOPtrs.tpcCompressedClusters; + p = &c; + compressedClustersInputGPU->nAttachedClusters = p->nAttachedClusters; + compressedClustersInputGPU->nUnattachedClusters = p->nUnattachedClusters; + compressedClustersInputGPU->nTracks = p->nTracks; + compressedClustersInputGPU->nAttachedClustersReduced = compressedClustersInputGPU->nAttachedClusters - compressedClustersInputGPU->nTracks; + compressedClustersInputGPU->nSliceRows = NSLICES * GPUCA_ROW_COUNT; + compressedClustersInputGPU->nComppressionModes = param().rec.tpc.compressionTypeMask; + compressedClustersInputGPU->solenoidBz = param().bzkG; + compressedClustersInputGPU->maxTimeBin = param().par.continuousMaxTimeBin; + SetupGPUProcessor(&Decompressor, true); new (Decompressor.testP) unsigned int; *Decompressor.testP = 145; - LOGP(info,"==== mInputGPU pointer host: {} -- dev: {}", (void*)Decompressor.mInputGPU,(void*)DecompressorShadow.mInputGPU); WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); + TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP); + unsigned int offset = 0; + char direction = 0; + int outputStream = 0; + //GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mInputGPU->nSliceRowClusters[0]), outputStream, direction); + // GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction); + /* for (int i = 0; i < compressedClustersInputGPU->nTracks; ++i) { + GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->qTotA + offset, p->qTotA + p.mAttachedClusterFirstIndex[i], compressedClustersInputGPU->nTrackClusters[i] * sizeof(DecompressorShadow.mInputGPU->qTotA[0]), outputStream, direction); + offset += O->nTrackClusters[i]; + }*/ TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { @@ -235,6 +259,9 @@ int GPUChainTracking::RunTPCDecompression() }; auto& gatherTimer = getTimer("TPCDecompression", 0); gatherTimer.Start(); + + LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",p->nAttachedClusters,p->nUnattachedClusters,p->nTracks); + if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) { GPUError("Error decompressing clusters"); return 1; @@ -242,7 +269,7 @@ int GPUChainTracking::RunTPCDecompression() gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); if (mRec->IsGPU()) { - runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); + runKernel({1,1,0}, krnlRunRangeNone, krnlEventNone); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; From 12c85d5ed3c7843ea97c2f73cb8d3b62cf72c611 Mon Sep 17 00:00:00 2001 From: cima22 Date: Mon, 4 Dec 2023 19:14:56 +0100 Subject: [PATCH 11/25] successfully copied nSliceRowClusters from host to gpu through context --- .../DataCompression/GPUTPCDecompression.cxx | 6 ++--- .../DataCompression/GPUTPCDecompression.h | 2 +- .../GPUTPCDecompressionKernels.cxx | 5 ++-- .../Global/GPUChainTrackingCompression.cxx | 24 +++++++++---------- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index bfd2cc94ccca8..24c0290675ad9 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -25,7 +25,7 @@ void GPUTPCDecompression::InitializeProcessor() {} void* GPUTPCDecompression::SetPointersInputGPU(void* mem) { - SetPointersCompressedClusters(mem, *mInputGPU, mInputGPU->nAttachedClusters, mInputGPU->nTracks, mInputGPU->nUnattachedClusters, true); + SetPointersCompressedClusters(mem, mInputGPU, mInputGPU.nAttachedClusters, mInputGPU.nTracks, mInputGPU.nUnattachedClusters, true); return mem; } @@ -67,14 +67,14 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign void* GPUTPCDecompression::SetPointersMemory(void* mem) { - computePointerWithAlignment(mem, mInputGPU); + //computePointerWithAlignment(mem, mInputGPU); computePointerWithAlignment(mem, testP); return mem; } void GPUTPCDecompression::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT, "TPCDecompressionInput"); + mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU, "TPCDecompressionInput"); mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory"); } diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 1df3d352ae1ec..5eb24fb3056f3 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -45,7 +45,7 @@ class GPUTPCDecompression : public GPUProcessor public: unsigned int test = 42; unsigned int* testP = nullptr; - o2::tpc::CompressedClusters* mInputGPU = nullptr; // TODO: move to protected once test is assessed + o2::tpc::CompressedClusters mInputGPU; // TODO: move to protected once test is assessed #ifndef GPUCA_GPUCODE diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index cc7557dd6e523..dcfc2712fb6fd 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -28,9 +28,8 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadnAttachedClusters); - GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU->nAttachedClusters,decompressor.mInputGPU->nUnattachedClusters,decompressor.mInputGPU->nTracks); - + GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU.nAttachedClusters,decompressor.mInputGPU.nUnattachedClusters,decompressor.mInputGPU.nTracks); + GPUInfo("===== nSliceRowClusters[1] on gpu = %d",decompressor.mInputGPU.nSliceRowClusters[1]); } } \ No newline at end of file diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 326531587f75b..c29aa3f1860d9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -219,18 +219,17 @@ int GPUChainTracking::RunTPCDecompression() CompressedClusters c; const CompressedClusters* p; - CompressedClusters* compressedClustersInputGPU = Decompressor.mInputGPU; - memset((void*)compressedClustersInputGPU, 0, sizeof(*compressedClustersInputGPU)); + CompressedClusters& compressedClustersInputGPU = Decompressor.mInputGPU; c = *mIOPtrs.tpcCompressedClusters; p = &c; - compressedClustersInputGPU->nAttachedClusters = p->nAttachedClusters; - compressedClustersInputGPU->nUnattachedClusters = p->nUnattachedClusters; - compressedClustersInputGPU->nTracks = p->nTracks; - compressedClustersInputGPU->nAttachedClustersReduced = compressedClustersInputGPU->nAttachedClusters - compressedClustersInputGPU->nTracks; - compressedClustersInputGPU->nSliceRows = NSLICES * GPUCA_ROW_COUNT; - compressedClustersInputGPU->nComppressionModes = param().rec.tpc.compressionTypeMask; - compressedClustersInputGPU->solenoidBz = param().bzkG; - compressedClustersInputGPU->maxTimeBin = param().par.continuousMaxTimeBin; + compressedClustersInputGPU.nAttachedClusters = p->nAttachedClusters; + compressedClustersInputGPU.nUnattachedClusters = p->nUnattachedClusters; + compressedClustersInputGPU.nTracks = p->nTracks; + compressedClustersInputGPU.nAttachedClustersReduced = compressedClustersInputGPU.nAttachedClusters - compressedClustersInputGPU.nTracks; + compressedClustersInputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT; + compressedClustersInputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask; + compressedClustersInputGPU.solenoidBz = param().bzkG; + compressedClustersInputGPU.maxTimeBin = param().par.continuousMaxTimeBin; SetupGPUProcessor(&Decompressor, true); new (Decompressor.testP) unsigned int; @@ -242,9 +241,10 @@ int GPUChainTracking::RunTPCDecompression() LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP); unsigned int offset = 0; - char direction = 0; + char direction = 1; int outputStream = 0; - //GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mInputGPU->nSliceRowClusters[0]), outputStream, direction); + GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU.nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(p->nSliceRowClusters[0]), outputStream, direction); + LOGP(info,"===== nSliceRowClusters[1] on host = {}",p->nSliceRowClusters[1]); // GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction); /* for (int i = 0; i < compressedClustersInputGPU->nTracks; ++i) { GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->qTotA + offset, p->qTotA + p.mAttachedClusterFirstIndex[i], compressedClustersInputGPU->nTrackClusters[i] * sizeof(DecompressorShadow.mInputGPU->qTotA[0]), outputStream, direction); From bc8b2d0f99e9d8e7270c9049bc5d36e872fa3545 Mon Sep 17 00:00:00 2001 From: cima22 Date: Tue, 5 Dec 2023 15:31:27 +0100 Subject: [PATCH 12/25] input completely copied from host to device (correctness of content has to be checked) --- .../Global/GPUChainTrackingCompression.cxx | 71 +++++++++++++------ 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index c29aa3f1860d9..f0023931916ee 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -217,19 +217,16 @@ int GPUChainTracking::RunTPCDecompression() GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); - CompressedClusters c; - const CompressedClusters* p; - CompressedClusters& compressedClustersInputGPU = Decompressor.mInputGPU; - c = *mIOPtrs.tpcCompressedClusters; - p = &c; - compressedClustersInputGPU.nAttachedClusters = p->nAttachedClusters; - compressedClustersInputGPU.nUnattachedClusters = p->nUnattachedClusters; - compressedClustersInputGPU.nTracks = p->nTracks; - compressedClustersInputGPU.nAttachedClustersReduced = compressedClustersInputGPU.nAttachedClusters - compressedClustersInputGPU.nTracks; - compressedClustersInputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT; - compressedClustersInputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask; - compressedClustersInputGPU.solenoidBz = param().bzkG; - compressedClustersInputGPU.maxTimeBin = param().par.continuousMaxTimeBin; + CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters; + CompressedClusters& inputGPU = Decompressor.mInputGPU; + inputGPU.nAttachedClusters = cmprClsHost.nAttachedClusters; + inputGPU.nUnattachedClusters = cmprClsHost.nUnattachedClusters; + inputGPU.nTracks = cmprClsHost.nTracks; + inputGPU.nAttachedClustersReduced = inputGPU.nAttachedClusters - inputGPU.nTracks; + inputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT; + inputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask; + inputGPU.solenoidBz = param().bzkG; + inputGPU.maxTimeBin = param().par.continuousMaxTimeBin; SetupGPUProcessor(&Decompressor, true); new (Decompressor.testP) unsigned int; @@ -238,18 +235,48 @@ int GPUChainTracking::RunTPCDecompression() WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); - + LOGP(info,"==== nSliceRowClusters[1] on host: {}",cmprClsHost.nSliceRowClusters[1]); LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP); unsigned int offset = 0; char direction = 1; int outputStream = 0; - GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU.nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(p->nSliceRowClusters[0]), outputStream, direction); - LOGP(info,"===== nSliceRowClusters[1] on host = {}",p->nSliceRowClusters[1]); - // GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction); - /* for (int i = 0; i < compressedClustersInputGPU->nTracks; ++i) { - GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->qTotA + offset, p->qTotA + p.mAttachedClusterFirstIndex[i], compressedClustersInputGPU->nTrackClusters[i] * sizeof(DecompressorShadow.mInputGPU->qTotA[0]), outputStream, direction); - offset += O->nTrackClusters[i]; - }*/ + CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction); + SynchronizeStream(outputStream); + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU + offset, cmprClsHost.qMaxU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qMaxU[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsU + offset, cmprClsHost.flagsU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.flagsU[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU + offset, cmprClsHost.padDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.padDiffU[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU + offset, cmprClsHost.timeDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.timeDiffU[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU + offset, cmprClsHost.sigmaPadU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU + offset, cmprClsHost.sigmaTimeU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, direction); + offset += cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; + } + } + + offset = 0; + for (unsigned int i = 0; i < inputGPUShadow.nTracks; i++) { + GPUMemCpyAlways(myStep, inputGPUShadow.qTotA + offset, cmprClsHost.qTotA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qTotA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA + offset, cmprClsHost.qMaxA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qMaxA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsA + offset, cmprClsHost.flagsA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.flagsA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA + offset, cmprClsHost.sigmaPadA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA + offset, cmprClsHost.sigmaTimeA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, direction); + + // First index stored with track + GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA + offset - i, cmprClsHost.rowDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA + offset - i, cmprClsHost.sliceLegDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.padResA + offset - i, cmprClsHost.padResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.padResA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.timeResA + offset - i, cmprClsHost.timeResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.timeResA[0]), outputStream, direction); + offset += cmprClsHost.nTrackClusters[i]; + } + GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, direction); TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { @@ -260,7 +287,7 @@ int GPUChainTracking::RunTPCDecompression() auto& gatherTimer = getTimer("TPCDecompression", 0); gatherTimer.Start(); - LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",p->nAttachedClusters,p->nUnattachedClusters,p->nTracks); + LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",cmprClsHost.nAttachedClusters,cmprClsHost.nUnattachedClusters,cmprClsHost.nTracks); if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) { GPUError("Error decompressing clusters"); From ac02a0bbdcc2d8509c2c60d8373c4aec59553819 Mon Sep 17 00:00:00 2001 From: cima22 Date: Tue, 5 Dec 2023 17:12:58 +0100 Subject: [PATCH 13/25] addedo MEMORY::CUSTOM to GPUInput and single GPUMemCpy for whole input --- .../DataCompression/GPUTPCDecompression.cxx | 10 +--------- .../DataCompression/GPUTPCDecompression.h | 13 +++---------- .../GPUTPCDecompressionKernels.cxx | 2 -- .../Global/GPUChainTrackingCompression.cxx | 19 +++++++++---------- 4 files changed, 13 insertions(+), 31 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 24c0290675ad9..03595069a52f0 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -65,17 +65,9 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign computePointerWithAlignment(mem, c.nTrackClusters, nTr); } -void* GPUTPCDecompression::SetPointersMemory(void* mem) -{ - //computePointerWithAlignment(mem, mInputGPU); - computePointerWithAlignment(mem, testP); - return mem; -} - void GPUTPCDecompression::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU, "TPCDecompressionInput"); - mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory"); + mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput"); } void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 5eb24fb3056f3..42bbb6390f5db 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -39,15 +39,10 @@ namespace GPUCA_NAMESPACE::gpu class GPUTPCDecompression : public GPUProcessor { - friend class GPUTPCDecmpressionKernels; + friend class GPUTPCDecompressionKernels; friend class GPUChainTracking; public: - unsigned int test = 42; - unsigned int* testP = nullptr; - o2::tpc::CompressedClusters mInputGPU; // TODO: move to protected once test is assessed - - #ifndef GPUCA_GPUCODE void InitializeProcessor(); void RegisterMemoryAllocation(); @@ -59,13 +54,11 @@ class GPUTPCDecompression : public GPUProcessor protected: constexpr static unsigned int NSLICES = GPUCA_NSLICES; - + o2::tpc::CompressedClusters mInputGPU; template void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); - short mMemoryResInputGPU = -1; - }; -} +} // namespace GPUCA_NAMESPACE::gpu #endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index dcfc2712fb6fd..c49e62c34b581 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -25,9 +25,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread= 2) { if (ProcessingSettings().tpcCompressionGatherMode == 2) { void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat); @@ -229,21 +228,21 @@ int GPUChainTracking::RunTPCDecompression() inputGPU.maxTimeBin = param().par.continuousMaxTimeBin; SetupGPUProcessor(&Decompressor, true); - new (Decompressor.testP) unsigned int; - *Decompressor.testP = 145; - + size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU); WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); - TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); + LOGP(info,"==== nSliceRowClusters[1] on host: {}",cmprClsHost.nSliceRowClusters[1]); - LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP); + unsigned int offset = 0; char direction = 1; int outputStream = 0; CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; - GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction); - SynchronizeStream(outputStream); + char* deviceFlatPts = (char*)inputGPUShadow.qTotU; + GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); + //GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction); + //GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction); + SynchronizeStream(outputStream);/* for (unsigned int i = 0; i < NSLICES; i++) { for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction); @@ -277,7 +276,7 @@ int GPUChainTracking::RunTPCDecompression() GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, direction); GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, direction); GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, direction); - +*/ TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; From 47e7e3430ceabf49fd33cfa8daaceb3cd827e80d Mon Sep 17 00:00:00 2001 From: cima22 Date: Tue, 5 Dec 2023 17:46:15 +0100 Subject: [PATCH 14/25] refactoring and tested correctness of GPUMemCpy from host to device for decompression input --- .../DataCompression/GPUTPCDecompressionKernels.cxx | 1 - .../Global/GPUChainTrackingCompression.cxx | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index c49e62c34b581..65c6fc0e6587c 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -27,7 +27,6 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadtpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); - LOGP(info,"==== nSliceRowClusters[1] on host: {}",cmprClsHost.nSliceRowClusters[1]); - - unsigned int offset = 0; - char direction = 1; int outputStream = 0; CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; char* deviceFlatPts = (char*)inputGPUShadow.qTotU; GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); - //GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction); - //GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction); - SynchronizeStream(outputStream);/* + SynchronizeStream(outputStream); + /* + unsigned int offset = 0; + char direction = 1; + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction); + for (unsigned int i = 0; i < NSLICES; i++) { for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction); From a62865dbe89e1bdbe0a343f2567bab13703f7e7b Mon Sep 17 00:00:00 2001 From: cima22 Date: Thu, 7 Dec 2023 18:02:42 +0100 Subject: [PATCH 15/25] allocated output buffers for host and device --- GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 60e757e66683e..de7836e7fde0a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -237,6 +237,14 @@ int GPUChainTracking::RunTPCDecompression() char* deviceFlatPts = (char*)inputGPUShadow.qTotU; GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); SynchronizeStream(outputStream); + + mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; + //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; + WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); + + /* unsigned int offset = 0; char direction = 1; From 60a53c5d390419a18a8bd9501b488907ccb8e3e8 Mon Sep 17 00:00:00 2001 From: cima22 Date: Mon, 11 Dec 2023 10:03:40 +0100 Subject: [PATCH 16/25] Decoding input from host to device made per internal buffer --- .../Global/GPUChainTrackingCompression.cxx | 75 ++++++++----------- .../Standalone/Benchmark/standalone.cxx | 1 + 2 files changed, 33 insertions(+), 43 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index de7836e7fde0a..10f8565992815 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -234,56 +234,45 @@ int GPUChainTracking::RunTPCDecompression() int outputStream = 0; CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; - char* deviceFlatPts = (char*)inputGPUShadow.qTotU; - GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); + //char* deviceFlatPts = (char*)inputGPUShadow.qTotU; + //GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); SynchronizeStream(outputStream); - mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; - //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); - AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); - processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; - WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); + unsigned int offset = 0; + bool toGPU = true; + GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsU, cmprClsHost.flagsU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.flagsU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU, cmprClsHost.padDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.padDiffU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU, cmprClsHost.timeDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.timeDiffU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU); - /* - unsigned int offset = 0; - char direction = 1; - GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction); + GPUMemCpyAlways(myStep, inputGPUShadow.qTotA, cmprClsHost.qTotA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qTotA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA, cmprClsHost.qMaxA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qMaxA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsA, cmprClsHost.flagsA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.flagsA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA, cmprClsHost.rowDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA, cmprClsHost.sliceLegDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padResA, cmprClsHost.padResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.padResA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeResA, cmprClsHost.timeResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.timeResA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA, cmprClsHost.sigmaPadA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA, cmprClsHost.sigmaTimeA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, toGPU); - for (unsigned int i = 0; i < NSLICES; i++) { - for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { - GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU + offset, cmprClsHost.qMaxU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qMaxU[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.flagsU + offset, cmprClsHost.flagsU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.flagsU[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU + offset, cmprClsHost.padDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.padDiffU[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU + offset, cmprClsHost.timeDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.timeDiffU[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU + offset, cmprClsHost.sigmaPadU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU + offset, cmprClsHost.sigmaTimeU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, direction); - offset += cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; - } - } + GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU); - offset = 0; - for (unsigned int i = 0; i < inputGPUShadow.nTracks; i++) { - GPUMemCpyAlways(myStep, inputGPUShadow.qTotA + offset, cmprClsHost.qTotA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qTotA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA + offset, cmprClsHost.qMaxA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qMaxA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.flagsA + offset, cmprClsHost.flagsA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.flagsA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA + offset, cmprClsHost.sigmaPadA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA + offset, cmprClsHost.sigmaTimeA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, direction); - // First index stored with track - GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA + offset - i, cmprClsHost.rowDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA + offset - i, cmprClsHost.sliceLegDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.padResA + offset - i, cmprClsHost.padResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.padResA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.timeResA + offset - i, cmprClsHost.timeResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.timeResA[0]), outputStream, direction); - offset += cmprClsHost.nTrackClusters[i]; - } - GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, direction); - GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, direction); + mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; + /* + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; + WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); */ TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 8df8094fe44d3..18fb7e9aeacea 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -948,6 +948,7 @@ int main(int argc, char** argv) printf("Error unregistering memory\n"); } } + exit(0); rec->Exit(); if (!configStandalone.noprompt) { From 636af961502e34773c033813f222241b55780800 Mon Sep 17 00:00:00 2001 From: cima22 Date: Mon, 18 Dec 2023 10:16:20 +0100 Subject: [PATCH 17/25] Created tmp buffers to store native clusters per row and per slice --- GPU/GPUTracking/CMakeLists.txt | 2 +- .../DataCompression/GPUTPCDecompression.cxx | 24 ++++++++++++++++++- .../DataCompression/GPUTPCDecompression.h | 18 +++++++++++++- .../GPUTPCDecompressionKernels.cxx | 22 +++++++++++++++-- .../Global/GPUChainTrackingCompression.cxx | 7 +++--- .../Standalone/Benchmark/standalone.cxx | 2 +- 6 files changed, 66 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index db2d3f7c0accd..18a187bf5f64d 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -11,7 +11,7 @@ set(MODULE GPUTracking) -# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -O0") # to uncomment if needed, tired of typing this... +set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... include(cmake/helpers.cmake) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 03595069a52f0..22efcea7f774d 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -65,11 +65,33 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign computePointerWithAlignment(mem, c.nTrackClusters, nTr); } +void* GPUTPCDecompression::SetPointersTmpNativeBuffers(void* mem){ + computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT); + computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer); + //computePointerWithAlignment(mem,tmpBuffer,NSLICES * GPUCA_ROW_COUNT); + return mem; +} + void GPUTPCDecompression::RegisterMemoryAllocation() { AllocateAndInitializeLate(); mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput"); + mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffers,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffers"); } void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ + mMaxNativeClustersPerBuffer = 81760; +} +/* +GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer(): +mCmprClsBuffer{new o2::tpc::ClusterNative[mCapacity]},mIndex{0} +{} -} \ No newline at end of file +void GPUTPCDecompression::ConcurrentClusterNativeBuffer::push_back(tpc::ClusterNative cluster) +{ + if(mIndex == mCapacity){ + //reallocate? + return; + } + unsigned int current = CAMath::AtomicAdd(mIndex, 1u); + mTmpNativeClusters[current] = cluster; +}*/ \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 42bbb6390f5db..1ec3638429629 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -19,6 +19,7 @@ #include "GPUProcessor.h" #include "GPUCommonMath.h" #include "GPUParam.h" +#include "GPUO2DataTypes.h" #ifdef GPUCA_HAVE_O2HEADERS #include "DataFormatsTPC/CompressedClusters.h" @@ -49,16 +50,31 @@ class GPUTPCDecompression : public GPUProcessor void SetMaxData(const GPUTrackingInOutPointers& io); void* SetPointersInputGPU(void* mem); - void* SetPointersMemory(void* mem); + void* SetPointersTmpNativeBuffers(void* mem); #endif protected: constexpr static unsigned int NSLICES = GPUCA_NSLICES; o2::tpc::CompressedClusters mInputGPU; + unsigned int mMaxNativeClustersPerBuffer; + unsigned int* mNativeClustersIndex; + o2::tpc::ClusterNative* mTmpNativeClusters; +/* class ConcurrentClusterNativeBuffer{ + size_t mIndex; + size_t mCapacity = 10; + o2::tpc::ClusterNative* mCmprClsBuffer; + public: + ConcurrentClusterNativeBuffer(); + void push_back(ClusterNative cluster); + };*/ +// ConcurrentClusterNativeBuffer* tmpBuffer; + template void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); short mMemoryResInputGPU = -1; + + }; } // namespace GPUCA_NAMESPACE::gpu #endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index 65c6fc0e6587c..bc3426fed134c 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -23,10 +23,28 @@ using namespace o2::tpc; template <> GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors) { - GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor; GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + const GPUParam& GPUrestrict() param = processors.param; + + unsigned int offset = 0, lasti = 0; + const unsigned int maxTime = (param.par.continuousMaxTimeBin + 1) * ClusterNative::scaleTimePacked - 1; + + for (unsigned int i = get_global_id(0); i < cmprClusters.nTracks; i += get_global_size(0)) { + while (lasti < i) { + offset += cmprClusters.nTrackClusters[lasti++]; + } + lasti++; + //decompressTrack(clustersCompressed, param, maxTime, i, offset, clusters, locks); + + } if (!iThread && !iBlock) { - GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU.nAttachedClusters,decompressor.mInputGPU.nUnattachedClusters,decompressor.mInputGPU.nTracks); + GPUInfo("==== on GPU nAttCl = %d, nUnAttCl = %d, nTracks = %d",cmprClusters.nAttachedClusters,cmprClusters.nUnattachedClusters,cmprClusters.nTracks); + GPUInfo("=== sizeof(CluserNative) = %lu", sizeof(ClusterNative)); + /*int * test = new int[10]; + test[0] = 1; + GPUInfo("==== got it %p -- %d",(void*)test,test[0]); + delete[] test;*/ } } \ No newline at end of file diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 10f8565992815..4206ea8bbf5f7 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -233,13 +233,12 @@ int GPUChainTracking::RunTPCDecompression() TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); int outputStream = 0; + bool toGPU = true; CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; //char* deviceFlatPts = (char*)inputGPUShadow.qTotU; //GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); SynchronizeStream(outputStream); - unsigned int offset = 0; - bool toGPU = true; GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU); GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU); @@ -266,6 +265,8 @@ int GPUChainTracking::RunTPCDecompression() GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, toGPU); GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU); + runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0])); + mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; /* @@ -292,7 +293,7 @@ int GPUChainTracking::RunTPCDecompression() gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); if (mRec->IsGPU()) { - runKernel({1,1,0}, krnlRunRangeNone, krnlEventNone); + runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 18fb7e9aeacea..04bf75f39bab3 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -948,7 +948,7 @@ int main(int argc, char** argv) printf("Error unregistering memory\n"); } } - exit(0); + //exit(0); rec->Exit(); if (!configStandalone.noprompt) { From 2d82384f7ffdcbeb15cd56cbbfe553a8c65b634a Mon Sep 17 00:00:00 2001 From: cima22 Date: Wed, 3 Jan 2024 22:34:43 +0100 Subject: [PATCH 18/25] Decoding of attached clusters on GPU --- .../DataCompression/GPUTPCDecompression.cxx | 22 +++- .../DataCompression/GPUTPCDecompression.h | 11 +- .../GPUTPCDecompressionKernels.cxx | 118 ++++++++++++++++-- .../GPUTPCDecompressionKernels.h | 23 +++- .../TPCClusterDecompressor.cxx | 6 + .../Definitions/GPUDefGPUParameters.h | 12 +- .../Global/GPUChainTrackingCompression.cxx | 47 +++++-- 7 files changed, 203 insertions(+), 36 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 22efcea7f774d..922bfc120d136 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -65,21 +65,33 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign computePointerWithAlignment(mem, c.nTrackClusters, nTr); } -void* GPUTPCDecompression::SetPointersTmpNativeBuffers(void* mem){ - computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT); +void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){ computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer); - //computePointerWithAlignment(mem,tmpBuffer,NSLICES * GPUCA_ROW_COUNT); + //computePointerWithAlignment(mem,mClusterNativeAccess); + return mem; +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem){ + computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT); + return mem; +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem){ + computePointerWithAlignment(mem,mUnattachedClustersOffsets,NSLICES * GPUCA_ROW_COUNT); return mem; } void GPUTPCDecompression::RegisterMemoryAllocation() { AllocateAndInitializeLate(); mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput"); - mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffers,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffers"); + mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersGPU,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffersGPU"); + mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersOutput,GPUMemoryResource::MEMORY_OUTPUT,"TPCDecompressionTmpBuffersOutput"); + mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersInput,GPUMemoryResource::MEMORY_INPUT,"TPCDecompressionTmpBuffersInput"); } void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ - mMaxNativeClustersPerBuffer = 81760; + //mMaxNativeClustersPerBuffer = 81760; + mMaxNativeClustersPerBuffer = 12000; } /* GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer(): diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 1ec3638429629..6d3225088ed56 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -50,15 +50,21 @@ class GPUTPCDecompression : public GPUProcessor void SetMaxData(const GPUTrackingInOutPointers& io); void* SetPointersInputGPU(void* mem); - void* SetPointersTmpNativeBuffers(void* mem); + void* SetPointersTmpNativeBuffersGPU(void* mem); + void* SetPointersTmpNativeBuffersOutput(void* mem); + void* SetPointersTmpNativeBuffersInput(void* mem); + #endif protected: constexpr static unsigned int NSLICES = GPUCA_NSLICES; o2::tpc::CompressedClusters mInputGPU; + unsigned int mMaxNativeClustersPerBuffer; unsigned int* mNativeClustersIndex; + unsigned int* mUnattachedClustersOffsets; o2::tpc::ClusterNative* mTmpNativeClusters; + o2::tpc::ClusterNativeAccess* mClusterNativeAccess; /* class ConcurrentClusterNativeBuffer{ size_t mIndex; size_t mCapacity = 10; @@ -73,7 +79,8 @@ class GPUTPCDecompression : public GPUProcessor void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); short mMemoryResInputGPU = -1; - + short mResourceTmpIndexes = -1; + short mResourceTmpClustersOffsets = -1; }; } // namespace GPUCA_NAMESPACE::gpu diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index bc3426fed134c..6867286d25e64 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -15,14 +15,13 @@ #include "GPUTPCDecompressionKernels.h" #include "GPULogging.h" #include "GPUConstantMem.h" - +#include "GPUTPCCompressionTrackModel.h" using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; template <> -GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors) -{ +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; const GPUParam& GPUrestrict() param = processors.param; @@ -35,16 +34,109 @@ GPUdii() void GPUTPCDecompressionKernels::Thread= GPUCA_NSLICES); + if(changeLeg){ + tmpSlice -= GPUCA_NSLICES; + } + if(cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences){ + slice += tmpSlice; + if(slice >= GPUCA_NSLICES){ + slice -= GPUCA_NSLICES; + } + row += cmprClusters.rowDiffA[clusterOffset -trackIndex -1]; + if(row >= GPUCA_ROW_COUNT){ + row -= GPUCA_ROW_COUNT; + } + } else { + slice = tmpSlice; + row = cmprClusters.rowDiffA[clusterOffset -trackIndex -1]; + } + if (changeLeg && track.Mirror()) { + break; + } + if (track.Propagate(param.tpcGeometry.Row2X(row),param.SliceParam[slice].Alpha)){ + break; + } + unsigned int timeTmp = cmprClusters.timeResA[clusterOffset -trackIndex -1]; + if (timeTmp & 800000) { + timeTmp |= 0xFF000000; + } + time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f,param.tpcGeometry.LinearZ2Time(slice,track.Z() + zOffset))); + float tmpPad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), param.tpcGeometry.LinearY2Pad(slice, row, track.Y()))); + pad = cmprClusters.padResA[clusterOffset -trackIndex - 1] + ClusterNative::packPad(tmpPad); + time = time & 0xFFFFFF; + pad = (unsigned short)pad; + if (pad >= param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked) { + if (pad >= 0xFFFF - 11968) { // Constant 11968 = (2^15 - MAX_PADS(138) * scalePadPacked(64)) / 2 + pad = 0; + } else { + pad = param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked - 1; + } + } + if (param.par.continuousMaxTimeBin > 0 && time >= maxTime) { + if (time >= 0xFFFFFF - 544768) { // Constant 544768 = (2^23 - LHCMAXBUNCHES(3564) * MAXORBITS(256) * scaleTimePacked(64) / BCPERTIMEBIN(8)) / 2) + time = 0; + } else { + time = maxTime; + } + } + } else { + time = cmprClusters.timeA[trackIndex]; + pad = cmprClusters.padA[trackIndex]; + } + bool stored; + const auto cluster = decompressTrackStore(cmprClusters, clusterOffset, slice, row, pad, time, decompressor, stored); + float y = param.tpcGeometry.LinearPad2Y(slice, row, cluster.getPad()); + float z = param.tpcGeometry.LinearTime2Z(slice, cluster.getTime()); + if(clusterIndex == 0){ + zOffset = z; + track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex],param); + } + if(clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y,z-zOffset,row)){ + break; + } + clusterOffset++; } + clusterOffset += cmprClusters.nTrackClusters[trackIndex] - clusterIndex; +} -} \ No newline at end of file +GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored){ + unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); + unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row),1u); + const ClusterNative c(time, cmprClusters.flagsA[clusterOffset], pad, cmprClusters.sigmaTimeA[clusterOffset], cmprClusters.sigmaPadA[clusterOffset], cmprClusters.qMaxA[clusterOffset], cmprClusters.qTotA[clusterOffset]); + stored = currentClusterIndex < decompressor.mMaxNativeClustersPerBuffer; + if(stored){ + decompressor.mTmpNativeClusters[tmpBufferIndex + currentClusterIndex] = c; + } + return c; +} +/* +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ + for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + o2::tpc::ClusterNativeAccess& clustersNative = *decompressor.mClusterNativeAccess; + unsigned int slice = i / GPUCA_ROW_COUNT; + unsigned int row = i % GPUCA_ROW_COUNT; + unsigned int unattachedOffset = (i >= cmprClusters.nSliceRows) ? 0 : cmprClusters.nSliceRowClusters[i]; + (clustersNative.nClusters)[slice][row] = decompressor.mNativeClustersIndex[i] + unattachedOffset; + for(unsigned int j = i+1; j < GPUCA_NSLICES * GPUCA_ROW_COUNT; j++){ + CAMath::AtomicAdd(decompressor.mUnattachedClustersOffsets + j,unattachedOffset); + } + } +}*/ diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h index 6be320026866a..3fd235390c6a7 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -16,6 +16,19 @@ #define GPUTPCDECOMPRESSIONKERNELS_H #include "GPUGeneralKernels.h" +#include "GPUO2DataTypes.h" +#include "GPUParam.h" +#include "GPUConstantMem.h" + +#ifdef GPUCA_HAVE_O2HEADERS +#include "DataFormatsTPC/CompressedClusters.h" +#else +namespace o2::tpc +{ +struct CompressedClusters { +}; +} // namespace o2::tpc +#endif namespace GPUCA_NAMESPACE::gpu { @@ -26,11 +39,19 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCDecompression; } enum K : int { - test = 0, + step0attached = 0, + step1unattached = 1, + prepareAccess = 2 }; template GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors); + GPUd() static void decompressTrack(o2::tpc::CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor); + GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored); + + GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){ + return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer; + } }; } diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index b8f491a6f5767..bd6714433161a 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -77,6 +77,12 @@ int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompres decodedAttachedClusters += clusters[i][j].size(); } } + unsigned int avgAttCl = 0; + for(unsigned int i = 0; i < NSLICES; i++) + for (unsigned int j=0; j < GPUCA_ROW_COUNT; j++) + avgAttCl += clusters[i][j].size(); + avgAttCl/=(NSLICES*GPUCA_ROW_COUNT); + LOGP(info,"===== avg on Host: {}",avgAttCl); if (decodedAttachedClusters != clustersCompressed->nAttachedClusters) { GPUWarning("%u / %u clusters failed track model decoding (%f %%)", clustersCompressed->nAttachedClusters - decodedAttachedClusters, clustersCompressed->nAttachedClusters, 100.f * (float)(clustersCompressed->nAttachedClusters - decodedAttachedClusters) / (float)clustersCompressed->nAttachedClusters); } diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index d88fc055099fb..b3cc4d8dfe29f 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -71,7 +71,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 - #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -136,7 +136,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 - #define GPUCA_LB_GPUTPCDecompressionKernels_test 192, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 192, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -200,7 +200,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 - #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448 @@ -265,7 +265,7 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128 - #define GPUCA_LB_GPUTPCDecompressionKernels_test 128 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 @@ -326,8 +326,8 @@ #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256 #endif - #ifndef GPUCA_LB_GPUTPCDecompressionKernels_test - #define GPUCA_LB_GPUTPCDecompressionKernels_test 256 + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 256 #endif #ifndef GPUCA_LB_GPUTPCCFDecodeZS #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4 diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 4206ea8bbf5f7..1198d9696f417 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -230,15 +230,15 @@ int GPUChainTracking::RunTPCDecompression() size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU); WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); - TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); + //TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); int outputStream = 0; bool toGPU = true; CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; - //char* deviceFlatPts = (char*)inputGPUShadow.qTotU; - //GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true); SynchronizeStream(outputStream); + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, cmprClsHost.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, toGPU); GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU); GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU); @@ -266,19 +266,50 @@ int GPUChainTracking::RunTPCDecompression() GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU); runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0])); - + runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; - /* + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, 0); + LOGP(info,"==== Host {} -- Shadow {}",(void*)Decompressor.mNativeClustersIndex,(void*)DecompressorShadow.mNativeClustersIndex); + SynchronizeStream(0); + + unsigned int offset = 0; + unsigned int decodedAttachedClusters = 0; + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + unsigned int linearIndex = i * GPUCA_ROW_COUNT + j; + unsigned int unattachedOffset = (linearIndex >= cmprClsHost.nSliceRows) ? 0 : cmprClsHost.nSliceRowClusters[linearIndex]; + (mClusterNativeAccess->nClusters)[i][j] = Decompressor.mNativeClustersIndex[linearIndex] + unattachedOffset; + Decompressor.mUnattachedClustersOffsets[linearIndex] = offset; + offset += unattachedOffset; + decodedAttachedClusters += Decompressor.mNativeClustersIndex[linearIndex]; + } + } + LOGP(info,"decoded = {}",decodedAttachedClusters); + if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) { + GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters); + }/* + mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer; + mClusterNativeAccess->setOffsetPtrs(); + mIOPtrs.clustersNative = mClusterNativeAccess.get(); + *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); + TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); + mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput; + mClusterNativeAccess->setOffsetPtrs(); + *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; + processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess; */ + + TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; - this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); + //this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); return this->mInputsHost->mPclusterNativeOutput; }; auto& gatherTimer = getTimer("TPCDecompression", 0); @@ -293,9 +324,7 @@ int GPUChainTracking::RunTPCDecompression() gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); if (mRec->IsGPU()) { - runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); - - AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; From eee0fa513787d51dadf1ea3f24269697bd77c4b6 Mon Sep 17 00:00:00 2001 From: cima22 Date: Thu, 4 Jan 2024 12:28:34 +0100 Subject: [PATCH 19/25] removed debugging info in CPU decompressor --- .../DataCompression/TPCClusterDecompressor.cxx | 6 ------ GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx | 8 +++++--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index bd6714433161a..b8f491a6f5767 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -77,12 +77,6 @@ int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompres decodedAttachedClusters += clusters[i][j].size(); } } - unsigned int avgAttCl = 0; - for(unsigned int i = 0; i < NSLICES; i++) - for (unsigned int j=0; j < GPUCA_ROW_COUNT; j++) - avgAttCl += clusters[i][j].size(); - avgAttCl/=(NSLICES*GPUCA_ROW_COUNT); - LOGP(info,"===== avg on Host: {}",avgAttCl); if (decodedAttachedClusters != clustersCompressed->nAttachedClusters) { GPUWarning("%u / %u clusters failed track model decoding (%f %%)", clustersCompressed->nAttachedClusters - decodedAttachedClusters, clustersCompressed->nAttachedClusters, 100.f * (float)(clustersCompressed->nAttachedClusters - decodedAttachedClusters) / (float)clustersCompressed->nAttachedClusters); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 1198d9696f417..e3bc029e9a469 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -273,7 +273,6 @@ int GPUChainTracking::RunTPCDecompression() AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, 0); - LOGP(info,"==== Host {} -- Shadow {}",(void*)Decompressor.mNativeClustersIndex,(void*)DecompressorShadow.mNativeClustersIndex); SynchronizeStream(0); unsigned int offset = 0; @@ -291,9 +290,12 @@ int GPUChainTracking::RunTPCDecompression() LOGP(info,"decoded = {}",decodedAttachedClusters); if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) { GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters); - }/* + } else { + GPUInfo("My version: all attached clusters have been decoded"); + } + mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer; - mClusterNativeAccess->setOffsetPtrs(); + mClusterNativeAccess->setOffsetPtrs();/* mIOPtrs.clustersNative = mClusterNativeAccess.get(); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; From 5b48d25bc82dab3ef992e63ff129e56e62af14c6 Mon Sep 17 00:00:00 2001 From: cima22 Date: Fri, 5 Jan 2024 10:28:16 +0100 Subject: [PATCH 20/25] kernel for unattached clusters (not working) --- .../GPUTPCDecompressionKernels.cxx | 34 +++++++++++++++++++ .../Definitions/GPUDefGPUParameters.h | 7 ++++ .../Global/GPUChainTrackingCompression.cxx | 5 +-- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index 6867286d25e64..cd68af30f844c 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -124,6 +124,40 @@ GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2 } return c; } + +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + const ClusterNative* GPUrestrict() clusterBuffer = processors.ioPtrs.clustersNative->clustersLinear; + unsigned int* offsets = decompressor.mUnattachedClustersOffsets; + for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ + unsigned int slice = i / GPUCA_ROW_COUNT; + unsigned int row = i % GPUCA_ROW_COUNT; + unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); + const ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row]; + if (decompressor.mNativeClustersIndex[i] != 0) { + memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0])); + } + const ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i]; + unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]); + //decompressHits(clustersCompressed, offsets[i][j], end, clout); + if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) { + for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) { + auto& cl = buffer[k]; + float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins; + if (t < 0) { + t = 0; + } + if (processors.param.par.continuousMaxTimeBin > 0 && t > processors.param.par.continuousMaxTimeBin) { + t = processors.param.par.continuousMaxTimeBin; + } + cl.setTime(t); + } + } + //std::sort(buffer, buffer + clustersNative.nClusters[i][j]); + } + +} /* template <> GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index b3cc4d8dfe29f..395bdec043331 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -72,6 +72,7 @@ #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -137,6 +138,7 @@ #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 192, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -202,6 +204,7 @@ #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 3 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448 @@ -267,6 +270,7 @@ #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128 #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -329,6 +333,9 @@ #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 256 #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step1unattached + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 256 + #endif #ifndef GPUCA_LB_GPUTPCCFDecodeZS #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index e3bc029e9a469..4cf58e390b332 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -288,6 +288,7 @@ int GPUChainTracking::RunTPCDecompression() } } LOGP(info,"decoded = {}",decodedAttachedClusters); + TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression,Decompressor.mResourceTmpClustersOffsets,0); if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) { GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters); } else { @@ -295,7 +296,7 @@ int GPUChainTracking::RunTPCDecompression() } mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer; - mClusterNativeAccess->setOffsetPtrs();/* + mClusterNativeAccess->setOffsetPtrs(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; @@ -305,8 +306,8 @@ int GPUChainTracking::RunTPCDecompression() mClusterNativeAccess->setOffsetPtrs(); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess; -*/ + runKernel({1,1,0}, krnlRunRangeNone, krnlEventNone); TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { From 7634c61af6f9be7e17937eaa24e23c954aec94a3 Mon Sep 17 00:00:00 2001 From: cima22 Date: Tue, 9 Jan 2024 17:23:08 +0100 Subject: [PATCH 21/25] Added sorting algorithm --- .../DataCompression/GPUTPCDecompression.h | 1 + .../GPUTPCDecompressionKernels.cxx | 32 ++++++++++++++++--- .../GPUTPCDecompressionKernels.h | 1 + .../Global/GPUChainTrackingCompression.cxx | 13 +++++++- 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index 6d3225088ed56..e4b2cca65c007 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -65,6 +65,7 @@ class GPUTPCDecompression : public GPUProcessor unsigned int* mUnattachedClustersOffsets; o2::tpc::ClusterNative* mTmpNativeClusters; o2::tpc::ClusterNativeAccess* mClusterNativeAccess; + o2::tpc::ClusterNative* mNativeClustersBuffer; /* class ConcurrentClusterNativeBuffer{ size_t mIndex; size_t mCapacity = 10; diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index cd68af30f844c..033a38391dec7 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -16,6 +16,7 @@ #include "GPULogging.h" #include "GPUConstantMem.h" #include "GPUTPCCompressionTrackModel.h" +#include "GPUCommonAlgorithm.h" using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; @@ -128,19 +129,20 @@ GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2 template <> GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; - const ClusterNative* GPUrestrict() clusterBuffer = processors.ioPtrs.clustersNative->clustersLinear; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer; unsigned int* offsets = decompressor.mUnattachedClustersOffsets; for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ unsigned int slice = i / GPUCA_ROW_COUNT; unsigned int row = i % GPUCA_ROW_COUNT; unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); - const ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row]; + ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row]; if (decompressor.mNativeClustersIndex[i] != 0) { memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0])); } - const ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i]; + ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i]; unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]); - //decompressHits(clustersCompressed, offsets[i][j], end, clout); + decompressHits(cmprClusters, offsets[i], end, clout); if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) { for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) { auto& cl = buffer[k]; @@ -154,10 +156,30 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadnClusters[slice][row]); } } + +GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer){ + unsigned int time = 0; + unsigned short pad = 0; + for (unsigned int k = start; k < end; k++) { + if (cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences) { + unsigned int timeTmp = cmprClusters.timeDiffU[k]; + if (timeTmp & 800000) { + timeTmp |= 0xFF000000; + } + time += timeTmp; + pad += cmprClusters.padDiffU[k]; + } else { + time = cmprClusters.timeDiffU[k]; + pad = cmprClusters.padDiffU[k]; + } + *(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]); + } +} + /* template <> GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h index 3fd235390c6a7..e70f7486bebbf 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -48,6 +48,7 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors); GPUd() static void decompressTrack(o2::tpc::CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor); GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored); + GPUdi() static void decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, o2::tpc::ClusterNative* clusterNativeBuffer); GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){ return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 4cf58e390b332..f924602baaa5d 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -295,19 +295,26 @@ int GPUChainTracking::RunTPCDecompression() GPUInfo("My version: all attached clusters have been decoded"); } + Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput; + DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer; mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer; mClusterNativeAccess->setOffsetPtrs(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput; mClusterNativeAccess->setOffsetPtrs(); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess; - runKernel({1,1,0}, krnlRunRangeNone, krnlEventNone); + runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone); + + ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative]; + //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); + GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { @@ -337,6 +344,10 @@ int GPUChainTracking::RunTPCDecompression() TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); SynchronizeStream(0); } + + LOGP(info,"==== My version/cpu version: {}/{}", tmpBuffer[0].qTot,mInputsHost->mPclusterNativeOutput[0].qTot); + + delete[] tmpBuffer; // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); //#endif return 0; From 19cbc626dafaeb9df5307bec7aff6593ad2c4a81 Mon Sep 17 00:00:00 2001 From: cima22 Date: Thu, 25 Jan 2024 10:07:21 +0100 Subject: [PATCH 22/25] hard-coded tests -- need to remove after validaiton --- .../include/DataFormatsTPC/ClusterNative.h | 11 +++ GPU/CMakeLists.txt | 4 +- GPU/GPUTracking/CMakeLists.txt | 2 +- .../DataCompression/GPUTPCDecompression.h | 9 --- .../GPUTPCDecompressionKernels.cxx | 8 ++- .../Global/GPUChainTrackingCompression.cxx | 67 +++++++++++++++++-- .../Standalone/Benchmark/standalone.cxx | 2 +- 7 files changed, 81 insertions(+), 22 deletions(-) diff --git a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h index 49d61007092a8..c5e103362f0f4 100644 --- a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h +++ b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h @@ -156,6 +156,17 @@ struct ClusterNative { return (this->getFlags() < rhs.getFlags()); } } + + GPUd() bool operator==(const ClusterNative& rhs) const + { + return this->getTimePacked() == rhs.getTimePacked() && + this->padPacked == rhs.padPacked && + this->sigmaTimePacked == rhs.sigmaTimePacked && + this->sigmaPadPacked == rhs.sigmaPadPacked && + this->qMax == rhs.qMax && + this->qTot == rhs.qTot && + this->getFlags() == rhs.getFlags(); + } }; // This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 6e08e9582761a..308c9f161c90f 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -18,8 +18,10 @@ # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers # for installation only +set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... + if(NOT DEFINED GPUCA_NO_FAST_MATH) - set(GPUCA_NO_FAST_MATH 0) + set(GPUCA_NO_FAST_MATH 1) endif() if(${GPUCA_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math -ffp-contract=off") diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 18a187bf5f64d..200ac832a433e 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -11,7 +11,7 @@ set(MODULE GPUTracking) -set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... +# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... include(cmake/helpers.cmake) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h index e4b2cca65c007..d2cb749dd79c3 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -66,15 +66,6 @@ class GPUTPCDecompression : public GPUProcessor o2::tpc::ClusterNative* mTmpNativeClusters; o2::tpc::ClusterNativeAccess* mClusterNativeAccess; o2::tpc::ClusterNative* mNativeClustersBuffer; -/* class ConcurrentClusterNativeBuffer{ - size_t mIndex; - size_t mCapacity = 10; - o2::tpc::ClusterNative* mCmprClsBuffer; - public: - ConcurrentClusterNativeBuffer(); - void push_back(ClusterNative cluster); - };*/ -// ConcurrentClusterNativeBuffer* tmpBuffer; template void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index 033a38391dec7..f222350d574c7 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -131,12 +131,14 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadclusterOffset[slice][row]; + ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row]; if (decompressor.mNativeClustersIndex[i] != 0) { memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0])); } @@ -144,7 +146,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]); decompressHits(cmprClusters, offsets[i], end, clout); if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) { - for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) { + for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) { auto& cl = buffer[k]; float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins; if (t < 0) { @@ -156,7 +158,7 @@ GPUdii() void GPUTPCDecompressionKernels::ThreadnClusters[slice][row]); + GPUCommonAlgorithm::sort(buffer, buffer + outputAccess->nClusters[slice][row]); } } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index f924602baaa5d..3bfaf33f0dbba 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -29,7 +29,7 @@ using namespace o2::tpc; int GPUChainTracking::RunTPCCompression() { LOGP(info, "====== Compression"); -//#ifdef GPUCA_HAVE_O2HEADERS +#ifdef GPUCA_HAVE_O2HEADERS mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR")); RecoStep myStep = RecoStep::TPCCompression; bool doGPU = GetRecoStepsGPU() & RecoStep::TPCCompression; @@ -200,7 +200,7 @@ int GPUChainTracking::RunTPCCompression() ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec); } mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR")); -//#endif +#endif return 0; } @@ -208,8 +208,14 @@ int GPUChainTracking::RunTPCDecompression() { LOGP(info, "====== Decompression"); -//#ifdef GPUCA_HAVE_O2HEADERS +#ifdef GPUCA_HAVE_O2HEADERS // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); + ClusterNativeAccess* original = new ClusterNativeAccess; + original->clustersLinear = new ClusterNative[mIOPtrs.clustersNative->nClustersTotal]; + memcpy((void*)original->clustersLinear,mIOPtrs.clustersNative->clustersLinear,mIOPtrs.clustersNative->nClustersTotal*sizeof(mIOPtrs.clustersNative->clustersLinear[0])); + memcpy((void*)original->nClusters,mIOPtrs.clustersNative->nClusters,NSLICES*GPUCA_ROW_COUNT*sizeof(mIOPtrs.clustersNative->nClusters[0][0])); + original->setOffsetPtrs(); + RecoStep myStep = RecoStep::TPCDecompression; bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true GPUTPCDecompression& Decompressor = processors()->tpcDecompressor; @@ -228,9 +234,14 @@ int GPUChainTracking::RunTPCDecompression() inputGPU.maxTimeBin = param().par.continuousMaxTimeBin; SetupGPUProcessor(&Decompressor, true); + for(int k = 0; k < cmprClsHost.nSliceRowClusters[0]; k++){ + if(cmprClsHost.sigmaTimeU[k] == 14 && cmprClsHost.sigmaPadU[k] == 14 && cmprClsHost.qMaxU[k] == 88 && cmprClsHost.qTotU[k] == 160){ + LOGP(info,"==== Cluster unattached [{}] sigmaTimeU: {} simgaPadU: {} qMaxU: {} qTotU: {}",k,cmprClsHost.sigmaTimeU[k],cmprClsHost.sigmaPadU[k],cmprClsHost.qMaxU[k],cmprClsHost.qTotU[k]); + } + } + size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU); WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); - //TransferMemoryResourcesToGPU(myStep, &Decompressor, 0); int outputStream = 0; bool toGPU = true; @@ -313,8 +324,11 @@ int GPUChainTracking::RunTPCDecompression() runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone); ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative]; + ClusterNativeAccess gpuBuffer = *mInputsHost->mPclusterNativeAccess; + gpuBuffer.clustersLinear = tmpBuffer; //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); + gpuBuffer.setOffsetPtrs(); TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { @@ -345,10 +359,49 @@ int GPUChainTracking::RunTPCDecompression() SynchronizeStream(0); } - LOGP(info,"==== My version/cpu version: {}/{}", tmpBuffer[0].qTot,mInputsHost->mPclusterNativeOutput[0].qTot); - + const ClusterNativeAccess* decoded = &gpuBuffer; //mIOPtrs.clustersNative; + // original = (ClusterNativeAccess*)mIOPtrs.clustersNative; + unsigned int decodingErrors = 0; + std::vector tmpClusters; + if (param().rec.tpc.rejectionStrategy == GPUSettings::RejectionNone) { // verification does not make sense if we reject clusters during compression + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + if (original->nClusters[i][j] != decoded->nClusters[i][j]) { + GPUError("Number of clusters mismatch slice %u row %u: expected %d v.s. decoded %d", i, j, original->nClusters[i][j], decoded->nClusters[i][j]); + decodingErrors++; + continue; + } + tmpClusters.resize(original->nClusters[i][j]); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + tmpClusters[k] = original->clusters[i][j][k]; + if (param().rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) { + GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param()); + GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param()); + } + } + std::sort(tmpClusters.begin(), tmpClusters.end()); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + const o2::tpc::ClusterNative& c1 = tmpClusters[k]; + const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k]; + if (!(c1 == c2)) { + if (decodingErrors++ < 100) { + GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot); + GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot); + } + } + } + } + } + if (decodingErrors) { + GPUWarning("Errors during cluster decoding %u\n", decodingErrors); + } else { + GPUInfo("Cluster decoding verification: PASSED"); + } + } delete[] tmpBuffer; // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); -//#endif +#endif return 0; } \ No newline at end of file diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 04bf75f39bab3..5805bc386ab2e 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -667,7 +667,7 @@ int RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingUse, chainTrackingAsync->mIOPtrs.rawClusters[i] = nullptr; chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } - chainTrackingAsync->mIOPtrs.clustersNative = nullptr; + chainTrackingAsync->mIOPtrs.clustersNative = chainTrackingUse->mIOPtrs.clustersNative; //todo: revert back to nullptr recAsync->SetResetTimers(iRun < configStandalone.runsInit); tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { From 2f773de60daedca34d65031f6a9edfed4249271b Mon Sep 17 00:00:00 2001 From: cima22 Date: Mon, 29 Jan 2024 15:34:05 +0100 Subject: [PATCH 23/25] code cleaning and adapting to new kernel registration --- .../DataCompression/GPUTPCDecompression.cxx | 17 +---------------- .../GPUTPCDecompressionKernels.cxx | 1 + .../GPUTPCDecompressionKernels.h | 2 +- .../Global/GPUChainTrackingCompression.cxx | 3 --- GPU/GPUTracking/kernels.cmake | 2 ++ 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx index 922bfc120d136..c70422a206bf2 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -67,7 +67,6 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){ computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer); - //computePointerWithAlignment(mem,mClusterNativeAccess); return mem; } @@ -92,18 +91,4 @@ void GPUTPCDecompression::RegisterMemoryAllocation() { void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ //mMaxNativeClustersPerBuffer = 81760; mMaxNativeClustersPerBuffer = 12000; -} -/* -GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer(): -mCmprClsBuffer{new o2::tpc::ClusterNative[mCapacity]},mIndex{0} -{} - -void GPUTPCDecompression::ConcurrentClusterNativeBuffer::push_back(tpc::ClusterNative cluster) -{ - if(mIndex == mCapacity){ - //reallocate? - return; - } - unsigned int current = CAMath::AtomicAdd(mIndex, 1u); - mTmpNativeClusters[current] = cluster; -}*/ \ No newline at end of file +} \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx index f222350d574c7..1a726d00ebcd2 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -17,6 +17,7 @@ #include "GPUConstantMem.h" #include "GPUTPCCompressionTrackModel.h" #include "GPUCommonAlgorithm.h" +#include using namespace GPUCA_NAMESPACE::gpu; using namespace o2::tpc; diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h index e70f7486bebbf..82041d05fcb7c 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -41,7 +41,7 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate enum K : int { step0attached = 0, step1unattached = 1, - prepareAccess = 2 + //prepareAccess = 2 }; template diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 3bfaf33f0dbba..9a384f4ce9d8c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -338,9 +338,6 @@ int GPUChainTracking::RunTPCDecompression() }; auto& gatherTimer = getTimer("TPCDecompression", 0); gatherTimer.Start(); - - LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",cmprClsHost.nAttachedClusters,cmprClsHost.nUnattachedClusters,cmprClsHost.nTracks); - if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) { GPUError("Error decompressing clusters"); return 1; diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index b06749010d2d9..d7302f17271d2 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -73,6 +73,8 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" LB simple) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" LB simple) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" LB simple) o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" LB single) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" LB single) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" LB single) From d1d648aa6e0cefab9968de9efd965fae0ae6c137 Mon Sep 17 00:00:00 2001 From: cima22 Date: Tue, 30 Jan 2024 10:39:32 +0100 Subject: [PATCH 24/25] Removed debugging messages and added no-fast-math option --- GPU/CMakeLists.txt | 2 +- GPU/GPUTracking/CMakeLists.txt | 1 + .../Global/GPUChainTrackingCompression.cxx | 13 ++----------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 308c9f161c90f..a3232b454d5ba 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -18,7 +18,7 @@ # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers # for installation only -set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... +#set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... if(NOT DEFINED GPUCA_NO_FAST_MATH) set(GPUCA_NO_FAST_MATH 1) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 200ac832a433e..9973720250ad8 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -471,6 +471,7 @@ endif() set_source_files_properties(DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCDecompressionKernels.cxx TARGET_DIRECTORY ${targetName} PROPERTIES COMPILE_FLAGS "-fno-fast-math -ffp-contract=off") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 9a384f4ce9d8c..e0a4ba3ccd8a2 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -28,7 +28,6 @@ using namespace o2::tpc; int GPUChainTracking::RunTPCCompression() { - LOGP(info, "====== Compression"); #ifdef GPUCA_HAVE_O2HEADERS mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR")); RecoStep myStep = RecoStep::TPCCompression; @@ -206,8 +205,6 @@ int GPUChainTracking::RunTPCCompression() int GPUChainTracking::RunTPCDecompression() { - LOGP(info, "====== Decompression"); - #ifdef GPUCA_HAVE_O2HEADERS // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); ClusterNativeAccess* original = new ClusterNativeAccess; @@ -234,12 +231,6 @@ int GPUChainTracking::RunTPCDecompression() inputGPU.maxTimeBin = param().par.continuousMaxTimeBin; SetupGPUProcessor(&Decompressor, true); - for(int k = 0; k < cmprClsHost.nSliceRowClusters[0]; k++){ - if(cmprClsHost.sigmaTimeU[k] == 14 && cmprClsHost.sigmaPadU[k] == 14 && cmprClsHost.qMaxU[k] == 88 && cmprClsHost.qTotU[k] == 160){ - LOGP(info,"==== Cluster unattached [{}] sigmaTimeU: {} simgaPadU: {} qMaxU: {} qTotU: {}",k,cmprClsHost.sigmaTimeU[k],cmprClsHost.sigmaPadU[k],cmprClsHost.qMaxU[k],cmprClsHost.qTotU[k]); - } - } - size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU); WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); @@ -384,8 +375,8 @@ int GPUChainTracking::RunTPCDecompression() const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k]; if (!(c1 == c2)) { if (decodingErrors++ < 100) { - GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot); - GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot); + //GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot); + //GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot); } } } From f0249effd4cab2041abe3e49984f007d016ff761 Mon Sep 17 00:00:00 2001 From: cima22 Date: Tue, 30 Jan 2024 10:50:43 +0100 Subject: [PATCH 25/25] removed unnecessary files --- .../.cmake/api/v1/query/cache-v2 | 0 .../.cmake/api/v1/query/cmakeFiles-v1 | 0 .../.cmake/api/v1/query/codemodel-v2 | 0 .../.cmake/api/v1/query/toolchains-v1 | 0 cmake-build-debug/DartConfiguration.tcl | 106 ------------------ 5 files changed, 106 deletions(-) delete mode 100644 cmake-build-debug/.cmake/api/v1/query/cache-v2 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/codemodel-v2 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/toolchains-v1 delete mode 100644 cmake-build-debug/DartConfiguration.tcl diff --git a/cmake-build-debug/.cmake/api/v1/query/cache-v2 b/cmake-build-debug/.cmake/api/v1/query/cache-v2 deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 b/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 b/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 b/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl deleted file mode 100644 index 09b3136cfee5f..0000000000000 --- a/cmake-build-debug/DartConfiguration.tcl +++ /dev/null @@ -1,106 +0,0 @@ -# This file is configured by CMake automatically as DartConfiguration.tcl -# If you choose not to use CMake, this file may be hand configured, by -# filling in the required variables. - - -# Configuration directories and files -SourceDirectory: /experiments/alice/cimag/o2/O2 -BuildDirectory: /experiments/alice/cimag/o2/O2/cmake-build-debug - -# Where to place the cost data store -CostDataFile: - -# Site is something like machine.domain, i.e. pragmatic.crd -Site: gr3srv.ts.infn.it - -# Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++ -BuildName: Linux-c++ - -# Subprojects -LabelsForSubprojects: - -# Submission information -SubmitURL: http:// -SubmitInactivityTimeout: - -# Dashboard start time -NightlyStartTime: 00:00:00 EDT - -# Commands for the build/test/submit cycle -ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2" -MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}" -DefaultCTestConfigurationType: Release - -# version control -UpdateVersionOnly: - -# CVS options -# Default is "-d -P -A" -CVSCommand: -CVSUpdateOptions: - -# Subversion options -SVNCommand: -SVNOptions: -SVNUpdateOptions: - -# Git options -GITCommand: /usr/bin/git -GITInitSubmodules: -GITUpdateOptions: -GITUpdateCustom: - -# Perforce options -P4Command: -P4Client: -P4Options: -P4UpdateOptions: -P4UpdateCustom: - -# Generic update command -UpdateCommand: /usr/bin/git -UpdateOptions: -UpdateType: git - -# Compiler info -Compiler: /usr/bin/c++ -CompilerVersion: 4.8.5 - -# Dynamic analysis (MemCheck) -PurifyCommand: -ValgrindCommand: -ValgrindCommandOptions: -DrMemoryCommand: -DrMemoryCommandOptions: -CudaSanitizerCommand: -CudaSanitizerCommandOptions: -MemoryCheckType: -MemoryCheckSanitizerOptions: -MemoryCheckCommand: /usr/bin/valgrind -MemoryCheckCommandOptions: -MemoryCheckSuppressionFile: - -# Coverage -CoverageCommand: /usr/bin/gcov -CoverageExtraFlags: -l - -# Testing options -# TimeOut is the amount of time in seconds to wait for processes -# to complete during testing. After TimeOut seconds, the -# process will be summarily terminated. -# Currently set to 25 minutes -TimeOut: 1500 - -# During parallel testing CTest will not start a new test if doing -# so would cause the system load to exceed this value. -TestLoad: - -UseLaunchers: -CurlOptions: -# warning, if you add new options here that have to do with submit, -# you have to update cmCTestSubmitCommand.cxx - -# For CTest submissions that timeout, these options -# specify behavior for retrying the submission -CTestSubmitRetryDelay: 5 -CTestSubmitRetryCount: 3