From 78b07b8949dd33cfc2a6a513b6466f6b97f6a602 Mon Sep 17 00:00:00 2001
From: Gabriele Cimador <cimag@gr3srv.ts.infn.it>
Date: Thu, 16 Nov 2023 09:53:14 +0100
Subject: [PATCH 01/25] test

---
 .../Global/GPUChainTrackingCompression.cxx    |   2 +-
 .../.cmake/api/v1/query/cache-v2              |   0
 .../.cmake/api/v1/query/cmakeFiles-v1         |   0
 .../.cmake/api/v1/query/codemodel-v2          |   0
 .../.cmake/api/v1/query/toolchains-v1         |   0
 cmake-build-debug/DartConfiguration.tcl       | 106 ++++++++++++++++++
 6 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 cmake-build-debug/.cmake/api/v1/query/cache-v2
 create mode 100644 cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1
 create mode 100644 cmake-build-debug/.cmake/api/v1/query/codemodel-v2
 create mode 100644 cmake-build-debug/.cmake/api/v1/query/toolchains-v1
 create mode 100644 cmake-build-debug/DartConfiguration.tcl

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 3c622751acad1..edbbf28be8929 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -22,7 +22,7 @@
 #include "TPCClusterDecompressor.h"
 #endif
 #include "utils/strtag.h"
-
+//prova
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
 
diff --git a/cmake-build-debug/.cmake/api/v1/query/cache-v2 b/cmake-build-debug/.cmake/api/v1/query/cache-v2
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 b/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 b/cmake-build-debug/.cmake/api/v1/query/codemodel-v2
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 b/cmake-build-debug/.cmake/api/v1/query/toolchains-v1
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl
new file mode 100644
index 0000000000000..2077a80fa68de
--- /dev/null
+++ b/cmake-build-debug/DartConfiguration.tcl
@@ -0,0 +1,106 @@
+# This file is configured by CMake automatically as DartConfiguration.tcl
+# If you choose not to use CMake, this file may be hand configured, by
+# filling in the required variables.
+
+
+# Configuration directories and files
+SourceDirectory: /experiments/alice/cimag/o2/O2
+BuildDirectory: /experiments/alice/cimag/o2/O2/cmake-build-debug
+
+# Where to place the cost data store
+CostDataFile: 
+
+# Site is something like machine.domain, i.e. pragmatic.crd
+Site: gr3srv.ts.infn.it
+
+# Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++
+BuildName: Linux-c++
+
+# Subprojects
+LabelsForSubprojects: 
+
+# Submission information
+SubmitURL: http://
+SubmitInactivityTimeout: 
+
+# Dashboard start time
+NightlyStartTime: 00:00:00 EDT
+
+# Commands for the build/test/submit cycle
+ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2"
+MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}"
+DefaultCTestConfigurationType: Release
+
+# version control
+UpdateVersionOnly: 
+
+# CVS options
+# Default is "-d -P -A"
+CVSCommand: 
+CVSUpdateOptions: 
+
+# Subversion options
+SVNCommand: 
+SVNOptions: 
+SVNUpdateOptions: 
+
+# Git options
+GITCommand: /usr/bin/git
+GITInitSubmodules: 
+GITUpdateOptions: 
+GITUpdateCustom: 
+
+# Perforce options
+P4Command: 
+P4Client: 
+P4Options: 
+P4UpdateOptions: 
+P4UpdateCustom: 
+
+# Generic update command
+UpdateCommand: /usr/bin/git
+UpdateOptions: 
+UpdateType: git
+
+# Compiler info
+Compiler: /usr/bin/c++
+CompilerVersion: 4.8.5
+
+# Dynamic analysis (MemCheck)
+PurifyCommand: 
+ValgrindCommand: 
+ValgrindCommandOptions: 
+DrMemoryCommand: 
+DrMemoryCommandOptions: 
+CudaSanitizerCommand: 
+CudaSanitizerCommandOptions: 
+MemoryCheckType: 
+MemoryCheckSanitizerOptions: 
+MemoryCheckCommand: /usr/bin/valgrind
+MemoryCheckCommandOptions: 
+MemoryCheckSuppressionFile: 
+
+# Coverage
+CoverageCommand: /usr/bin/gcov
+CoverageExtraFlags: -l
+
+# Testing options
+# TimeOut is the amount of time in seconds to wait for processes
+# to complete during testing.  After TimeOut seconds, the
+# process will be summarily terminated.
+# Currently set to 25 minutes
+TimeOut: 1500
+
+# During parallel testing CTest will not start a new test if doing
+# so would cause the system load to exceed this value.
+TestLoad: 
+
+UseLaunchers: 
+CurlOptions: 
+# warning, if you add new options here that have to do with submit,
+# you have to update cmCTestSubmitCommand.cxx
+
+# For CTest submissions that timeout, these options
+# specify behavior for retrying the submission
+CTestSubmitRetryDelay: 5
+CTestSubmitRetryCount: 3

From 7d4340dac4e6b51f9b01940d2c4f73be92e6b080 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Fri, 17 Nov 2023 09:18:58 +0100
Subject: [PATCH 02/25] test

---
 CMakeLists.txt                                             | 3 +++
 GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf1d83fbb6644..feabbddfcf0f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,9 @@ project(O2 LANGUAGES C CXX VERSION 1.2.0)
 
 include(CTest)
 
+set(CUDA_COMPUTETARGET 75)
+set(GPUCA_NO_FAST_MATH 1)
+
 # Project wide setup
 
 # Would better fit inside GPU/CMakeLists.txt, but include GPU/Common directly
diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
index b8f491a6f5767..8ca42f8e54af6 100644
--- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
+++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
@@ -24,7 +24,7 @@
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
-
+//prova
 int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<o2::tpc::ClusterNative*(size_t)> allocator, const GPUParam& param)
 {
   CompressedClusters c;

From 31e10d02e2f2db2288d2e50c4a2305f6ee236b2a Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Fri, 17 Nov 2023 09:25:54 +0100
Subject: [PATCH 03/25] test2

---
 GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
index 8ca42f8e54af6..b8f491a6f5767 100644
--- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
+++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
@@ -24,7 +24,7 @@
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
-//prova
+
 int TPCClusterDecompressor::decompress(const CompressedClustersFlat* clustersCompressed, o2::tpc::ClusterNativeAccess& clustersNative, std::function<o2::tpc::ClusterNative*(size_t)> allocator, const GPUParam& param)
 {
   CompressedClusters c;

From ac8042f7a08baa4ca3ef75836f8e97d70a84f1c7 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Fri, 17 Nov 2023 15:16:42 +0100
Subject: [PATCH 04/25] Dummy Kernel Implementation

---
 GPU/GPUTracking/Base/GPUReconstructionCPU.h   |  1 +
 .../Base/GPUReconstructionIncludesDevice.h    |  3 ++
 GPU/GPUTracking/CMakeLists.txt                |  1 +
 .../GPUTPCDecompressionKernels.cxx            | 25 +++++++++++++
 .../GPUTPCDecompressionKernels.h              | 35 +++++++++++++++++++
 .../Definitions/GPUDefGPUParameters.h         |  7 ++++
 .../Global/GPUChainTrackingCompression.cxx    |  4 ++-
 7 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
 create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h

diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
index 5b176218984e5..97642734a60b7 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -39,6 +39,7 @@
 #include "GPUITSFitterKernels.h"
 #include "GPUTPCConvertKernel.h"
 #include "GPUTPCCompressionKernels.h"
+#include "GPUTPCDecompressionKernels.h"
 #include "GPUTPCClusterFinderKernels.h"
 #include "GPUTrackingRefitKernel.h"
 #include "GPUTPCGMO2Output.h"
diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
index 0297a1a6cb9d1..81fde1dbfb996 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
+++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h
@@ -75,6 +75,9 @@ using namespace GPUCA_NAMESPACE::gpu;
 #include "GPUTPCCompressionKernels.cxx"
 #include "GPUTPCCompressionTrackModel.cxx"
 
+//Files for TPC Decompression
+#include "GPUTPCDecompressionKernels.cxx"
+
 // Files for TPC Cluster Finder
 #include "ClusterAccumulator.cxx"
 #include "GPUTPCCFStreamCompaction.cxx"
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 0811bb6d1ad79..390773b11a81e 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -176,6 +176,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS)
         DataCompression/GPUTPCCompression.cxx
         DataCompression/GPUTPCCompressionTrackModel.cxx
         DataCompression/GPUTPCCompressionKernels.cxx
+        DataCompression/GPUTPCDecompressionKernels.cxx
         DataCompression/TPCClusterDecompressor.cxx
         DataCompression/GPUTPCClusterStatistics.cxx
         TPCClusterFinder/GPUTPCClusterFinder.cxx
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
new file mode 100644
index 0000000000000..693d6663d69a0
--- /dev/null
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -0,0 +1,25 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCDecompressionKernels.cxx
+/// \author Gabriele Cimador
+
+#include "GPUTPCDecompressionKernels.h"
+#include <stdio.h>
+
+using namespace GPUCA_NAMESPACE::gpu;
+using namespace o2::tpc;
+
+template <>
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::test>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  printf("Hello world!");
+}
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
new file mode 100644
index 0000000000000..a9a8f70dcba1e
--- /dev/null
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -0,0 +1,35 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCDecompressionKernels.h
+/// \author Gabriele Cimador
+
+#ifndef GPUTPCDECOMPRESSIONKERNELS_H
+#define GPUTPCDECOMPRESSIONKERNELS_H
+
+#include "GPUGeneralKernels.h"
+
+namespace GPUCA_NAMESPACE::gpu
+{
+
+class GPUTPCDecompressionKernels : public GPUKernelTemplate
+{
+ public:
+  enum K : int {
+    test = 0,
+  };
+
+  template <int iKernel = defaultKernel>
+  GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);
+};
+
+}
+#endif // GPUTPCDECOMPRESSIONKERNELS_H
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index 07f9035329847..d88fc055099fb 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -71,6 +71,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -135,6 +136,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_test 192, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -198,6 +200,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448
@@ -262,6 +265,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128
+  #define GPUCA_LB_GPUTPCDecompressionKernels_test 128
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
@@ -322,6 +326,9 @@
   #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached
     #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256
   #endif
+  #ifndef GPUCA_LB_GPUTPCDecompressionKernels_test
+    #define GPUCA_LB_GPUTPCDecompressionKernels_test 256
+  #endif
   #ifndef GPUCA_LB_GPUTPCCFDecodeZS
     #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4
   #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index edbbf28be8929..ecff409ae1522 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -22,7 +22,7 @@
 #include "TPCClusterDecompressor.h"
 #endif
 #include "utils/strtag.h"
-//prova
+
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
 
@@ -223,7 +223,9 @@ int GPUChainTracking::RunTPCDecompression()
   }
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
+  runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
   if (mRec->IsGPU()) {
+    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
     AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
     WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);

From 64c05c4b9384fd18b4416aaee6368a5d056f9bdb Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Sun, 19 Nov 2023 15:24:56 +0100
Subject: [PATCH 05/25] Working dummy kernel

---
 .../DataCompression/GPUTPCDecompressionKernels.cxx            | 3 ++-
 GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h  | 2 ++
 GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx        | 4 +++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 693d6663d69a0..ccc289dd793c3 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -14,6 +14,7 @@
 
 #include "GPUTPCDecompressionKernels.h"
 #include <stdio.h>
+#include "GPULogging.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
@@ -21,5 +22,5 @@ using namespace o2::tpc;
 template <>
 GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::test>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
 {
-  printf("Hello world!");
+  //LOGP(info, "===== Hello world!");
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index a9a8f70dcba1e..6be320026866a 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -23,6 +23,8 @@ namespace GPUCA_NAMESPACE::gpu
 class GPUTPCDecompressionKernels : public GPUKernelTemplate
 {
  public:
+  GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCDecompression; }
+
   enum K : int {
     test = 0,
   };
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index ecff409ae1522..0901ad0c4576e 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -28,6 +28,7 @@ using namespace o2::tpc;
 
 int GPUChainTracking::RunTPCCompression()
 {
+  LOGP(info, "====== Compression");
 #ifdef GPUCA_HAVE_O2HEADERS
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR"));
   RecoStep myStep = RecoStep::TPCCompression;
@@ -207,6 +208,7 @@ int GPUChainTracking::RunTPCCompression()
 
 int GPUChainTracking::RunTPCDecompression()
 {
+  LOGP(info, "====== Decompression");
 #ifdef GPUCA_HAVE_O2HEADERS
   const auto& threadContext = GetThreadContext();
   TPCClusterDecompressor decomp;
@@ -223,7 +225,7 @@ int GPUChainTracking::RunTPCDecompression()
   }
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
-  runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
+  LOGP(info, "====== isGPU: {} ", mRec->IsGPU());
   if (mRec->IsGPU()) {
     runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
     AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);

From 5257993bc2cfebf4fe706d318a917f0ba5400ce1 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Mon, 20 Nov 2023 15:28:17 +0100
Subject: [PATCH 06/25] Created GPUTPCDecompressor context and added to
 constant memory model

---
 GPU/GPUTracking/Base/GPUConstantMem.h         |  2 ++
 GPU/GPUTracking/CMakeLists.txt                |  1 +
 .../DataCompression/GPUTPCDecompression.cxx   | 15 ++++++++
 .../DataCompression/GPUTPCDecompression.h     | 34 +++++++++++++++++++
 .../Global/GPUChainTrackingCompression.cxx    |  5 +++
 5 files changed, 57 insertions(+)
 create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
 create mode 100644 GPU/GPUTracking/DataCompression/GPUTPCDecompression.h

diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h
index b9b93100d5bfd..d18e7863e2001 100644
--- a/GPU/GPUTracking/Base/GPUConstantMem.h
+++ b/GPU/GPUTracking/Base/GPUConstantMem.h
@@ -46,6 +46,7 @@ class GPUTRDTracker_t
 #if defined(GPUCA_NOCOMPAT_ALLCINT) && (!defined(GPUCA_GPULIBRARY) || !defined(GPUCA_ALIROOT_LIB)) && defined(GPUCA_HAVE_O2HEADERS)
 #include "GPUTPCConvert.h"
 #include "GPUTPCCompression.h"
+#include "GPUTPCDecompression.h"
 #include "GPUITSFitter.h"
 #include "GPUTPCClusterFinder.h"
 #include "GPUTrackingRefit.h"
@@ -69,6 +70,7 @@ struct GPUConstantMem {
   tpcTrackers[GPUCA_NSLICES];
   GPUTPCConvert tpcConverter;
   GPUTPCCompression tpcCompressor;
+  GPUTPCDecompression tpcDecompressor;
   GPUTPCGMMerger tpcMerger;
   GPUTRDTrackerGPU trdTrackerGPU;
 #ifdef GPUCA_HAVE_O2HEADERS
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 390773b11a81e..db2d3f7c0accd 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -176,6 +176,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS)
         DataCompression/GPUTPCCompression.cxx
         DataCompression/GPUTPCCompressionTrackModel.cxx
         DataCompression/GPUTPCCompressionKernels.cxx
+        DataCompression/GPUTPCDecompression.cxx
         DataCompression/GPUTPCDecompressionKernels.cxx
         DataCompression/TPCClusterDecompressor.cxx
         DataCompression/GPUTPCClusterStatistics.cxx
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
new file mode 100644
index 0000000000000..34a790703fdfc
--- /dev/null
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -0,0 +1,15 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCDecompression.cxx
+/// \author Gabriele Cimador
+
+#include "GPUTPCDecompression.h"
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
new file mode 100644
index 0000000000000..c7e3ccb1eb762
--- /dev/null
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -0,0 +1,34 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUTPCDecompression.h
+/// \author Gabriele Cimador
+
+#ifndef GPUTPCDECOMPRESSION_H
+#define GPUTPCDECOMPRESSION_H
+
+#include "GPUDef.h"
+#include "GPUProcessor.h"
+#include "GPUCommonMath.h"
+#include "GPUParam.h"
+
+namespace GPUCA_NAMESPACE::gpu
+{
+class GPUTPCGMMerger;
+
+class GPUTPCDecompression : public GPUProcessor
+{
+  friend class GPUTPCDecmpressionKernels;
+  friend class GPUChainTracking;
+
+};
+}
+#endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 0901ad0c4576e..f75f79f3cc271 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -209,7 +209,10 @@ int GPUChainTracking::RunTPCCompression()
 int GPUChainTracking::RunTPCDecompression()
 {
   LOGP(info, "====== Decompression");
+
 #ifdef GPUCA_HAVE_O2HEADERS
+  RecoStep myStep = RecoStep::TPCDecompression;
+  bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true
   const auto& threadContext = GetThreadContext();
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
@@ -241,3 +244,5 @@ int GPUChainTracking::RunTPCDecompression()
 #endif
   return 0;
 }
+
+

From 7405a42e3b647217d86e66fc6a3388d2cce60328 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Wed, 22 Nov 2023 12:04:56 +0100
Subject: [PATCH 07/25] GPUTPCDecompressor now able to move variables to GPU
 version

---
 .../DataCompression/GPUTPCDecompression.cxx      | 16 ++++++++++++++++
 .../DataCompression/GPUTPCDecompression.h        |  9 ++++++++-
 .../GPUTPCDecompressionKernels.cxx               | 12 ++++++++++--
 GPU/GPUTracking/Global/GPUChainTracking.cxx      |  6 ++++++
 .../Global/GPUChainTrackingCompression.cxx       |  8 +++++++-
 5 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 34a790703fdfc..5a1d3aa84e3ef 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -13,3 +13,19 @@
 /// \author Gabriele Cimador
 
 #include "GPUTPCDecompression.h"
+#include "GPUReconstruction.h"
+#include "GPUO2DataTypes.h"
+#include "GPUMemorySizeScalers.h"
+
+using namespace GPUCA_NAMESPACE::gpu;
+
+void GPUTPCDecompression::InitializeProcessor() {}
+
+void GPUTPCDecompression::RegisterMemoryAllocation()
+{
+}
+
+void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
+{
+
+}
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index c7e3ccb1eb762..9e4def065121f 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -22,13 +22,20 @@
 
 namespace GPUCA_NAMESPACE::gpu
 {
-class GPUTPCGMMerger;
 
 class GPUTPCDecompression : public GPUProcessor
 {
   friend class GPUTPCDecmpressionKernels;
   friend class GPUChainTracking;
 
+ public:
+  unsigned int test = 42;
+  unsigned int* testP;
+#ifndef GPUCA_GPUCODE
+  void InitializeProcessor();
+  void RegisterMemoryAllocation();
+  void SetMaxData(const GPUTrackingInOutPointers& io);
+#endif
 };
 }
 #endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index ccc289dd793c3..9cd1ef7484473 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -13,8 +13,9 @@
 /// \author Gabriele Cimador
 
 #include "GPUTPCDecompressionKernels.h"
-#include <stdio.h>
 #include "GPULogging.h"
+#include "GPUConstantMem.h"
+
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
@@ -22,5 +23,12 @@ using namespace o2::tpc;
 template <>
 GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::test>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
 {
-  //LOGP(info, "===== Hello world!");
+  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  unsigned int x = decompressor.test;
+  unsigned int y = compressor.mTest;
+  if (!iThread && !iBlock) {
+    GPUInfo("==== Test: X={%d} Y={%d} \n", x, y);
+  }
+
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 403aa0dae2800..1a0aba3533b4f 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -105,6 +105,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors()
   if (GetRecoSteps() & RecoStep::TPCCompression) {
     mRec->RegisterGPUProcessor(&processors()->tpcCompressor, GetRecoStepsGPU() & RecoStep::TPCCompression);
   }
+  if (GetRecoSteps() & RecoStep::TPCDecompression) {
+    mRec->RegisterGPUProcessor(&processors()->tpcDecompressor, GetRecoStepsGPU() & RecoStep::TPCDecompression);
+  }
   if (GetRecoSteps() & RecoStep::TPCClusterFinding) {
     for (unsigned int i = 0; i < NSLICES; i++) {
       mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding);
@@ -149,6 +152,9 @@ void GPUChainTracking::RegisterGPUProcessors()
   if (GetRecoStepsGPU() & RecoStep::TPCCompression) {
     mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcCompressor, &processors()->tpcCompressor);
   }
+  if (GetRecoStepsGPU() & RecoStep::TPCDecompression) {
+    mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcDecompressor, &processors()->tpcDecompressor);
+  }
   if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) {
     for (unsigned int i = 0; i < NSLICES; i++) {
       mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index f75f79f3cc271..56c8209ae0d81 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -45,7 +45,6 @@ int GPUChainTracking::RunTPCCompression()
   }
   SetupGPUProcessor(&Compressor, true);
   new (Compressor.mMemory) GPUTPCCompression::memory;
-
   WriteToConstantMemory(myStep, (char*)&processors()->tpcCompressor - (char*)processors(), &CompressorShadow, sizeof(CompressorShadow), 0);
   TransferMemoryResourcesToGPU(myStep, &Compressor, 0);
   runKernel<GPUMemClean16>(GetGridAutoStep(0, RecoStep::TPCCompression), krnlRunRangeNone, krnlEventNone, CompressorShadow.mClusterStatus, Compressor.mMaxClusters * sizeof(CompressorShadow.mClusterStatus[0]));
@@ -211,9 +210,16 @@ int GPUChainTracking::RunTPCDecompression()
   LOGP(info, "====== Decompression");
 
 #ifdef GPUCA_HAVE_O2HEADERS
+  mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true
+  GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
+  GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
+  SetupGPUProcessor(&Decompressor, false);
+  WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
+  TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
+
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
     this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;

From ceb00a6cdfc6cbe65f712d2b2fbac8149705cee7 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Thu, 23 Nov 2023 15:34:42 +0100
Subject: [PATCH 08/25] Test memcopy from host to gpu working and fixed typo

---
 GPU/GPUTracking/Base/GPUReconstruction.h                 | 2 +-
 GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx  | 9 ++++++++-
 GPU/GPUTracking/DataCompression/GPUTPCDecompression.h    | 5 ++++-
 .../DataCompression/GPUTPCDecompressionKernels.cxx       | 3 +--
 GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx   | 8 +++++---
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 8b56e46888e9b..518f0af43aae4 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -511,7 +511,7 @@ inline void GPUReconstruction::RegisterGPUProcessor(T* proc, bool deviceSlave)
 template <class T>
 inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate)
 {
-  static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derrived class");
+  static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class");
   if (allocate) {
     proc->SetMaxData(mHostConstantMem->ioPtrs);
   }
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 5a1d3aa84e3ef..b0ed4fded0eb1 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -21,8 +21,15 @@ using namespace GPUCA_NAMESPACE::gpu;
 
 void GPUTPCDecompression::InitializeProcessor() {}
 
-void GPUTPCDecompression::RegisterMemoryAllocation()
+void* GPUTPCDecompression::SetPointersMemory(void* mem)
 {
+  computePointerWithAlignment(mem, testP);
+  return mem;
+}
+
+void GPUTPCDecompression::RegisterMemoryAllocation()
+{  AllocateAndInitializeLate();
+  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 9e4def065121f..86c851e015036 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -30,7 +30,10 @@ class GPUTPCDecompression : public GPUProcessor
 
  public:
   unsigned int test = 42;
-  unsigned int* testP;
+  unsigned int* testP = nullptr;
+
+  void* SetPointersMemory(void* mem);
+
 #ifndef GPUCA_GPUCODE
   void InitializeProcessor();
   void RegisterMemoryAllocation();
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 9cd1ef7484473..3179cb6d5650b 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -26,9 +26,8 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::tes
   GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   unsigned int x = decompressor.test;
-  unsigned int y = compressor.mTest;
   if (!iThread && !iBlock) {
-    GPUInfo("==== Test: X={%d} Y={%d} \n", x, y);
+    GPUInfo("==== Test: X={%d}, *testP = {%d} \n", x, *decompressor.testP);
   }
 
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 56c8209ae0d81..d6cd5a26c0028 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -210,16 +210,18 @@ int GPUChainTracking::RunTPCDecompression()
   LOGP(info, "====== Decompression");
 
 #ifdef GPUCA_HAVE_O2HEADERS
-  mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
+ // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
   SetupGPUProcessor(&Decompressor, false);
+  new (Decompressor.testP) unsigned int;
+  *Decompressor.testP = 145;
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
-
+  LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP);
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
     this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
@@ -234,7 +236,6 @@ int GPUChainTracking::RunTPCDecompression()
   }
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
-  LOGP(info, "====== isGPU: {} ", mRec->IsGPU());
   if (mRec->IsGPU()) {
     runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
     AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
@@ -247,6 +248,7 @@ int GPUChainTracking::RunTPCDecompression()
     TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0);
     SynchronizeStream(0);
   }
+  // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR"));
 #endif
   return 0;
 }

From ae2b6c44c2359ad25826d47c20bbac4cc5800335 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Sat, 2 Dec 2023 10:11:18 +0100
Subject: [PATCH 09/25] Created inputBuffer on GPU for holding compressed
 clusters for decompression

---
 .../DataCompression/GPUTPCDecompression.cxx   | 52 +++++++++++++++++--
 .../DataCompression/GPUTPCDecompression.h     | 30 ++++++++++-
 .../Global/GPUChainTrackingCompression.cxx    | 18 ++++---
 cmake-build-debug/DartConfiguration.tcl       |  4 +-
 4 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index b0ed4fded0eb1..760e5174bd827 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -13,6 +13,7 @@
 /// \author Gabriele Cimador
 
 #include "GPUTPCDecompression.h"
+#include "GPUTPCCompression.h"
 #include "GPUReconstruction.h"
 #include "GPUO2DataTypes.h"
 #include "GPUMemorySizeScalers.h"
@@ -21,18 +22,61 @@ using namespace GPUCA_NAMESPACE::gpu;
 
 void GPUTPCDecompression::InitializeProcessor() {}
 
+void* GPUTPCDecompression::SetPointersInputGPU(void* mem)
+{
+  SetPointersCompressedClusters(mem, *mInputGPU, mInputGPU->nAttachedClusters, mInputGPU->nTracks, mInputGPU->nUnattachedClusters, true);
+  return mem;
+}
+
+template <class T>
+void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA)
+{
+  computePointerWithAlignment(mem, c.qTotU, nClU); // Do not reorder, qTotU ist used as first address in GPUChainTracking::RunTPCCompression
+  computePointerWithAlignment(mem, c.qMaxU, nClU);
+  computePointerWithAlignment(mem, c.flagsU, nClU);
+  computePointerWithAlignment(mem, c.padDiffU, nClU);
+  computePointerWithAlignment(mem, c.timeDiffU, nClU);
+  computePointerWithAlignment(mem, c.sigmaPadU, nClU);
+  computePointerWithAlignment(mem, c.sigmaTimeU, nClU);
+  computePointerWithAlignment(mem, c.nSliceRowClusters, GPUCA_ROW_COUNT * NSLICES);
+
+  unsigned int nClAreduced = reducedClA ? nClA - nTr : nClA;
+
+  if (!(mRec->GetParam().rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) {
+    return; // Track model disabled, do not allocate memory
+  }
+  computePointerWithAlignment(mem, c.qTotA, nClA);
+  computePointerWithAlignment(mem, c.qMaxA, nClA);
+  computePointerWithAlignment(mem, c.flagsA, nClA);
+  computePointerWithAlignment(mem, c.rowDiffA, nClAreduced);
+  computePointerWithAlignment(mem, c.sliceLegDiffA, nClAreduced);
+  computePointerWithAlignment(mem, c.padResA, nClAreduced);
+  computePointerWithAlignment(mem, c.timeResA, nClAreduced);
+  computePointerWithAlignment(mem, c.sigmaPadA, nClA);
+  computePointerWithAlignment(mem, c.sigmaTimeA, nClA);
+
+  computePointerWithAlignment(mem, c.qPtA, nTr);
+  computePointerWithAlignment(mem, c.rowA, nTr);
+  computePointerWithAlignment(mem, c.sliceA, nTr);
+  computePointerWithAlignment(mem, c.timeA, nTr);
+  computePointerWithAlignment(mem, c.padA, nTr);
+
+  computePointerWithAlignment(mem, c.nTrackClusters, nTr);
+}
+
 void* GPUTPCDecompression::SetPointersMemory(void* mem)
 {
+  computePointerWithAlignment(mem, mInputGPU);
   computePointerWithAlignment(mem, testP);
   return mem;
 }
 
-void GPUTPCDecompression::RegisterMemoryAllocation()
-{  AllocateAndInitializeLate();
+void GPUTPCDecompression::RegisterMemoryAllocation() {
+  AllocateAndInitializeLate();
+  mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT, "TPCDecompressionInput");
   mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory");
 }
 
-void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
-{
+void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
 
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 86c851e015036..37cc883e1db9c 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -20,6 +20,20 @@
 #include "GPUCommonMath.h"
 #include "GPUParam.h"
 
+#ifdef GPUCA_HAVE_O2HEADERS
+#include "DataFormatsTPC/CompressedClusters.h"
+#else
+namespace o2::tpc
+{
+struct CompressedClustersPtrs {
+};
+struct CompressedClusters {
+};
+struct CompressedClustersFlat {
+};
+} // namespace o2::tpc
+#endif
+
 namespace GPUCA_NAMESPACE::gpu
 {
 
@@ -32,13 +46,25 @@ class GPUTPCDecompression : public GPUProcessor
   unsigned int test = 42;
   unsigned int* testP = nullptr;
 
-  void* SetPointersMemory(void* mem);
-
 #ifndef GPUCA_GPUCODE
   void InitializeProcessor();
   void RegisterMemoryAllocation();
   void SetMaxData(const GPUTrackingInOutPointers& io);
+
+  void* SetPointersInputGPU(void* mem);
+  void* SetPointersMemory(void* mem);
 #endif
+
+ protected:
+  constexpr static unsigned int NSLICES = GPUCA_NSLICES;
+
+  template <class T>
+  void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
+
+  o2::tpc::CompressedClusters* mInputGPU = nullptr;
+
+  short mMemoryResInputGPU = -1;
+
 };
 }
 #endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index d6cd5a26c0028..2de1f8d08c649 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -29,7 +29,7 @@ using namespace o2::tpc;
 int GPUChainTracking::RunTPCCompression()
 {
   LOGP(info, "====== Compression");
-#ifdef GPUCA_HAVE_O2HEADERS
+//#ifdef GPUCA_HAVE_O2HEADERS
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR"));
   RecoStep myStep = RecoStep::TPCCompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCCompression;
@@ -201,7 +201,7 @@ int GPUChainTracking::RunTPCCompression()
     ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
-#endif
+//#endif
   return 0;
 }
 
@@ -209,7 +209,7 @@ int GPUChainTracking::RunTPCDecompression()
 {
   LOGP(info, "====== Decompression");
 
-#ifdef GPUCA_HAVE_O2HEADERS
+//#ifdef GPUCA_HAVE_O2HEADERS
  // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true
@@ -217,11 +217,16 @@ int GPUChainTracking::RunTPCDecompression()
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
   SetupGPUProcessor(&Decompressor, false);
+
   new (Decompressor.testP) unsigned int;
   *Decompressor.testP = 145;
+  LOGP(info,"==== mInputGPU pointer host: {} -- dev: {}", (void*)Decompressor.mInputGPU,(void*)DecompressorShadow.mInputGPU);
+
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
+
   LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP);
+
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
     this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
@@ -238,6 +243,7 @@ int GPUChainTracking::RunTPCDecompression()
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   if (mRec->IsGPU()) {
     runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
+
     AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
     WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
@@ -249,8 +255,6 @@ int GPUChainTracking::RunTPCDecompression()
     SynchronizeStream(0);
   }
   // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR"));
-#endif
+//#endif
   return 0;
-}
-
-
+}
\ No newline at end of file
diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl
index 2077a80fa68de..09b3136cfee5f 100644
--- a/cmake-build-debug/DartConfiguration.tcl
+++ b/cmake-build-debug/DartConfiguration.tcl
@@ -27,8 +27,8 @@ SubmitInactivityTimeout:
 NightlyStartTime: 00:00:00 EDT
 
 # Commands for the build/test/submit cycle
-ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2"
-MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/2203e709e1a54_CLion-233.11555.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}"
+ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2"
+MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}"
 DefaultCTestConfigurationType: Release
 
 # version control

From 9fd661f7c0df9d62a5cefabe196c7b2e555189af Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Mon, 4 Dec 2023 12:04:00 +0100
Subject: [PATCH 10/25] Correctly setted attributes on compressed clusters
 input for decompression

---
 .../DataCompression/GPUTPCDecompression.cxx   |  1 +
 .../DataCompression/GPUTPCDecompression.h     |  3 +-
 .../GPUTPCDecompressionKernels.cxx            |  3 ++
 .../Global/GPUChainTrackingCompression.cxx    | 33 +++++++++++++++++--
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 760e5174bd827..bfd2cc94ccca8 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -17,6 +17,7 @@
 #include "GPUReconstruction.h"
 #include "GPUO2DataTypes.h"
 #include "GPUMemorySizeScalers.h"
+#include "GPULogging.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 37cc883e1db9c..1df3d352ae1ec 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -45,6 +45,8 @@ class GPUTPCDecompression : public GPUProcessor
  public:
   unsigned int test = 42;
   unsigned int* testP = nullptr;
+  o2::tpc::CompressedClusters* mInputGPU = nullptr; // TODO: move to protected once test is assessed
+
 
 #ifndef GPUCA_GPUCODE
   void InitializeProcessor();
@@ -61,7 +63,6 @@ class GPUTPCDecompression : public GPUProcessor
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
 
-  o2::tpc::CompressedClusters* mInputGPU = nullptr;
 
   short mMemoryResInputGPU = -1;
 
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 3179cb6d5650b..cc7557dd6e523 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -28,6 +28,9 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::tes
   unsigned int x = decompressor.test;
   if (!iThread && !iBlock) {
     GPUInfo("==== Test: X={%d}, *testP = {%d} \n", x, *decompressor.testP);
+    GPUInfo("==== mInputGPU.nAttached on GPU: {%d} \n", decompressor.mInputGPU->nAttachedClusters);
+    GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU->nAttachedClusters,decompressor.mInputGPU->nUnattachedClusters,decompressor.mInputGPU->nTracks);
+
   }
 
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 2de1f8d08c649..326531587f75b 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -216,16 +216,40 @@ int GPUChainTracking::RunTPCDecompression()
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
-  SetupGPUProcessor(&Decompressor, false);
+
+  CompressedClusters c;
+  const CompressedClusters* p;
+  CompressedClusters* compressedClustersInputGPU = Decompressor.mInputGPU;
+  memset((void*)compressedClustersInputGPU, 0, sizeof(*compressedClustersInputGPU));
+  c = *mIOPtrs.tpcCompressedClusters;
+  p = &c;
+  compressedClustersInputGPU->nAttachedClusters = p->nAttachedClusters;
+  compressedClustersInputGPU->nUnattachedClusters = p->nUnattachedClusters;
+  compressedClustersInputGPU->nTracks = p->nTracks;
+  compressedClustersInputGPU->nAttachedClustersReduced = compressedClustersInputGPU->nAttachedClusters - compressedClustersInputGPU->nTracks;
+  compressedClustersInputGPU->nSliceRows = NSLICES * GPUCA_ROW_COUNT;
+  compressedClustersInputGPU->nComppressionModes = param().rec.tpc.compressionTypeMask;
+  compressedClustersInputGPU->solenoidBz = param().bzkG;
+  compressedClustersInputGPU->maxTimeBin = param().par.continuousMaxTimeBin;
+  SetupGPUProcessor(&Decompressor, true);
 
   new (Decompressor.testP) unsigned int;
   *Decompressor.testP = 145;
-  LOGP(info,"==== mInputGPU pointer host: {} -- dev: {}", (void*)Decompressor.mInputGPU,(void*)DecompressorShadow.mInputGPU);
 
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
+
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
 
   LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP);
+  unsigned int offset = 0;
+  char direction = 0;
+  int outputStream = 0;
+  //GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mInputGPU->nSliceRowClusters[0]), outputStream, direction);
+ // GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction);
+ /* for (int i = 0; i < compressedClustersInputGPU->nTracks; ++i) {
+    GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->qTotA + offset, p->qTotA + p.mAttachedClusterFirstIndex[i], compressedClustersInputGPU->nTrackClusters[i] * sizeof(DecompressorShadow.mInputGPU->qTotA[0]), outputStream, direction);
+    offset += O->nTrackClusters[i];
+  }*/
 
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
@@ -235,6 +259,9 @@ int GPUChainTracking::RunTPCDecompression()
   };
   auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
   gatherTimer.Start();
+
+  LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",p->nAttachedClusters,p->nUnattachedClusters,p->nTracks);
+
   if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) {
     GPUError("Error decompressing clusters");
     return 1;
@@ -242,7 +269,7 @@ int GPUChainTracking::RunTPCDecompression()
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   if (mRec->IsGPU()) {
-    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
+    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>({1,1,0}, krnlRunRangeNone, krnlEventNone);
 
     AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;

From 12c85d5ed3c7843ea97c2f73cb8d3b62cf72c611 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Mon, 4 Dec 2023 19:14:56 +0100
Subject: [PATCH 11/25] successfully copied nSliceRowClusters from host to gpu
 through context

---
 .../DataCompression/GPUTPCDecompression.cxx   |  6 ++---
 .../DataCompression/GPUTPCDecompression.h     |  2 +-
 .../GPUTPCDecompressionKernels.cxx            |  5 ++--
 .../Global/GPUChainTrackingCompression.cxx    | 24 +++++++++----------
 4 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index bfd2cc94ccca8..24c0290675ad9 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -25,7 +25,7 @@ void GPUTPCDecompression::InitializeProcessor() {}
 
 void* GPUTPCDecompression::SetPointersInputGPU(void* mem)
 {
-  SetPointersCompressedClusters(mem, *mInputGPU, mInputGPU->nAttachedClusters, mInputGPU->nTracks, mInputGPU->nUnattachedClusters, true);
+  SetPointersCompressedClusters(mem, mInputGPU, mInputGPU.nAttachedClusters, mInputGPU.nTracks, mInputGPU.nUnattachedClusters, true);
   return mem;
 }
 
@@ -67,14 +67,14 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
 
 void* GPUTPCDecompression::SetPointersMemory(void* mem)
 {
-  computePointerWithAlignment(mem, mInputGPU);
+  //computePointerWithAlignment(mem, mInputGPU);
   computePointerWithAlignment(mem, testP);
   return mem;
 }
 
 void GPUTPCDecompression::RegisterMemoryAllocation() {
   AllocateAndInitializeLate();
-  mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT, "TPCDecompressionInput");
+  mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU, "TPCDecompressionInput");
   mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory");
 }
 
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 1df3d352ae1ec..5eb24fb3056f3 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -45,7 +45,7 @@ class GPUTPCDecompression : public GPUProcessor
  public:
   unsigned int test = 42;
   unsigned int* testP = nullptr;
-  o2::tpc::CompressedClusters* mInputGPU = nullptr; // TODO: move to protected once test is assessed
+  o2::tpc::CompressedClusters mInputGPU; // TODO: move to protected once test is assessed
 
 
 #ifndef GPUCA_GPUCODE
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index cc7557dd6e523..dcfc2712fb6fd 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -28,9 +28,8 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::tes
   unsigned int x = decompressor.test;
   if (!iThread && !iBlock) {
     GPUInfo("==== Test: X={%d}, *testP = {%d} \n", x, *decompressor.testP);
-    GPUInfo("==== mInputGPU.nAttached on GPU: {%d} \n", decompressor.mInputGPU->nAttachedClusters);
-    GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU->nAttachedClusters,decompressor.mInputGPU->nUnattachedClusters,decompressor.mInputGPU->nTracks);
-
+    GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU.nAttachedClusters,decompressor.mInputGPU.nUnattachedClusters,decompressor.mInputGPU.nTracks);
+    GPUInfo("===== nSliceRowClusters[1] on gpu = %d",decompressor.mInputGPU.nSliceRowClusters[1]);
   }
 
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 326531587f75b..c29aa3f1860d9 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -219,18 +219,17 @@ int GPUChainTracking::RunTPCDecompression()
 
   CompressedClusters c;
   const CompressedClusters* p;
-  CompressedClusters* compressedClustersInputGPU = Decompressor.mInputGPU;
-  memset((void*)compressedClustersInputGPU, 0, sizeof(*compressedClustersInputGPU));
+  CompressedClusters& compressedClustersInputGPU = Decompressor.mInputGPU;
   c = *mIOPtrs.tpcCompressedClusters;
   p = &c;
-  compressedClustersInputGPU->nAttachedClusters = p->nAttachedClusters;
-  compressedClustersInputGPU->nUnattachedClusters = p->nUnattachedClusters;
-  compressedClustersInputGPU->nTracks = p->nTracks;
-  compressedClustersInputGPU->nAttachedClustersReduced = compressedClustersInputGPU->nAttachedClusters - compressedClustersInputGPU->nTracks;
-  compressedClustersInputGPU->nSliceRows = NSLICES * GPUCA_ROW_COUNT;
-  compressedClustersInputGPU->nComppressionModes = param().rec.tpc.compressionTypeMask;
-  compressedClustersInputGPU->solenoidBz = param().bzkG;
-  compressedClustersInputGPU->maxTimeBin = param().par.continuousMaxTimeBin;
+  compressedClustersInputGPU.nAttachedClusters = p->nAttachedClusters;
+  compressedClustersInputGPU.nUnattachedClusters = p->nUnattachedClusters;
+  compressedClustersInputGPU.nTracks = p->nTracks;
+  compressedClustersInputGPU.nAttachedClustersReduced = compressedClustersInputGPU.nAttachedClusters - compressedClustersInputGPU.nTracks;
+  compressedClustersInputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT;
+  compressedClustersInputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask;
+  compressedClustersInputGPU.solenoidBz = param().bzkG;
+  compressedClustersInputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
   SetupGPUProcessor(&Decompressor, true);
 
   new (Decompressor.testP) unsigned int;
@@ -242,9 +241,10 @@ int GPUChainTracking::RunTPCDecompression()
 
   LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP);
   unsigned int offset = 0;
-  char direction = 0;
+  char direction = 1;
   int outputStream = 0;
-  //GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mInputGPU->nSliceRowClusters[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU.nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(p->nSliceRowClusters[0]), outputStream, direction);
+  LOGP(info,"===== nSliceRowClusters[1] on host = {}",p->nSliceRowClusters[1]);
  // GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction);
  /* for (int i = 0; i < compressedClustersInputGPU->nTracks; ++i) {
     GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->qTotA + offset, p->qTotA + p.mAttachedClusterFirstIndex[i], compressedClustersInputGPU->nTrackClusters[i] * sizeof(DecompressorShadow.mInputGPU->qTotA[0]), outputStream, direction);

From bc8b2d0f99e9d8e7270c9049bc5d36e872fa3545 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Tue, 5 Dec 2023 15:31:27 +0100
Subject: [PATCH 12/25] input completely copied from host to device
 (correctness of content has to be checked)

---
 .../Global/GPUChainTrackingCompression.cxx    | 71 +++++++++++++------
 1 file changed, 49 insertions(+), 22 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index c29aa3f1860d9..f0023931916ee 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -217,19 +217,16 @@ int GPUChainTracking::RunTPCDecompression()
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
 
-  CompressedClusters c;
-  const CompressedClusters* p;
-  CompressedClusters& compressedClustersInputGPU = Decompressor.mInputGPU;
-  c = *mIOPtrs.tpcCompressedClusters;
-  p = &c;
-  compressedClustersInputGPU.nAttachedClusters = p->nAttachedClusters;
-  compressedClustersInputGPU.nUnattachedClusters = p->nUnattachedClusters;
-  compressedClustersInputGPU.nTracks = p->nTracks;
-  compressedClustersInputGPU.nAttachedClustersReduced = compressedClustersInputGPU.nAttachedClusters - compressedClustersInputGPU.nTracks;
-  compressedClustersInputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT;
-  compressedClustersInputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask;
-  compressedClustersInputGPU.solenoidBz = param().bzkG;
-  compressedClustersInputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
+  CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters;
+  CompressedClusters& inputGPU = Decompressor.mInputGPU;
+  inputGPU.nAttachedClusters = cmprClsHost.nAttachedClusters;
+  inputGPU.nUnattachedClusters = cmprClsHost.nUnattachedClusters;
+  inputGPU.nTracks = cmprClsHost.nTracks;
+  inputGPU.nAttachedClustersReduced = inputGPU.nAttachedClusters - inputGPU.nTracks;
+  inputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT;
+  inputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask;
+  inputGPU.solenoidBz = param().bzkG;
+  inputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
   SetupGPUProcessor(&Decompressor, true);
 
   new (Decompressor.testP) unsigned int;
@@ -238,18 +235,48 @@ int GPUChainTracking::RunTPCDecompression()
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
 
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
-
+  LOGP(info,"==== nSliceRowClusters[1] on host: {}",cmprClsHost.nSliceRowClusters[1]);
   LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP);
   unsigned int offset = 0;
   char direction = 1;
   int outputStream = 0;
-  GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU.nSliceRowClusters, p->nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(p->nSliceRowClusters[0]), outputStream, direction);
-  LOGP(info,"===== nSliceRowClusters[1] on host = {}",p->nSliceRowClusters[1]);
- // GPUMemCpyAlways(myStep, O->nTrackClusters, P->nTrackClusters, O->nTracks * sizeof(O->nTrackClusters[0]), outputStream, direction);
- /* for (int i = 0; i < compressedClustersInputGPU->nTracks; ++i) {
-    GPUMemCpyAlways(myStep, DecompressorShadow.mInputGPU->qTotA + offset, p->qTotA + p.mAttachedClusterFirstIndex[i], compressedClustersInputGPU->nTrackClusters[i] * sizeof(DecompressorShadow.mInputGPU->qTotA[0]), outputStream, direction);
-    offset += O->nTrackClusters[i];
-  }*/
+  CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
+  GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction);
+  SynchronizeStream(outputStream);
+  for (unsigned int i = 0; i < NSLICES; i++) {
+    for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
+      GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction);
+      GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU + offset, cmprClsHost.qMaxU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qMaxU[0]), outputStream, direction);
+      GPUMemCpyAlways(myStep, inputGPUShadow.flagsU + offset, cmprClsHost.flagsU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.flagsU[0]), outputStream, direction);
+      GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU + offset, cmprClsHost.padDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.padDiffU[0]), outputStream, direction);
+      GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU + offset, cmprClsHost.timeDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.timeDiffU[0]), outputStream, direction);
+      GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU + offset, cmprClsHost.sigmaPadU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, direction);
+      GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU + offset, cmprClsHost.sigmaTimeU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, direction);
+      offset += cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
+    }
+  }
+
+  offset = 0;
+  for (unsigned int i = 0; i < inputGPUShadow.nTracks; i++) {
+    GPUMemCpyAlways(myStep, inputGPUShadow.qTotA + offset, cmprClsHost.qTotA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qTotA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA + offset, cmprClsHost.qMaxA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qMaxA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.flagsA + offset, cmprClsHost.flagsA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.flagsA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA + offset, cmprClsHost.sigmaPadA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA + offset, cmprClsHost.sigmaTimeA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, direction);
+
+    // First index stored with track
+    GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA + offset - i, cmprClsHost.rowDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA + offset - i, cmprClsHost.sliceLegDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.padResA + offset - i, cmprClsHost.padResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.padResA[0]), outputStream, direction);
+    GPUMemCpyAlways(myStep, inputGPUShadow.timeResA + offset - i, cmprClsHost.timeResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.timeResA[0]), outputStream, direction);
+    offset += cmprClsHost.nTrackClusters[i];
+  }
+  GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, direction);
 
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
@@ -260,7 +287,7 @@ int GPUChainTracking::RunTPCDecompression()
   auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
   gatherTimer.Start();
 
-  LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",p->nAttachedClusters,p->nUnattachedClusters,p->nTracks);
+  LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",cmprClsHost.nAttachedClusters,cmprClsHost.nUnattachedClusters,cmprClsHost.nTracks);
 
   if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) {
     GPUError("Error decompressing clusters");

From ac02a0bbdcc2d8509c2c60d8373c4aec59553819 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Tue, 5 Dec 2023 17:12:58 +0100
Subject: [PATCH 13/25] addedo MEMORY::CUSTOM to GPUInput and single GPUMemCpy
 for whole input

---
 .../DataCompression/GPUTPCDecompression.cxx   | 10 +---------
 .../DataCompression/GPUTPCDecompression.h     | 13 +++----------
 .../GPUTPCDecompressionKernels.cxx            |  2 --
 .../Global/GPUChainTrackingCompression.cxx    | 19 +++++++++----------
 4 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 24c0290675ad9..03595069a52f0 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -65,17 +65,9 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
   computePointerWithAlignment(mem, c.nTrackClusters, nTr);
 }
 
-void* GPUTPCDecompression::SetPointersMemory(void* mem)
-{
-  //computePointerWithAlignment(mem, mInputGPU);
-  computePointerWithAlignment(mem, testP);
-  return mem;
-}
-
 void GPUTPCDecompression::RegisterMemoryAllocation() {
   AllocateAndInitializeLate();
-  mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU, "TPCDecompressionInput");
-  mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersMemory, GPUMemoryResource::MEMORY_PERMANENT, "TPCDecompressionMemory");
+  mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 5eb24fb3056f3..42bbb6390f5db 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -39,15 +39,10 @@ namespace GPUCA_NAMESPACE::gpu
 
 class GPUTPCDecompression : public GPUProcessor
 {
-  friend class GPUTPCDecmpressionKernels;
+  friend class GPUTPCDecompressionKernels;
   friend class GPUChainTracking;
 
  public:
-  unsigned int test = 42;
-  unsigned int* testP = nullptr;
-  o2::tpc::CompressedClusters mInputGPU; // TODO: move to protected once test is assessed
-
-
 #ifndef GPUCA_GPUCODE
   void InitializeProcessor();
   void RegisterMemoryAllocation();
@@ -59,13 +54,11 @@ class GPUTPCDecompression : public GPUProcessor
 
  protected:
   constexpr static unsigned int NSLICES = GPUCA_NSLICES;
-
+  o2::tpc::CompressedClusters mInputGPU;
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
 
-
   short mMemoryResInputGPU = -1;
-
 };
-}
+} // namespace GPUCA_NAMESPACE::gpu
 #endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index dcfc2712fb6fd..c49e62c34b581 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -25,9 +25,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::tes
 {
   GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
-  unsigned int x = decompressor.test;
   if (!iThread && !iBlock) {
-    GPUInfo("==== Test: X={%d}, *testP = {%d} \n", x, *decompressor.testP);
     GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU.nAttachedClusters,decompressor.mInputGPU.nUnattachedClusters,decompressor.mInputGPU.nTracks);
     GPUInfo("===== nSliceRowClusters[1] on gpu = %d",decompressor.mInputGPU.nSliceRowClusters[1]);
   }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index f0023931916ee..3faa25e7f736a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -84,7 +84,6 @@ int GPUChainTracking::RunTPCCompression()
     SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes.
     outputStream = OutputStream();
   }
-
   if (ProcessingSettings().tpcCompressionGatherMode >= 2) {
     if (ProcessingSettings().tpcCompressionGatherMode == 2) {
       void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat);
@@ -229,21 +228,21 @@ int GPUChainTracking::RunTPCDecompression()
   inputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
   SetupGPUProcessor(&Decompressor, true);
 
-  new (Decompressor.testP) unsigned int;
-  *Decompressor.testP = 145;
-
+  size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU);
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
-
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
+
   LOGP(info,"==== nSliceRowClusters[1] on host: {}",cmprClsHost.nSliceRowClusters[1]);
-  LOGP(info, "==== Decompressor testP: {}, DecompressorShadow testP: {}",(void*)Decompressor.testP,(void*)DecompressorShadow.testP);
+
   unsigned int offset = 0;
   char direction = 1;
   int outputStream = 0;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
-  GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction);
-  GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction);
-  SynchronizeStream(outputStream);
+  char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
+  GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
+  //GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction);
+  //GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction);
+  SynchronizeStream(outputStream);/*
   for (unsigned int i = 0; i < NSLICES; i++) {
     for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
       GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction);
@@ -277,7 +276,7 @@ int GPUChainTracking::RunTPCDecompression()
   GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, direction);
   GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, direction);
   GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, direction);
-
+*/
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
     this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;

From 47e7e3430ceabf49fd33cfa8daaceb3cd827e80d Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Tue, 5 Dec 2023 17:46:15 +0100
Subject: [PATCH 14/25] refactoring and tested correctness of GPUMemCpy from
 host to device for decompression input

---
 .../DataCompression/GPUTPCDecompressionKernels.cxx |  1 -
 .../Global/GPUChainTrackingCompression.cxx         | 14 +++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index c49e62c34b581..65c6fc0e6587c 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -27,7 +27,6 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::tes
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   if (!iThread && !iBlock) {
     GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU.nAttachedClusters,decompressor.mInputGPU.nUnattachedClusters,decompressor.mInputGPU.nTracks);
-    GPUInfo("===== nSliceRowClusters[1] on gpu = %d",decompressor.mInputGPU.nSliceRowClusters[1]);
   }
 
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 3faa25e7f736a..60e757e66683e 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -232,17 +232,17 @@ int GPUChainTracking::RunTPCDecompression()
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
 
-  LOGP(info,"==== nSliceRowClusters[1] on host: {}",cmprClsHost.nSliceRowClusters[1]);
-
-  unsigned int offset = 0;
-  char direction = 1;
   int outputStream = 0;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
   char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
   GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
-  //GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction);
-  //GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction);
-  SynchronizeStream(outputStream);/*
+  SynchronizeStream(outputStream);
+  /*
+  unsigned int offset = 0;
+  char direction = 1;
+  GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction);
+
   for (unsigned int i = 0; i < NSLICES; i++) {
     for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
       GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction);

From a62865dbe89e1bdbe0a343f2567bab13703f7e7b Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Thu, 7 Dec 2023 18:02:42 +0100
Subject: [PATCH 15/25] allocated output buffers for host and device

---
 GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 60e757e66683e..de7836e7fde0a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -237,6 +237,14 @@ int GPUChainTracking::RunTPCDecompression()
   char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
   GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
   SynchronizeStream(outputStream);
+
+  mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+  //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput);
+  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+  processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+  WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
+
+
   /*
   unsigned int offset = 0;
   char direction = 1;

From 60a53c5d390419a18a8bd9501b488907ccb8e3e8 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Mon, 11 Dec 2023 10:03:40 +0100
Subject: [PATCH 16/25] Decoding input from host to device made per internal
 buffer

---
 .../Global/GPUChainTrackingCompression.cxx    | 75 ++++++++-----------
 .../Standalone/Benchmark/standalone.cxx       |  1 +
 2 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index de7836e7fde0a..10f8565992815 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -234,56 +234,45 @@ int GPUChainTracking::RunTPCDecompression()
 
   int outputStream = 0;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
-  char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
-  GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
+  //char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
+  //GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
   SynchronizeStream(outputStream);
 
-  mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-  //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput);
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-  processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-  WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
+  unsigned int offset = 0;
+  bool toGPU = true;
 
+  GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.flagsU, cmprClsHost.flagsU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.flagsU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU, cmprClsHost.padDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.padDiffU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU, cmprClsHost.timeDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.timeDiffU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU);
 
-  /*
-  unsigned int offset = 0;
-  char direction = 1;
-  GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, direction);
-  GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, inputGPUShadow.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, direction);
+  GPUMemCpyAlways(myStep, inputGPUShadow.qTotA, cmprClsHost.qTotA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qTotA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA, cmprClsHost.qMaxA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qMaxA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.flagsA, cmprClsHost.flagsA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.flagsA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA, cmprClsHost.rowDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA, cmprClsHost.sliceLegDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.padResA, cmprClsHost.padResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.padResA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.timeResA, cmprClsHost.timeResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.timeResA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA, cmprClsHost.sigmaPadA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA, cmprClsHost.sigmaTimeA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, toGPU);
 
-  for (unsigned int i = 0; i < NSLICES; i++) {
-    for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
-      GPUMemCpyAlways(myStep, inputGPUShadow.qTotU + offset, cmprClsHost.qTotU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qTotU[0]), outputStream, direction);
-      GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU + offset, cmprClsHost.qMaxU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.qMaxU[0]), outputStream, direction);
-      GPUMemCpyAlways(myStep, inputGPUShadow.flagsU + offset, cmprClsHost.flagsU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.flagsU[0]), outputStream, direction);
-      GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU + offset, cmprClsHost.padDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.padDiffU[0]), outputStream, direction);
-      GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU + offset, cmprClsHost.timeDiffU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.timeDiffU[0]), outputStream, direction);
-      GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU + offset, cmprClsHost.sigmaPadU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, direction);
-      GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU + offset, cmprClsHost.sigmaTimeU + offset, cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, direction);
-      offset += cmprClsHost.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
-    }
-  }
+  GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU);
 
-  offset = 0;
-  for (unsigned int i = 0; i < inputGPUShadow.nTracks; i++) {
-    GPUMemCpyAlways(myStep, inputGPUShadow.qTotA + offset, cmprClsHost.qTotA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qTotA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA + offset, cmprClsHost.qMaxA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.qMaxA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.flagsA + offset, cmprClsHost.flagsA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.flagsA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA + offset, cmprClsHost.sigmaPadA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA + offset, cmprClsHost.sigmaTimeA + offset, cmprClsHost.nTrackClusters[i] * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, direction);
 
-    // First index stored with track
-    GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA + offset - i, cmprClsHost.rowDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA + offset - i, cmprClsHost.sliceLegDiffA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.padResA + offset - i, cmprClsHost.padResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.padResA[0]), outputStream, direction);
-    GPUMemCpyAlways(myStep, inputGPUShadow.timeResA + offset - i, cmprClsHost.timeResA + offset + 1, (cmprClsHost.nTrackClusters[i] - 1) * sizeof(cmprClsHost.timeResA[0]), outputStream, direction);
-    offset += cmprClsHost.nTrackClusters[i];
-  }
-  GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, direction);
-  GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, direction);
-  GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, direction);
-  GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, direction);
-  GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, direction);
+  mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+  /*
+  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput);
+  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+  processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+  WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
 */
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index 8df8094fe44d3..18fb7e9aeacea 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -948,6 +948,7 @@ int main(int argc, char** argv)
       printf("Error unregistering memory\n");
     }
   }
+  exit(0);
   rec->Exit();
 
   if (!configStandalone.noprompt) {

From 636af961502e34773c033813f222241b55780800 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Mon, 18 Dec 2023 10:16:20 +0100
Subject: [PATCH 17/25] Created tmp buffers to store native clusters per row
 and per slice

---
 GPU/GPUTracking/CMakeLists.txt                |  2 +-
 .../DataCompression/GPUTPCDecompression.cxx   | 24 ++++++++++++++++++-
 .../DataCompression/GPUTPCDecompression.h     | 18 +++++++++++++-
 .../GPUTPCDecompressionKernels.cxx            | 22 +++++++++++++++--
 .../Global/GPUChainTrackingCompression.cxx    |  7 +++---
 .../Standalone/Benchmark/standalone.cxx       |  2 +-
 6 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index db2d3f7c0accd..18a187bf5f64d 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -11,7 +11,7 @@
 
 set(MODULE GPUTracking)
 
-# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -O0") # to uncomment if needed, tired of typing this...
+set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
 
 include(cmake/helpers.cmake)
 
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 03595069a52f0..22efcea7f774d 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -65,11 +65,33 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
   computePointerWithAlignment(mem, c.nTrackClusters, nTr);
 }
 
+void* GPUTPCDecompression::SetPointersTmpNativeBuffers(void* mem){
+  computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT);
+  computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
+  //computePointerWithAlignment(mem,tmpBuffer,NSLICES * GPUCA_ROW_COUNT);
+  return mem;
+}
+
 void GPUTPCDecompression::RegisterMemoryAllocation() {
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput");
+  mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffers,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffers");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
+  mMaxNativeClustersPerBuffer = 81760;
+}
+/*
+GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer():
+mCmprClsBuffer{new o2::tpc::ClusterNative[mCapacity]},mIndex{0}
+{}
 
-}
\ No newline at end of file
+void GPUTPCDecompression::ConcurrentClusterNativeBuffer::push_back(tpc::ClusterNative cluster)
+{
+  if(mIndex == mCapacity){
+    //reallocate?
+    return;
+  }
+  unsigned int current = CAMath::AtomicAdd(mIndex, 1u);
+  mTmpNativeClusters[current] = cluster;
+}*/
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 42bbb6390f5db..1ec3638429629 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -19,6 +19,7 @@
 #include "GPUProcessor.h"
 #include "GPUCommonMath.h"
 #include "GPUParam.h"
+#include "GPUO2DataTypes.h"
 
 #ifdef GPUCA_HAVE_O2HEADERS
 #include "DataFormatsTPC/CompressedClusters.h"
@@ -49,16 +50,31 @@ class GPUTPCDecompression : public GPUProcessor
   void SetMaxData(const GPUTrackingInOutPointers& io);
 
   void* SetPointersInputGPU(void* mem);
-  void* SetPointersMemory(void* mem);
+  void* SetPointersTmpNativeBuffers(void* mem);
 #endif
 
  protected:
   constexpr static unsigned int NSLICES = GPUCA_NSLICES;
   o2::tpc::CompressedClusters mInputGPU;
+  unsigned int mMaxNativeClustersPerBuffer;
+  unsigned int* mNativeClustersIndex;
+  o2::tpc::ClusterNative* mTmpNativeClusters;
+/*  class ConcurrentClusterNativeBuffer{
+    size_t mIndex;
+    size_t mCapacity = 10;
+    o2::tpc::ClusterNative* mCmprClsBuffer;
+   public:
+    ConcurrentClusterNativeBuffer();
+    void push_back(ClusterNative cluster);
+  };*/
+//  ConcurrentClusterNativeBuffer* tmpBuffer;
+
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
 
   short mMemoryResInputGPU = -1;
+
+
 };
 } // namespace GPUCA_NAMESPACE::gpu
 #endif // GPUTPCDECOMPRESSION_H
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 65c6fc0e6587c..bc3426fed134c 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -23,10 +23,28 @@ using namespace o2::tpc;
 template <>
 GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::test>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
 {
-  GPUTPCCompression& GPUrestrict() compressor = processors.tpcCompressor;
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
+  const GPUParam& GPUrestrict() param = processors.param;
+
+  unsigned int offset = 0, lasti = 0;
+  const unsigned int maxTime = (param.par.continuousMaxTimeBin + 1) * ClusterNative::scaleTimePacked - 1;
+
+  for (unsigned int i = get_global_id(0); i < cmprClusters.nTracks; i += get_global_size(0)) {
+    while (lasti < i) {
+      offset += cmprClusters.nTrackClusters[lasti++];
+    }
+    lasti++;
+    //decompressTrack(clustersCompressed, param, maxTime, i, offset, clusters, locks);
+
+  }
   if (!iThread && !iBlock) {
-    GPUInfo("==== on GPU nAttCl = {%d}, nUnAttCl = {%d}, nTracks = {%d}",decompressor.mInputGPU.nAttachedClusters,decompressor.mInputGPU.nUnattachedClusters,decompressor.mInputGPU.nTracks);
+    GPUInfo("==== on GPU nAttCl = %d, nUnAttCl = %d, nTracks = %d",cmprClusters.nAttachedClusters,cmprClusters.nUnattachedClusters,cmprClusters.nTracks);
+    GPUInfo("=== sizeof(CluserNative) = %lu", sizeof(ClusterNative));
+    /*int * test = new int[10];
+    test[0] = 1;
+    GPUInfo("==== got it %p -- %d",(void*)test,test[0]);
+    delete[] test;*/
   }
 
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 10f8565992815..4206ea8bbf5f7 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -233,13 +233,12 @@ int GPUChainTracking::RunTPCDecompression()
   TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
 
   int outputStream = 0;
+  bool toGPU = true;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
   //char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
   //GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
   SynchronizeStream(outputStream);
 
-  unsigned int offset = 0;
-  bool toGPU = true;
 
   GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU);
   GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU);
@@ -266,6 +265,8 @@ int GPUChainTracking::RunTPCDecompression()
   GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, toGPU);
   GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU);
 
+  runKernel<GPUMemClean16>(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0]));
+
 
   mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
   /*
@@ -292,7 +293,7 @@ int GPUChainTracking::RunTPCDecompression()
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   if (mRec->IsGPU()) {
-    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>({1,1,0}, krnlRunRangeNone, krnlEventNone);
+    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
 
     AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index 18fb7e9aeacea..04bf75f39bab3 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -948,7 +948,7 @@ int main(int argc, char** argv)
       printf("Error unregistering memory\n");
     }
   }
-  exit(0);
+  //exit(0);
   rec->Exit();
 
   if (!configStandalone.noprompt) {

From 2d82384f7ffdcbeb15cd56cbbfe553a8c65b634a Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Wed, 3 Jan 2024 22:34:43 +0100
Subject: [PATCH 18/25] Decoding of attached clusters on GPU

---
 .../DataCompression/GPUTPCDecompression.cxx   |  22 +++-
 .../DataCompression/GPUTPCDecompression.h     |  11 +-
 .../GPUTPCDecompressionKernels.cxx            | 118 ++++++++++++++++--
 .../GPUTPCDecompressionKernels.h              |  23 +++-
 .../TPCClusterDecompressor.cxx                |   6 +
 .../Definitions/GPUDefGPUParameters.h         |  12 +-
 .../Global/GPUChainTrackingCompression.cxx    |  47 +++++--
 7 files changed, 203 insertions(+), 36 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 22efcea7f774d..922bfc120d136 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -65,21 +65,33 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
   computePointerWithAlignment(mem, c.nTrackClusters, nTr);
 }
 
-void* GPUTPCDecompression::SetPointersTmpNativeBuffers(void* mem){
-  computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT);
+void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){
   computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
-  //computePointerWithAlignment(mem,tmpBuffer,NSLICES * GPUCA_ROW_COUNT);
+  //computePointerWithAlignment(mem,mClusterNativeAccess);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem){
+  computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem){
+  computePointerWithAlignment(mem,mUnattachedClustersOffsets,NSLICES * GPUCA_ROW_COUNT);
   return mem;
 }
 
 void GPUTPCDecompression::RegisterMemoryAllocation() {
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput");
-  mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffers,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffers");
+  mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersGPU,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffersGPU");
+  mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersOutput,GPUMemoryResource::MEMORY_OUTPUT,"TPCDecompressionTmpBuffersOutput");
+  mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersInput,GPUMemoryResource::MEMORY_INPUT,"TPCDecompressionTmpBuffersInput");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
-  mMaxNativeClustersPerBuffer = 81760;
+  //mMaxNativeClustersPerBuffer = 81760;
+  mMaxNativeClustersPerBuffer = 12000;
 }
 /*
 GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer():
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 1ec3638429629..6d3225088ed56 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -50,15 +50,21 @@ class GPUTPCDecompression : public GPUProcessor
   void SetMaxData(const GPUTrackingInOutPointers& io);
 
   void* SetPointersInputGPU(void* mem);
-  void* SetPointersTmpNativeBuffers(void* mem);
+  void* SetPointersTmpNativeBuffersGPU(void* mem);
+  void* SetPointersTmpNativeBuffersOutput(void* mem);
+  void* SetPointersTmpNativeBuffersInput(void* mem);
+
 #endif
 
  protected:
   constexpr static unsigned int NSLICES = GPUCA_NSLICES;
   o2::tpc::CompressedClusters mInputGPU;
+
   unsigned int mMaxNativeClustersPerBuffer;
   unsigned int* mNativeClustersIndex;
+  unsigned int* mUnattachedClustersOffsets;
   o2::tpc::ClusterNative* mTmpNativeClusters;
+  o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
 /*  class ConcurrentClusterNativeBuffer{
     size_t mIndex;
     size_t mCapacity = 10;
@@ -73,7 +79,8 @@ class GPUTPCDecompression : public GPUProcessor
   void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
 
   short mMemoryResInputGPU = -1;
-
+  short mResourceTmpIndexes = -1;
+  short mResourceTmpClustersOffsets = -1;
 
 };
 } // namespace GPUCA_NAMESPACE::gpu
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index bc3426fed134c..6867286d25e64 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -15,14 +15,13 @@
 #include "GPUTPCDecompressionKernels.h"
 #include "GPULogging.h"
 #include "GPUConstantMem.h"
-
+#include "GPUTPCCompressionTrackModel.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
 
 template <>
-GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::test>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
-{
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step0attached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   const GPUParam& GPUrestrict() param = processors.param;
@@ -35,16 +34,109 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::tes
       offset += cmprClusters.nTrackClusters[lasti++];
     }
     lasti++;
-    //decompressTrack(clustersCompressed, param, maxTime, i, offset, clusters, locks);
-
+    decompressTrack(cmprClusters, param, maxTime, i, offset, decompressor);
   }
-  if (!iThread && !iBlock) {
-    GPUInfo("==== on GPU nAttCl = %d, nUnAttCl = %d, nTracks = %d",cmprClusters.nAttachedClusters,cmprClusters.nUnattachedClusters,cmprClusters.nTracks);
-    GPUInfo("=== sizeof(CluserNative) = %lu", sizeof(ClusterNative));
-    /*int * test = new int[10];
-    test[0] = 1;
-    GPUInfo("==== got it %p -- %d",(void*)test,test[0]);
-    delete[] test;*/
+}
+
+GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor){
+  float zOffset = 0;
+  unsigned int slice = cmprClusters.sliceA[trackIndex];
+  unsigned int row = cmprClusters.rowA[trackIndex];
+  GPUTPCCompressionTrackModel track;
+  unsigned int clusterIndex;
+  for(clusterIndex = 0; clusterIndex < cmprClusters.nTrackClusters[trackIndex]; clusterIndex++){
+    unsigned int pad = 0, time = 0;
+    if(clusterIndex != 0){
+      unsigned char tmpSlice = cmprClusters.sliceLegDiffA[clusterOffset - trackIndex -1];
+      bool changeLeg = (tmpSlice >= GPUCA_NSLICES);
+      if(changeLeg){
+        tmpSlice -= GPUCA_NSLICES;
+      }
+      if(cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences){
+        slice += tmpSlice;
+        if(slice >= GPUCA_NSLICES){
+          slice -= GPUCA_NSLICES;
+        }
+        row += cmprClusters.rowDiffA[clusterOffset -trackIndex -1];
+        if(row >= GPUCA_ROW_COUNT){
+          row -= GPUCA_ROW_COUNT;
+        }
+      } else {
+        slice = tmpSlice;
+        row = cmprClusters.rowDiffA[clusterOffset -trackIndex -1];
+      }
+      if (changeLeg && track.Mirror()) {
+        break;
+      }
+      if (track.Propagate(param.tpcGeometry.Row2X(row),param.SliceParam[slice].Alpha)){
+        break;
+      }
+      unsigned int timeTmp = cmprClusters.timeResA[clusterOffset -trackIndex -1];
+      if (timeTmp & 800000) {
+        timeTmp |= 0xFF000000;
+      }
+      time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f,param.tpcGeometry.LinearZ2Time(slice,track.Z() + zOffset)));
+      float tmpPad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), param.tpcGeometry.LinearY2Pad(slice, row, track.Y())));
+      pad = cmprClusters.padResA[clusterOffset -trackIndex - 1] + ClusterNative::packPad(tmpPad);
+      time = time & 0xFFFFFF;
+      pad = (unsigned short)pad;
+      if (pad >= param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked) {
+        if (pad >= 0xFFFF - 11968) { // Constant 11968 = (2^15 - MAX_PADS(138) * scalePadPacked(64)) / 2
+          pad = 0;
+        } else {
+          pad = param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked - 1;
+        }
+      }
+      if (param.par.continuousMaxTimeBin > 0 && time >= maxTime) {
+        if (time >= 0xFFFFFF - 544768) { // Constant 544768 = (2^23 - LHCMAXBUNCHES(3564) * MAXORBITS(256) * scaleTimePacked(64) / BCPERTIMEBIN(8)) / 2)
+          time = 0;
+        } else {
+          time = maxTime;
+        }
+      }
+    } else {
+      time = cmprClusters.timeA[trackIndex];
+      pad = cmprClusters.padA[trackIndex];
+    }
+    bool stored;
+    const auto cluster = decompressTrackStore(cmprClusters, clusterOffset, slice, row, pad, time, decompressor, stored);
+    float y = param.tpcGeometry.LinearPad2Y(slice, row, cluster.getPad());
+    float z = param.tpcGeometry.LinearTime2Z(slice, cluster.getTime());
+    if(clusterIndex == 0){
+      zOffset = z;
+      track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex],param);
+    }
+    if(clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y,z-zOffset,row)){
+      break;
+    }
+    clusterOffset++;
   }
+  clusterOffset += cmprClusters.nTrackClusters[trackIndex] - clusterIndex;
+}
 
-}
\ No newline at end of file
+GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored){
+  unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
+  unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row),1u);
+  const ClusterNative c(time, cmprClusters.flagsA[clusterOffset], pad, cmprClusters.sigmaTimeA[clusterOffset], cmprClusters.sigmaPadA[clusterOffset], cmprClusters.qMaxA[clusterOffset], cmprClusters.qTotA[clusterOffset]);
+  stored = currentClusterIndex < decompressor.mMaxNativeClustersPerBuffer;
+  if(stored){
+    decompressor.mTmpNativeClusters[tmpBufferIndex + currentClusterIndex] = c;
+  }
+  return c;
+}
+/*
+template <>
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::prepareAccess>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
+  for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){
+    GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+    CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
+    o2::tpc::ClusterNativeAccess& clustersNative = *decompressor.mClusterNativeAccess;
+    unsigned int slice = i / GPUCA_ROW_COUNT;
+    unsigned int row = i % GPUCA_ROW_COUNT;
+    unsigned int unattachedOffset = (i >= cmprClusters.nSliceRows) ? 0 : cmprClusters.nSliceRowClusters[i];
+    (clustersNative.nClusters)[slice][row] = decompressor.mNativeClustersIndex[i] + unattachedOffset;
+    for(unsigned int j = i+1; j < GPUCA_NSLICES * GPUCA_ROW_COUNT; j++){
+      CAMath::AtomicAdd(decompressor.mUnattachedClustersOffsets + j,unattachedOffset);
+    }
+  }
+}*/
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index 6be320026866a..3fd235390c6a7 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -16,6 +16,19 @@
 #define GPUTPCDECOMPRESSIONKERNELS_H
 
 #include "GPUGeneralKernels.h"
+#include "GPUO2DataTypes.h"
+#include "GPUParam.h"
+#include "GPUConstantMem.h"
+
+#ifdef GPUCA_HAVE_O2HEADERS
+#include "DataFormatsTPC/CompressedClusters.h"
+#else
+namespace o2::tpc
+{
+struct CompressedClusters {
+};
+} // namespace o2::tpc
+#endif
 
 namespace GPUCA_NAMESPACE::gpu
 {
@@ -26,11 +39,19 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
   GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCDecompression; }
 
   enum K : int {
-    test = 0,
+    step0attached = 0,
+    step1unattached = 1,
+    prepareAccess = 2
   };
 
   template <int iKernel = defaultKernel>
   GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);
+  GPUd() static void decompressTrack(o2::tpc::CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor);
+  GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored);
+
+  GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){
+    return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer;
+  }
 };
 
 }
diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
index b8f491a6f5767..bd6714433161a 100644
--- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
+++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
@@ -77,6 +77,12 @@ int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompres
       decodedAttachedClusters += clusters[i][j].size();
     }
   }
+  unsigned int avgAttCl = 0;
+  for(unsigned int i = 0; i < NSLICES; i++)
+    for (unsigned int j=0; j < GPUCA_ROW_COUNT; j++)
+      avgAttCl += clusters[i][j].size();
+  avgAttCl/=(NSLICES*GPUCA_ROW_COUNT);
+  LOGP(info,"===== avg on Host: {}",avgAttCl);
   if (decodedAttachedClusters != clustersCompressed->nAttachedClusters) {
     GPUWarning("%u / %u clusters failed track model decoding (%f %%)", clustersCompressed->nAttachedClusters - decodedAttachedClusters, clustersCompressed->nAttachedClusters, 100.f * (float)(clustersCompressed->nAttachedClusters - decodedAttachedClusters) / (float)clustersCompressed->nAttachedClusters);
   }
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index d88fc055099fb..b3cc4d8dfe29f 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -71,7 +71,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
-  #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -136,7 +136,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
-  #define GPUCA_LB_GPUTPCDecompressionKernels_test 192, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 192, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -200,7 +200,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
-  #define GPUCA_LB_GPUTPCDecompressionKernels_test 64, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448
@@ -265,7 +265,7 @@
   #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256
   #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128
-  #define GPUCA_LB_GPUTPCDecompressionKernels_test 128
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
@@ -326,8 +326,8 @@
   #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached
     #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256
   #endif
-  #ifndef GPUCA_LB_GPUTPCDecompressionKernels_test
-    #define GPUCA_LB_GPUTPCDecompressionKernels_test 256
+  #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached
+    #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 256
   #endif
   #ifndef GPUCA_LB_GPUTPCCFDecodeZS
     #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 4206ea8bbf5f7..1198d9696f417 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -230,15 +230,15 @@ int GPUChainTracking::RunTPCDecompression()
 
   size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU);
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
-  TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
+  //TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
 
   int outputStream = 0;
   bool toGPU = true;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
-  //char* deviceFlatPts = (char*)inputGPUShadow.qTotU;
-  //GPUMemCpy(myStep, deviceFlatPts, cmprClsHost.qTotU, copySize, outputStream, true);
   SynchronizeStream(outputStream);
 
+  GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU);
+  GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, cmprClsHost.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, toGPU);
 
   GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU);
   GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU);
@@ -266,19 +266,50 @@ int GPUChainTracking::RunTPCDecompression()
   GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU);
 
   runKernel<GPUMemClean16>(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0]));
-
+  runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step0attached>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
 
   mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-  /*
+
   AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput);
   AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+  TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, 0);
+  LOGP(info,"==== Host {} -- Shadow {}",(void*)Decompressor.mNativeClustersIndex,(void*)DecompressorShadow.mNativeClustersIndex);
+  SynchronizeStream(0);
+
+  unsigned int offset = 0;
+  unsigned int decodedAttachedClusters = 0;
+  for (unsigned int i = 0; i < NSLICES; i++) {
+    for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
+      unsigned int linearIndex = i * GPUCA_ROW_COUNT + j;
+      unsigned int unattachedOffset = (linearIndex >= cmprClsHost.nSliceRows) ? 0 : cmprClsHost.nSliceRowClusters[linearIndex];
+      (mClusterNativeAccess->nClusters)[i][j] = Decompressor.mNativeClustersIndex[linearIndex] + unattachedOffset;
+      Decompressor.mUnattachedClustersOffsets[linearIndex] = offset;
+      offset += unattachedOffset;
+      decodedAttachedClusters += Decompressor.mNativeClustersIndex[linearIndex];
+    }
+  }
+  LOGP(info,"decoded = {}",decodedAttachedClusters);
+  if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
+         GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
+       }/*
+  mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+  mClusterNativeAccess->setOffsetPtrs();
+  mIOPtrs.clustersNative = mClusterNativeAccess.get();
+  *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
   processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
   WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
+  TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0);
+  mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
+  mClusterNativeAccess->setOffsetPtrs();
+  *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
+  processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess;
 */
+
+
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
     this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
-    this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    //this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
     return this->mInputsHost->mPclusterNativeOutput;
   };
   auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
@@ -293,9 +324,7 @@ int GPUChainTracking::RunTPCDecompression()
   gatherTimer.Stop();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   if (mRec->IsGPU()) {
-    runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::test>(GetGridAuto(0), krnlRunRangeNone, krnlEventNone);
-
-    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
     processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
     WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
     *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;

From eee0fa513787d51dadf1ea3f24269697bd77c4b6 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Thu, 4 Jan 2024 12:28:34 +0100
Subject: [PATCH 19/25] removed debugging info in CPU decompressor

---
 .../DataCompression/TPCClusterDecompressor.cxx            | 6 ------
 GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx    | 8 +++++---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
index bd6714433161a..b8f491a6f5767 100644
--- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
+++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx
@@ -77,12 +77,6 @@ int TPCClusterDecompressor::decompress(const CompressedClusters* clustersCompres
       decodedAttachedClusters += clusters[i][j].size();
     }
   }
-  unsigned int avgAttCl = 0;
-  for(unsigned int i = 0; i < NSLICES; i++)
-    for (unsigned int j=0; j < GPUCA_ROW_COUNT; j++)
-      avgAttCl += clusters[i][j].size();
-  avgAttCl/=(NSLICES*GPUCA_ROW_COUNT);
-  LOGP(info,"===== avg on Host: {}",avgAttCl);
   if (decodedAttachedClusters != clustersCompressed->nAttachedClusters) {
     GPUWarning("%u / %u clusters failed track model decoding (%f %%)", clustersCompressed->nAttachedClusters - decodedAttachedClusters, clustersCompressed->nAttachedClusters, 100.f * (float)(clustersCompressed->nAttachedClusters - decodedAttachedClusters) / (float)clustersCompressed->nAttachedClusters);
   }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 1198d9696f417..e3bc029e9a469 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -273,7 +273,6 @@ int GPUChainTracking::RunTPCDecompression()
   AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput);
   AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
   TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, 0);
-  LOGP(info,"==== Host {} -- Shadow {}",(void*)Decompressor.mNativeClustersIndex,(void*)DecompressorShadow.mNativeClustersIndex);
   SynchronizeStream(0);
 
   unsigned int offset = 0;
@@ -291,9 +290,12 @@ int GPUChainTracking::RunTPCDecompression()
   LOGP(info,"decoded = {}",decodedAttachedClusters);
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
          GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
-       }/*
+  } else {
+         GPUInfo("My version: all attached clusters have been decoded");
+  }
+
   mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
-  mClusterNativeAccess->setOffsetPtrs();
+  mClusterNativeAccess->setOffsetPtrs();/*
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
   processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;

From 5b48d25bc82dab3ef992e63ff129e56e62af14c6 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Fri, 5 Jan 2024 10:28:16 +0100
Subject: [PATCH 20/25] kernel for unattached clusters (not working)

---
 .../GPUTPCDecompressionKernels.cxx            | 34 +++++++++++++++++++
 .../Definitions/GPUDefGPUParameters.h         |  7 ++++
 .../Global/GPUChainTrackingCompression.cxx    |  5 +--
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 6867286d25e64..cd68af30f844c 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -124,6 +124,40 @@ GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2
   }
   return c;
 }
+
+template <>
+GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  const ClusterNative* GPUrestrict() clusterBuffer = processors.ioPtrs.clustersNative->clustersLinear;
+  unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
+  for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){
+    unsigned int slice = i / GPUCA_ROW_COUNT;
+    unsigned int row = i % GPUCA_ROW_COUNT;
+    unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
+    const ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row];
+    if (decompressor.mNativeClustersIndex[i] != 0) {
+      memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0]));
+    }
+    const ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i];
+    unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]);
+    //decompressHits(clustersCompressed, offsets[i][j], end, clout);
+    if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) {
+      for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) {
+        auto& cl = buffer[k];
+        float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins;
+        if (t < 0) {
+          t = 0;
+        }
+        if (processors.param.par.continuousMaxTimeBin > 0 && t > processors.param.par.continuousMaxTimeBin) {
+          t = processors.param.par.continuousMaxTimeBin;
+        }
+        cl.setTime(t);
+      }
+    }
+    //std::sort(buffer, buffer + clustersNative.nClusters[i][j]);
+  }
+
+}
 /*
 template <>
 GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::prepareAccess>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index b3cc4d8dfe29f..395bdec043331 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -72,6 +72,7 @@
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -137,6 +138,7 @@
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 192, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512
@@ -202,6 +204,7 @@
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 3
   #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448
   #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448
@@ -267,6 +270,7 @@
   #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128
   #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128
   #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2
+  #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2
   #define GPUCA_LB_COMPRESSION_GATHER 1024
   #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
   #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -329,6 +333,9 @@
   #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached
     #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 256
   #endif
+  #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step1unattached
+    #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 256
+  #endif
   #ifndef GPUCA_LB_GPUTPCCFDecodeZS
     #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4
   #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index e3bc029e9a469..4cf58e390b332 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -288,6 +288,7 @@ int GPUChainTracking::RunTPCDecompression()
     }
   }
   LOGP(info,"decoded = {}",decodedAttachedClusters);
+  TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression,Decompressor.mResourceTmpClustersOffsets,0);
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
          GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
   } else {
@@ -295,7 +296,7 @@ int GPUChainTracking::RunTPCDecompression()
   }
 
   mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
-  mClusterNativeAccess->setOffsetPtrs();/*
+  mClusterNativeAccess->setOffsetPtrs();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
   processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
@@ -305,8 +306,8 @@ int GPUChainTracking::RunTPCDecompression()
   mClusterNativeAccess->setOffsetPtrs();
   *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
   processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess;
-*/
 
+  runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({1,1,0}, krnlRunRangeNone, krnlEventNone);
 
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {

From 7634c61af6f9be7e17937eaa24e23c954aec94a3 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Tue, 9 Jan 2024 17:23:08 +0100
Subject: [PATCH 21/25] Added sorting algorithm

---
 .../DataCompression/GPUTPCDecompression.h     |  1 +
 .../GPUTPCDecompressionKernels.cxx            | 32 ++++++++++++++++---
 .../GPUTPCDecompressionKernels.h              |  1 +
 .../Global/GPUChainTrackingCompression.cxx    | 13 +++++++-
 4 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index 6d3225088ed56..e4b2cca65c007 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -65,6 +65,7 @@ class GPUTPCDecompression : public GPUProcessor
   unsigned int* mUnattachedClustersOffsets;
   o2::tpc::ClusterNative* mTmpNativeClusters;
   o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
+  o2::tpc::ClusterNative* mNativeClustersBuffer;
 /*  class ConcurrentClusterNativeBuffer{
     size_t mIndex;
     size_t mCapacity = 10;
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index cd68af30f844c..033a38391dec7 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -16,6 +16,7 @@
 #include "GPULogging.h"
 #include "GPUConstantMem.h"
 #include "GPUTPCCompressionTrackModel.h"
+#include "GPUCommonAlgorithm.h"
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
@@ -128,19 +129,20 @@ GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2
 template <>
 GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::step1unattached>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
-  const ClusterNative* GPUrestrict() clusterBuffer = processors.ioPtrs.clustersNative->clustersLinear;
+  CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
+  ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
   unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
   for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){
     unsigned int slice = i / GPUCA_ROW_COUNT;
     unsigned int row = i % GPUCA_ROW_COUNT;
     unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
-    const ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row];
+    ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row];
     if (decompressor.mNativeClustersIndex[i] != 0) {
       memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0]));
     }
-    const ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i];
+    ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i];
     unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]);
-    //decompressHits(clustersCompressed, offsets[i][j], end, clout);
+    decompressHits(cmprClusters, offsets[i], end, clout);
     if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) {
       for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) {
         auto& cl = buffer[k];
@@ -154,10 +156,30 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
         cl.setTime(t);
       }
     }
-    //std::sort(buffer, buffer + clustersNative.nClusters[i][j]);
+    GPUCommonAlgorithm::sort(buffer, buffer + processors.ioPtrs.clustersNative->nClusters[slice][row]);
   }
 
 }
+
+GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer){
+  unsigned int time = 0;
+  unsigned short pad = 0;
+  for (unsigned int k = start; k < end; k++) {
+    if (cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences) {
+      unsigned int timeTmp = cmprClusters.timeDiffU[k];
+      if (timeTmp & 800000) {
+        timeTmp |= 0xFF000000;
+      }
+      time += timeTmp;
+      pad += cmprClusters.padDiffU[k];
+    } else {
+      time = cmprClusters.timeDiffU[k];
+      pad = cmprClusters.padDiffU[k];
+    }
+   *(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]);
+  }
+}
+
 /*
 template <>
 GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::prepareAccess>(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index 3fd235390c6a7..e70f7486bebbf 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -48,6 +48,7 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
   GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);
   GPUd() static void decompressTrack(o2::tpc::CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor);
   GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored);
+  GPUdi() static void decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, o2::tpc::ClusterNative* clusterNativeBuffer);
 
   GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){
     return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 4cf58e390b332..f924602baaa5d 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -295,19 +295,26 @@ int GPUChainTracking::RunTPCDecompression()
          GPUInfo("My version: all attached clusters have been decoded");
   }
 
+  Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+  DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
   mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
   mClusterNativeAccess->setOffsetPtrs();
   mIOPtrs.clustersNative = mClusterNativeAccess.get();
   *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
   processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
   WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
+  WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
   TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0);
   mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
   mClusterNativeAccess->setOffsetPtrs();
   *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
   processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess;
 
-  runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({1,1,0}, krnlRunRangeNone, krnlEventNone);
+  runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone);
+
+  ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative];
+  //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
+  GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
 
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
@@ -337,6 +344,10 @@ int GPUChainTracking::RunTPCDecompression()
     TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0);
     SynchronizeStream(0);
   }
+
+  LOGP(info,"==== My version/cpu version: {}/{}", tmpBuffer[0].qTot,mInputsHost->mPclusterNativeOutput[0].qTot);
+
+  delete[] tmpBuffer;
   // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR"));
 //#endif
   return 0;

From 19cbc626dafaeb9df5307bec7aff6593ad2c4a81 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Thu, 25 Jan 2024 10:07:21 +0100
Subject: [PATCH 22/25] hard-coded tests -- need to remove after validaiton

---
 .../include/DataFormatsTPC/ClusterNative.h    | 11 +++
 GPU/CMakeLists.txt                            |  4 +-
 GPU/GPUTracking/CMakeLists.txt                |  2 +-
 .../DataCompression/GPUTPCDecompression.h     |  9 ---
 .../GPUTPCDecompressionKernels.cxx            |  8 ++-
 .../Global/GPUChainTrackingCompression.cxx    | 67 +++++++++++++++++--
 .../Standalone/Benchmark/standalone.cxx       |  2 +-
 7 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
index 49d61007092a8..c5e103362f0f4 100644
--- a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
+++ b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h
@@ -156,6 +156,17 @@ struct ClusterNative {
       return (this->getFlags() < rhs.getFlags());
     }
   }
+
+  GPUd() bool operator==(const ClusterNative& rhs) const
+  {
+    return this->getTimePacked() == rhs.getTimePacked() &&
+           this->padPacked == rhs.padPacked &&
+           this->sigmaTimePacked == rhs.sigmaTimePacked &&
+           this->sigmaPadPacked == rhs.sigmaPadPacked &&
+           this->qMax == rhs.qMax &&
+           this->qTot == rhs.qTot &&
+           this->getFlags() == rhs.getFlags();
+  }
 };
 
 // This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to
diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt
index 6e08e9582761a..308c9f161c90f 100644
--- a/GPU/CMakeLists.txt
+++ b/GPU/CMakeLists.txt
@@ -18,8 +18,10 @@
 # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers
 # for installation only
 
+set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
+
 if(NOT DEFINED GPUCA_NO_FAST_MATH)
-  set(GPUCA_NO_FAST_MATH 0)
+  set(GPUCA_NO_FAST_MATH 1)
 endif()
 if(${GPUCA_NO_FAST_MATH})
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math -ffp-contract=off")
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 18a187bf5f64d..200ac832a433e 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -11,7 +11,7 @@
 
 set(MODULE GPUTracking)
 
-set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
+# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
 
 include(cmake/helpers.cmake)
 
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
index e4b2cca65c007..d2cb749dd79c3 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -66,15 +66,6 @@ class GPUTPCDecompression : public GPUProcessor
   o2::tpc::ClusterNative* mTmpNativeClusters;
   o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
   o2::tpc::ClusterNative* mNativeClustersBuffer;
-/*  class ConcurrentClusterNativeBuffer{
-    size_t mIndex;
-    size_t mCapacity = 10;
-    o2::tpc::ClusterNative* mCmprClsBuffer;
-   public:
-    ConcurrentClusterNativeBuffer();
-    void push_back(ClusterNative cluster);
-  };*/
-//  ConcurrentClusterNativeBuffer* tmpBuffer;
 
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA);
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index 033a38391dec7..f222350d574c7 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -131,12 +131,14 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
+  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+
   unsigned int* offsets = decompressor.mUnattachedClustersOffsets;
   for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){
     unsigned int slice = i / GPUCA_ROW_COUNT;
     unsigned int row = i % GPUCA_ROW_COUNT;
     unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer);
-    ClusterNative* buffer = clusterBuffer + processors.ioPtrs.clustersNative->clusterOffset[slice][row];
+    ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row];
     if (decompressor.mNativeClustersIndex[i] != 0) {
       memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0]));
     }
@@ -144,7 +146,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
     unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 0 : decompressor.mInputGPU.nSliceRowClusters[i]);
     decompressHits(cmprClusters, offsets[i], end, clout);
     if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) {
-      for (unsigned int k = 0; k < processors.ioPtrs.clustersNative->nClusters[slice][row]; k++) {
+      for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) {
         auto& cl = buffer[k];
         float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins;
         if (t < 0) {
@@ -156,7 +158,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
         cl.setTime(t);
       }
     }
-    GPUCommonAlgorithm::sort(buffer, buffer + processors.ioPtrs.clustersNative->nClusters[slice][row]);
+    GPUCommonAlgorithm::sort(buffer, buffer + outputAccess->nClusters[slice][row]);
   }
 
 }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index f924602baaa5d..3bfaf33f0dbba 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -29,7 +29,7 @@ using namespace o2::tpc;
 int GPUChainTracking::RunTPCCompression()
 {
   LOGP(info, "====== Compression");
-//#ifdef GPUCA_HAVE_O2HEADERS
+#ifdef GPUCA_HAVE_O2HEADERS
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR"));
   RecoStep myStep = RecoStep::TPCCompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCCompression;
@@ -200,7 +200,7 @@ int GPUChainTracking::RunTPCCompression()
     ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
-//#endif
+#endif
   return 0;
 }
 
@@ -208,8 +208,14 @@ int GPUChainTracking::RunTPCDecompression()
 {
   LOGP(info, "====== Decompression");
 
-//#ifdef GPUCA_HAVE_O2HEADERS
+#ifdef GPUCA_HAVE_O2HEADERS
  // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
+  ClusterNativeAccess* original = new ClusterNativeAccess;
+  original->clustersLinear = new ClusterNative[mIOPtrs.clustersNative->nClustersTotal];
+  memcpy((void*)original->clustersLinear,mIOPtrs.clustersNative->clustersLinear,mIOPtrs.clustersNative->nClustersTotal*sizeof(mIOPtrs.clustersNative->clustersLinear[0]));
+  memcpy((void*)original->nClusters,mIOPtrs.clustersNative->nClusters,NSLICES*GPUCA_ROW_COUNT*sizeof(mIOPtrs.clustersNative->nClusters[0][0]));
+  original->setOffsetPtrs();
+
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
@@ -228,9 +234,14 @@ int GPUChainTracking::RunTPCDecompression()
   inputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
   SetupGPUProcessor(&Decompressor, true);
 
+  for(int k = 0; k < cmprClsHost.nSliceRowClusters[0]; k++){
+    if(cmprClsHost.sigmaTimeU[k] == 14 && cmprClsHost.sigmaPadU[k] == 14 && cmprClsHost.qMaxU[k] == 88 && cmprClsHost.qTotU[k] == 160){
+    LOGP(info,"==== Cluster unattached [{}] sigmaTimeU: {} simgaPadU: {} qMaxU: {} qTotU: {}",k,cmprClsHost.sigmaTimeU[k],cmprClsHost.sigmaPadU[k],cmprClsHost.qMaxU[k],cmprClsHost.qTotU[k]);
+    }
+  }
+
   size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU);
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
-  //TransferMemoryResourcesToGPU(myStep, &Decompressor, 0);
 
   int outputStream = 0;
   bool toGPU = true;
@@ -313,8 +324,11 @@ int GPUChainTracking::RunTPCDecompression()
   runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone);
 
   ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative];
+  ClusterNativeAccess gpuBuffer = *mInputsHost->mPclusterNativeAccess;
+  gpuBuffer.clustersLinear = tmpBuffer;
   //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
   GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false);
+  gpuBuffer.setOffsetPtrs();
 
   TPCClusterDecompressor decomp;
   auto allocator = [this](size_t size) {
@@ -345,10 +359,49 @@ int GPUChainTracking::RunTPCDecompression()
     SynchronizeStream(0);
   }
 
-  LOGP(info,"==== My version/cpu version: {}/{}", tmpBuffer[0].qTot,mInputsHost->mPclusterNativeOutput[0].qTot);
-
+  const ClusterNativeAccess* decoded = &gpuBuffer; //mIOPtrs.clustersNative;
+ // original = (ClusterNativeAccess*)mIOPtrs.clustersNative;
+  unsigned int decodingErrors = 0;
+  std::vector<o2::tpc::ClusterNative> tmpClusters;
+  if (param().rec.tpc.rejectionStrategy == GPUSettings::RejectionNone) { // verification does not make sense if we reject clusters during compression
+    for (unsigned int i = 0; i < NSLICES; i++) {
+      for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) {
+        if (original->nClusters[i][j] != decoded->nClusters[i][j]) {
+          GPUError("Number of clusters mismatch slice %u row %u: expected %d v.s. decoded %d", i, j, original->nClusters[i][j], decoded->nClusters[i][j]);
+          decodingErrors++;
+          continue;
+        }
+        tmpClusters.resize(original->nClusters[i][j]);
+        for (unsigned int k = 0; k < original->nClusters[i][j]; k++) {
+          tmpClusters[k] = original->clusters[i][j][k];
+          if (param().rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
+            GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param());
+            GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param());
+            GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param());
+            GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param());
+          }
+        }
+        std::sort(tmpClusters.begin(), tmpClusters.end());
+        for (unsigned int k = 0; k < original->nClusters[i][j]; k++) {
+          const o2::tpc::ClusterNative& c1 = tmpClusters[k];
+          const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k];
+          if (!(c1 == c2)) {
+            if (decodingErrors++ < 100) {
+              GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot);
+              GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot);
+            }
+          }
+        }
+      }
+    }
+    if (decodingErrors) {
+      GPUWarning("Errors during cluster decoding %u\n", decodingErrors);
+    } else {
+      GPUInfo("Cluster decoding verification: PASSED");
+    }
+  }
   delete[] tmpBuffer;
   // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR"));
-//#endif
+#endif
   return 0;
 }
\ No newline at end of file
diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
index 04bf75f39bab3..5805bc386ab2e 100644
--- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
+++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
@@ -667,7 +667,7 @@ int RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingUse,
         chainTrackingAsync->mIOPtrs.rawClusters[i] = nullptr;
         chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0;
       }
-      chainTrackingAsync->mIOPtrs.clustersNative = nullptr;
+      chainTrackingAsync->mIOPtrs.clustersNative = chainTrackingUse->mIOPtrs.clustersNative; //todo: revert back to nullptr
       recAsync->SetResetTimers(iRun < configStandalone.runsInit);
       tmpRetVal = recAsync->RunChains();
       if (tmpRetVal == 0 || tmpRetVal == 2) {

From 2f773de60daedca34d65031f6a9edfed4249271b Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Mon, 29 Jan 2024 15:34:05 +0100
Subject: [PATCH 23/25] code cleaning and adapting to new kernel registration

---
 .../DataCompression/GPUTPCDecompression.cxx     | 17 +----------------
 .../GPUTPCDecompressionKernels.cxx              |  1 +
 .../GPUTPCDecompressionKernels.h                |  2 +-
 .../Global/GPUChainTrackingCompression.cxx      |  3 ---
 GPU/GPUTracking/kernels.cmake                   |  2 ++
 5 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
index 922bfc120d136..c70422a206bf2 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -67,7 +67,6 @@ void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsign
 
 void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){
   computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer);
-  //computePointerWithAlignment(mem,mClusterNativeAccess);
   return mem;
 }
 
@@ -92,18 +91,4 @@ void GPUTPCDecompression::RegisterMemoryAllocation() {
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){
   //mMaxNativeClustersPerBuffer = 81760;
   mMaxNativeClustersPerBuffer = 12000;
-}
-/*
-GPUTPCDecompression::ConcurrentClusterNativeBuffer::ConcurrentClusterNativeBuffer():
-mCmprClsBuffer{new o2::tpc::ClusterNative[mCapacity]},mIndex{0}
-{}
-
-void GPUTPCDecompression::ConcurrentClusterNativeBuffer::push_back(tpc::ClusterNative cluster)
-{
-  if(mIndex == mCapacity){
-    //reallocate?
-    return;
-  }
-  unsigned int current = CAMath::AtomicAdd(mIndex, 1u);
-  mTmpNativeClusters[current] = cluster;
-}*/
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
index f222350d574c7..1a726d00ebcd2 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -17,6 +17,7 @@
 #include "GPUConstantMem.h"
 #include "GPUTPCCompressionTrackModel.h"
 #include "GPUCommonAlgorithm.h"
+#include <string.h>
 
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
index e70f7486bebbf..82041d05fcb7c 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h
@@ -41,7 +41,7 @@ class GPUTPCDecompressionKernels : public GPUKernelTemplate
   enum K : int {
     step0attached = 0,
     step1unattached = 1,
-    prepareAccess = 2
+    //prepareAccess = 2
   };
 
   template <int iKernel = defaultKernel>
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 3bfaf33f0dbba..9a384f4ce9d8c 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -338,9 +338,6 @@ int GPUChainTracking::RunTPCDecompression()
   };
   auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
   gatherTimer.Start();
-
-  LOGP(info,"==== mIOPtrs.compressed.nAttCl = {}, nUnAttCl = {}, nTracks = {}",cmprClsHost.nAttachedClusters,cmprClsHost.nUnattachedClusters,cmprClsHost.nTracks);
-
   if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param())) {
     GPUError("Error decompressing clusters");
     return 1;
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index b06749010d2d9..d7302f17271d2 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -73,6 +73,8 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32"   LB      simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64"   LB      simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128"  LB      simple)
 o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock"   LB      simple)
+o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached"    LB      simple)
+o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached"  LB      simple)
 o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline"                     LB      single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap"        LB      single)
 o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits"      LB      single)

From d1d648aa6e0cefab9968de9efd965fae0ae6c137 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Tue, 30 Jan 2024 10:39:32 +0100
Subject: [PATCH 24/25] Removed debugging messages and added no-fast-math
 option

---
 GPU/CMakeLists.txt                                  |  2 +-
 GPU/GPUTracking/CMakeLists.txt                      |  1 +
 .../Global/GPUChainTrackingCompression.cxx          | 13 ++-----------
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt
index 308c9f161c90f..a3232b454d5ba 100644
--- a/GPU/CMakeLists.txt
+++ b/GPU/CMakeLists.txt
@@ -18,7 +18,7 @@
 # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers
 # for installation only
 
-set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
+#set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this...
 
 if(NOT DEFINED GPUCA_NO_FAST_MATH)
   set(GPUCA_NO_FAST_MATH 1)
diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index 200ac832a433e..9973720250ad8 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -471,6 +471,7 @@ endif()
 set_source_files_properties(DataCompression/GPUTPCCompressionTrackModel.cxx
                             DataCompression/GPUTPCCompressionKernels.cxx
                             DataCompression/TPCClusterDecompressor.cxx
+                            DataCompression/GPUTPCDecompressionKernels.cxx
                             TARGET_DIRECTORY ${targetName}
                             PROPERTIES COMPILE_FLAGS "-fno-fast-math -ffp-contract=off")
 
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 9a384f4ce9d8c..e0a4ba3ccd8a2 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -28,7 +28,6 @@ using namespace o2::tpc;
 
 int GPUChainTracking::RunTPCCompression()
 {
-  LOGP(info, "====== Compression");
 #ifdef GPUCA_HAVE_O2HEADERS
   mRec->PushNonPersistentMemory(qStr2Tag("TPCCOMPR"));
   RecoStep myStep = RecoStep::TPCCompression;
@@ -206,8 +205,6 @@ int GPUChainTracking::RunTPCCompression()
 
 int GPUChainTracking::RunTPCDecompression()
 {
-  LOGP(info, "====== Decompression");
-
 #ifdef GPUCA_HAVE_O2HEADERS
  // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   ClusterNativeAccess* original = new ClusterNativeAccess;
@@ -234,12 +231,6 @@ int GPUChainTracking::RunTPCDecompression()
   inputGPU.maxTimeBin = param().par.continuousMaxTimeBin;
   SetupGPUProcessor(&Decompressor, true);
 
-  for(int k = 0; k < cmprClsHost.nSliceRowClusters[0]; k++){
-    if(cmprClsHost.sigmaTimeU[k] == 14 && cmprClsHost.sigmaPadU[k] == 14 && cmprClsHost.qMaxU[k] == 88 && cmprClsHost.qTotU[k] == 160){
-    LOGP(info,"==== Cluster unattached [{}] sigmaTimeU: {} simgaPadU: {} qMaxU: {} qTotU: {}",k,cmprClsHost.sigmaTimeU[k],cmprClsHost.sigmaPadU[k],cmprClsHost.qMaxU[k],cmprClsHost.qTotU[k]);
-    }
-  }
-
   size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU);
   WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0);
 
@@ -384,8 +375,8 @@ int GPUChainTracking::RunTPCDecompression()
           const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k];
           if (!(c1 == c2)) {
             if (decodingErrors++ < 100) {
-              GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot);
-              GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot);
+              //GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, (int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot);
+              //GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot);
             }
           }
         }

From f0249effd4cab2041abe3e49984f007d016ff761 Mon Sep 17 00:00:00 2001
From: cima22 <gabriele.cimador@cern.ch>
Date: Tue, 30 Jan 2024 10:50:43 +0100
Subject: [PATCH 25/25] removed unnecessary files

---
 .../.cmake/api/v1/query/cache-v2              |   0
 .../.cmake/api/v1/query/cmakeFiles-v1         |   0
 .../.cmake/api/v1/query/codemodel-v2          |   0
 .../.cmake/api/v1/query/toolchains-v1         |   0
 cmake-build-debug/DartConfiguration.tcl       | 106 ------------------
 5 files changed, 106 deletions(-)
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/cache-v2
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/codemodel-v2
 delete mode 100644 cmake-build-debug/.cmake/api/v1/query/toolchains-v1
 delete mode 100644 cmake-build-debug/DartConfiguration.tcl

diff --git a/cmake-build-debug/.cmake/api/v1/query/cache-v2 b/cmake-build-debug/.cmake/api/v1/query/cache-v2
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1 b/cmake-build-debug/.cmake/api/v1/query/cmakeFiles-v1
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/.cmake/api/v1/query/codemodel-v2 b/cmake-build-debug/.cmake/api/v1/query/codemodel-v2
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/.cmake/api/v1/query/toolchains-v1 b/cmake-build-debug/.cmake/api/v1/query/toolchains-v1
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/cmake-build-debug/DartConfiguration.tcl b/cmake-build-debug/DartConfiguration.tcl
deleted file mode 100644
index 09b3136cfee5f..0000000000000
--- a/cmake-build-debug/DartConfiguration.tcl
+++ /dev/null
@@ -1,106 +0,0 @@
-# This file is configured by CMake automatically as DartConfiguration.tcl
-# If you choose not to use CMake, this file may be hand configured, by
-# filling in the required variables.
-
-
-# Configuration directories and files
-SourceDirectory: /experiments/alice/cimag/o2/O2
-BuildDirectory: /experiments/alice/cimag/o2/O2/cmake-build-debug
-
-# Where to place the cost data store
-CostDataFile: 
-
-# Site is something like machine.domain, i.e. pragmatic.crd
-Site: gr3srv.ts.infn.it
-
-# Build name is osname-revision-compiler, i.e. Linux-2.4.2-2smp-c++
-BuildName: Linux-c++
-
-# Subprojects
-LabelsForSubprojects: 
-
-# Submission information
-SubmitURL: http://
-SubmitInactivityTimeout: 
-
-# Dashboard start time
-NightlyStartTime: 00:00:00 EDT
-
-# Commands for the build/test/submit cycle
-ConfigureCommand: "/home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake" "/experiments/alice/cimag/o2/O2"
-MakeCommand: /home/cimag/.cache/JetBrains/RemoteDev/dist/b21c2b5ff1f19_CLion-2023.2.2/bin/cmake/linux/x64/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}"
-DefaultCTestConfigurationType: Release
-
-# version control
-UpdateVersionOnly: 
-
-# CVS options
-# Default is "-d -P -A"
-CVSCommand: 
-CVSUpdateOptions: 
-
-# Subversion options
-SVNCommand: 
-SVNOptions: 
-SVNUpdateOptions: 
-
-# Git options
-GITCommand: /usr/bin/git
-GITInitSubmodules: 
-GITUpdateOptions: 
-GITUpdateCustom: 
-
-# Perforce options
-P4Command: 
-P4Client: 
-P4Options: 
-P4UpdateOptions: 
-P4UpdateCustom: 
-
-# Generic update command
-UpdateCommand: /usr/bin/git
-UpdateOptions: 
-UpdateType: git
-
-# Compiler info
-Compiler: /usr/bin/c++
-CompilerVersion: 4.8.5
-
-# Dynamic analysis (MemCheck)
-PurifyCommand: 
-ValgrindCommand: 
-ValgrindCommandOptions: 
-DrMemoryCommand: 
-DrMemoryCommandOptions: 
-CudaSanitizerCommand: 
-CudaSanitizerCommandOptions: 
-MemoryCheckType: 
-MemoryCheckSanitizerOptions: 
-MemoryCheckCommand: /usr/bin/valgrind
-MemoryCheckCommandOptions: 
-MemoryCheckSuppressionFile: 
-
-# Coverage
-CoverageCommand: /usr/bin/gcov
-CoverageExtraFlags: -l
-
-# Testing options
-# TimeOut is the amount of time in seconds to wait for processes
-# to complete during testing.  After TimeOut seconds, the
-# process will be summarily terminated.
-# Currently set to 25 minutes
-TimeOut: 1500
-
-# During parallel testing CTest will not start a new test if doing
-# so would cause the system load to exceed this value.
-TestLoad: 
-
-UseLaunchers: 
-CurlOptions: 
-# warning, if you add new options here that have to do with submit,
-# you have to update cmCTestSubmitCommand.cxx
-
-# For CTest submissions that timeout, these options
-# specify behavior for retrying the submission
-CTestSubmitRetryDelay: 5
-CTestSubmitRetryCount: 3