diff --git a/CMakeLists.txt b/CMakeLists.txt index bf1d83fbb6644..feabbddfcf0f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,9 @@ project(O2 LANGUAGES C CXX VERSION 1.2.0) include(CTest) +set(CUDA_COMPUTETARGET 75) +set(GPUCA_NO_FAST_MATH 1) + # Project wide setup # Would better fit inside GPU/CMakeLists.txt, but include GPU/Common directly diff --git a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h index 49d61007092a8..c5e103362f0f4 100644 --- a/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h +++ b/DataFormats/Detectors/TPC/include/DataFormatsTPC/ClusterNative.h @@ -156,6 +156,17 @@ struct ClusterNative { return (this->getFlags() < rhs.getFlags()); } } + + GPUd() bool operator==(const ClusterNative& rhs) const + { + return this->getTimePacked() == rhs.getTimePacked() && + this->padPacked == rhs.padPacked && + this->sigmaTimePacked == rhs.sigmaTimePacked && + this->sigmaPadPacked == rhs.sigmaPadPacked && + this->qMax == rhs.qMax && + this->qTot == rhs.qTot && + this->getFlags() == rhs.getFlags(); + } }; // This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to diff --git a/GPU/CMakeLists.txt b/GPU/CMakeLists.txt index 6e08e9582761a..a3232b454d5ba 100644 --- a/GPU/CMakeLists.txt +++ b/GPU/CMakeLists.txt @@ -18,8 +18,10 @@ # HDRS_CINT_O2: Headers for ROOT dictionary (only for O2) HDRS_INSTALL: Headers # for installation only +#set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... 
+ if(NOT DEFINED GPUCA_NO_FAST_MATH) - set(GPUCA_NO_FAST_MATH 0) + set(GPUCA_NO_FAST_MATH 1) endif() if(${GPUCA_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-fast-math -ffp-contract=off") diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index b9b93100d5bfd..d18e7863e2001 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -46,6 +46,7 @@ class GPUTRDTracker_t #if defined(GPUCA_NOCOMPAT_ALLCINT) && (!defined(GPUCA_GPULIBRARY) || !defined(GPUCA_ALIROOT_LIB)) && defined(GPUCA_HAVE_O2HEADERS) #include "GPUTPCConvert.h" #include "GPUTPCCompression.h" +#include "GPUTPCDecompression.h" #include "GPUITSFitter.h" #include "GPUTPCClusterFinder.h" #include "GPUTrackingRefit.h" @@ -69,6 +70,7 @@ struct GPUConstantMem { tpcTrackers[GPUCA_NSLICES]; GPUTPCConvert tpcConverter; GPUTPCCompression tpcCompressor; + GPUTPCDecompression tpcDecompressor; GPUTPCGMMerger tpcMerger; GPUTRDTrackerGPU trdTrackerGPU; #ifdef GPUCA_HAVE_O2HEADERS diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 8b56e46888e9b..518f0af43aae4 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -511,7 +511,7 @@ inline void GPUReconstruction::RegisterGPUProcessor(T* proc, bool deviceSlave) template inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate) { - static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derrived class"); + static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class"); if (allocate) { proc->SetMaxData(mHostConstantMem->ioPtrs); } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 5b176218984e5..97642734a60b7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -39,6 +39,7 @@ #include "GPUITSFitterKernels.h" #include 
"GPUTPCConvertKernel.h" #include "GPUTPCCompressionKernels.h" +#include "GPUTPCDecompressionKernels.h" #include "GPUTPCClusterFinderKernels.h" #include "GPUTrackingRefitKernel.h" #include "GPUTPCGMO2Output.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h index 0297a1a6cb9d1..81fde1dbfb996 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h +++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesDevice.h @@ -75,6 +75,9 @@ using namespace GPUCA_NAMESPACE::gpu; #include "GPUTPCCompressionKernels.cxx" #include "GPUTPCCompressionTrackModel.cxx" +//Files for TPC Decompression +#include "GPUTPCDecompressionKernels.cxx" + // Files for TPC Cluster Finder #include "ClusterAccumulator.cxx" #include "GPUTPCCFStreamCompaction.cxx" diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 0811bb6d1ad79..9973720250ad8 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -11,7 +11,7 @@ set(MODULE GPUTracking) -# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -O0") # to uncomment if needed, tired of typing this... +# set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}} -fno-omit-frame-pointer") # to uncomment if needed, tired of typing this... 
include(cmake/helpers.cmake) @@ -176,6 +176,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) DataCompression/GPUTPCCompression.cxx DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx + DataCompression/GPUTPCDecompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx DataCompression/GPUTPCClusterStatistics.cxx TPCClusterFinder/GPUTPCClusterFinder.cxx @@ -469,6 +471,7 @@ endif() set_source_files_properties(DataCompression/GPUTPCCompressionTrackModel.cxx DataCompression/GPUTPCCompressionKernels.cxx DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCDecompressionKernels.cxx TARGET_DIRECTORY ${targetName} PROPERTIES COMPILE_FLAGS "-fno-fast-math -ffp-contract=off") diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx new file mode 100644 index 0000000000000..c70422a206bf2 --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx @@ -0,0 +1,94 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCDecompression.cxx +/// \author Gabriele Cimador + +#include "GPUTPCDecompression.h" +#include "GPUTPCCompression.h" +#include "GPUReconstruction.h" +#include "GPUO2DataTypes.h" +#include "GPUMemorySizeScalers.h" +#include "GPULogging.h" + +using namespace GPUCA_NAMESPACE::gpu; + +void GPUTPCDecompression::InitializeProcessor() {} + +void* GPUTPCDecompression::SetPointersInputGPU(void* mem) +{ + SetPointersCompressedClusters(mem, mInputGPU, mInputGPU.nAttachedClusters, mInputGPU.nTracks, mInputGPU.nUnattachedClusters, true); + return mem; +} + +template +void GPUTPCDecompression::SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA) +{ + computePointerWithAlignment(mem, c.qTotU, nClU); // Do not reorder, qTotU ist used as first address in GPUChainTracking::RunTPCCompression + computePointerWithAlignment(mem, c.qMaxU, nClU); + computePointerWithAlignment(mem, c.flagsU, nClU); + computePointerWithAlignment(mem, c.padDiffU, nClU); + computePointerWithAlignment(mem, c.timeDiffU, nClU); + computePointerWithAlignment(mem, c.sigmaPadU, nClU); + computePointerWithAlignment(mem, c.sigmaTimeU, nClU); + computePointerWithAlignment(mem, c.nSliceRowClusters, GPUCA_ROW_COUNT * NSLICES); + + unsigned int nClAreduced = reducedClA ? 
nClA - nTr : nClA; + + if (!(mRec->GetParam().rec.tpc.compressionTypeMask & GPUSettings::CompressionTrackModel)) { + return; // Track model disabled, do not allocate memory + } + computePointerWithAlignment(mem, c.qTotA, nClA); + computePointerWithAlignment(mem, c.qMaxA, nClA); + computePointerWithAlignment(mem, c.flagsA, nClA); + computePointerWithAlignment(mem, c.rowDiffA, nClAreduced); + computePointerWithAlignment(mem, c.sliceLegDiffA, nClAreduced); + computePointerWithAlignment(mem, c.padResA, nClAreduced); + computePointerWithAlignment(mem, c.timeResA, nClAreduced); + computePointerWithAlignment(mem, c.sigmaPadA, nClA); + computePointerWithAlignment(mem, c.sigmaTimeA, nClA); + + computePointerWithAlignment(mem, c.qPtA, nTr); + computePointerWithAlignment(mem, c.rowA, nTr); + computePointerWithAlignment(mem, c.sliceA, nTr); + computePointerWithAlignment(mem, c.timeA, nTr); + computePointerWithAlignment(mem, c.padA, nTr); + + computePointerWithAlignment(mem, c.nTrackClusters, nTr); +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersGPU(void* mem){ + computePointerWithAlignment(mem,mTmpNativeClusters,NSLICES * GPUCA_ROW_COUNT * mMaxNativeClustersPerBuffer); + return mem; +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersOutput(void* mem){ + computePointerWithAlignment(mem,mNativeClustersIndex,NSLICES * GPUCA_ROW_COUNT); + return mem; +} + +void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem){ + computePointerWithAlignment(mem,mUnattachedClustersOffsets,NSLICES * GPUCA_ROW_COUNT); + return mem; +} + +void GPUTPCDecompression::RegisterMemoryAllocation() { + AllocateAndInitializeLate(); + mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_CUSTOM, "TPCDecompressionInput"); + 
mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersGPU,GPUMemoryResource::MEMORY_GPU,"TPCDecompressionTmpBuffersGPU"); + mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersOutput,GPUMemoryResource::MEMORY_OUTPUT,"TPCDecompressionTmpBuffersOutput"); + mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this,&GPUTPCDecompression::SetPointersTmpNativeBuffersInput,GPUMemoryResource::MEMORY_INPUT,"TPCDecompressionTmpBuffersInput"); +} + +void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io){ + //mMaxNativeClustersPerBuffer = 81760; + mMaxNativeClustersPerBuffer = 12000; +} \ No newline at end of file diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h new file mode 100644 index 0000000000000..d2cb749dd79c3 --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompression.h @@ -0,0 +1,79 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCDecompression.h +/// \author Gabriele Cimador + +#ifndef GPUTPCDECOMPRESSION_H +#define GPUTPCDECOMPRESSION_H + +#include "GPUDef.h" +#include "GPUProcessor.h" +#include "GPUCommonMath.h" +#include "GPUParam.h" +#include "GPUO2DataTypes.h" + +#ifdef GPUCA_HAVE_O2HEADERS +#include "DataFormatsTPC/CompressedClusters.h" +#else +namespace o2::tpc +{ +struct CompressedClustersPtrs { +}; +struct CompressedClusters { +}; +struct CompressedClustersFlat { +}; +} // namespace o2::tpc +#endif + +namespace GPUCA_NAMESPACE::gpu +{ + +class GPUTPCDecompression : public GPUProcessor +{ + friend class GPUTPCDecompressionKernels; + friend class GPUChainTracking; + + public: +#ifndef GPUCA_GPUCODE + void InitializeProcessor(); + void RegisterMemoryAllocation(); + void SetMaxData(const GPUTrackingInOutPointers& io); + + void* SetPointersInputGPU(void* mem); + void* SetPointersTmpNativeBuffersGPU(void* mem); + void* SetPointersTmpNativeBuffersOutput(void* mem); + void* SetPointersTmpNativeBuffersInput(void* mem); + +#endif + + protected: + constexpr static unsigned int NSLICES = GPUCA_NSLICES; + o2::tpc::CompressedClusters mInputGPU; + + unsigned int mMaxNativeClustersPerBuffer; + unsigned int* mNativeClustersIndex; + unsigned int* mUnattachedClustersOffsets; + o2::tpc::ClusterNative* mTmpNativeClusters; + o2::tpc::ClusterNativeAccess* mClusterNativeAccess; + o2::tpc::ClusterNative* mNativeClustersBuffer; + + template + void SetPointersCompressedClusters(void*& mem, T& c, unsigned int nClA, unsigned int nTr, unsigned int nClU, bool reducedClA); + + short mMemoryResInputGPU = -1; + short mResourceTmpIndexes = -1; + short mResourceTmpClustersOffsets = -1; + +}; +} // namespace GPUCA_NAMESPACE::gpu +#endif // GPUTPCDECOMPRESSION_H diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx new file mode 100644 index 0000000000000..1a726d00ebcd2 --- /dev/null +++ 
b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx @@ -0,0 +1,201 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCDecompressionKernels.cxx +/// \author Gabriele Cimador + +#include "GPUTPCDecompressionKernels.h" +#include "GPULogging.h" +#include "GPUConstantMem.h" +#include "GPUTPCCompressionTrackModel.h" +#include "GPUCommonAlgorithm.h" +#include + +using namespace GPUCA_NAMESPACE::gpu; +using namespace o2::tpc; + +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + const GPUParam& GPUrestrict() param = processors.param; + + unsigned int offset = 0, lasti = 0; + const unsigned int maxTime = (param.par.continuousMaxTimeBin + 1) * ClusterNative::scaleTimePacked - 1; + + for (unsigned int i = get_global_id(0); i < cmprClusters.nTracks; i += get_global_size(0)) { + while (lasti < i) { + offset += cmprClusters.nTrackClusters[lasti++]; + } + lasti++; + decompressTrack(cmprClusters, param, maxTime, i, offset, decompressor); + } +} + +GPUdii() void GPUTPCDecompressionKernels::decompressTrack(CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& 
decompressor){ + float zOffset = 0; + unsigned int slice = cmprClusters.sliceA[trackIndex]; + unsigned int row = cmprClusters.rowA[trackIndex]; + GPUTPCCompressionTrackModel track; + unsigned int clusterIndex; + for(clusterIndex = 0; clusterIndex < cmprClusters.nTrackClusters[trackIndex]; clusterIndex++){ + unsigned int pad = 0, time = 0; + if(clusterIndex != 0){ + unsigned char tmpSlice = cmprClusters.sliceLegDiffA[clusterOffset - trackIndex -1]; + bool changeLeg = (tmpSlice >= GPUCA_NSLICES); + if(changeLeg){ + tmpSlice -= GPUCA_NSLICES; + } + if(cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences){ + slice += tmpSlice; + if(slice >= GPUCA_NSLICES){ + slice -= GPUCA_NSLICES; + } + row += cmprClusters.rowDiffA[clusterOffset -trackIndex -1]; + if(row >= GPUCA_ROW_COUNT){ + row -= GPUCA_ROW_COUNT; + } + } else { + slice = tmpSlice; + row = cmprClusters.rowDiffA[clusterOffset -trackIndex -1]; + } + if (changeLeg && track.Mirror()) { + break; + } + if (track.Propagate(param.tpcGeometry.Row2X(row),param.SliceParam[slice].Alpha)){ + break; + } + unsigned int timeTmp = cmprClusters.timeResA[clusterOffset -trackIndex -1]; + if (timeTmp & 0x800000) { + timeTmp |= 0xFF000000; + } + time = timeTmp + ClusterNative::packTime(CAMath::Max(0.f,param.tpcGeometry.LinearZ2Time(slice,track.Z() + zOffset))); + float tmpPad = CAMath::Max(0.f, CAMath::Min((float)param.tpcGeometry.NPads(GPUCA_ROW_COUNT - 1), param.tpcGeometry.LinearY2Pad(slice, row, track.Y()))); + pad = cmprClusters.padResA[clusterOffset -trackIndex - 1] + ClusterNative::packPad(tmpPad); + time = time & 0xFFFFFF; + pad = (unsigned short)pad; + if (pad >= param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked) { + if (pad >= 0xFFFF - 11968) { // Constant 11968 = (2^15 - MAX_PADS(138) * scalePadPacked(64)) / 2 + pad = 0; + } else { + pad = param.tpcGeometry.NPads(row) * ClusterNative::scalePadPacked - 1; + } + } + if (param.par.continuousMaxTimeBin > 0 && time >= maxTime) { + if (time >= 0xFFFFFF - 
544768) { // Constant 544768 = (2^23 - LHCMAXBUNCHES(3564) * MAXORBITS(256) * scaleTimePacked(64) / BCPERTIMEBIN(8)) / 2) + time = 0; + } else { + time = maxTime; + } + } + } else { + time = cmprClusters.timeA[trackIndex]; + pad = cmprClusters.padA[trackIndex]; + } + bool stored; + const auto cluster = decompressTrackStore(cmprClusters, clusterOffset, slice, row, pad, time, decompressor, stored); + float y = param.tpcGeometry.LinearPad2Y(slice, row, cluster.getPad()); + float z = param.tpcGeometry.LinearTime2Z(slice, cluster.getTime()); + if(clusterIndex == 0){ + zOffset = z; + track.Init(param.tpcGeometry.Row2X(row), y, z - zOffset, param.SliceParam[slice].Alpha, cmprClusters.qPtA[trackIndex],param); + } + if(clusterIndex + 1 < cmprClusters.nTrackClusters[trackIndex] && track.Filter(y,z-zOffset,row)){ + break; + } + clusterOffset++; + } + clusterOffset += cmprClusters.nTrackClusters[trackIndex] - clusterIndex; +} + +GPUdii() ClusterNative GPUTPCDecompressionKernels::decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored){ + unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); + unsigned int currentClusterIndex = CAMath::AtomicAdd(decompressor.mNativeClustersIndex + (slice * GPUCA_ROW_COUNT + row),1u); + const ClusterNative c(time, cmprClusters.flagsA[clusterOffset], pad, cmprClusters.sigmaTimeA[clusterOffset], cmprClusters.sigmaPadA[clusterOffset], cmprClusters.qMaxA[clusterOffset], cmprClusters.qTotA[clusterOffset]); + stored = currentClusterIndex < decompressor.mMaxNativeClustersPerBuffer; + if(stored){ + decompressor.mTmpNativeClusters[tmpBufferIndex + currentClusterIndex] = c; + } + return c; +} + +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() 
GPUSharedMemory& smem, processorType& processors){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer; + const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative; + + unsigned int* offsets = decompressor.mUnattachedClustersOffsets; + for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ + unsigned int slice = i / GPUCA_ROW_COUNT; + unsigned int row = i % GPUCA_ROW_COUNT; + unsigned int tmpBufferIndex = computeLinearTmpBufferIndex(slice,row,decompressor.mMaxNativeClustersPerBuffer); + ClusterNative* buffer = clusterBuffer + outputAccess->clusterOffset[slice][row]; + if (decompressor.mNativeClustersIndex[i] != 0) { + memcpy((void*)buffer, (const void*)(decompressor.mTmpNativeClusters + tmpBufferIndex), decompressor.mNativeClustersIndex[i] * sizeof(clusterBuffer[0])); + } + ClusterNative* clout = buffer + decompressor.mNativeClustersIndex[i]; + unsigned int end = offsets[i] + ((i >= decompressor.mInputGPU.nSliceRows) ? 
0 : decompressor.mInputGPU.nSliceRowClusters[i]); + decompressHits(cmprClusters, offsets[i], end, clout); + if (processors.param.rec.tpc.clustersShiftTimebins != 0.f) { + for (unsigned int k = 0; k < outputAccess->nClusters[slice][row]; k++) { + auto& cl = buffer[k]; + float t = cl.getTime() + processors.param.rec.tpc.clustersShiftTimebins; + if (t < 0) { + t = 0; + } + if (processors.param.par.continuousMaxTimeBin > 0 && t > processors.param.par.continuousMaxTimeBin) { + t = processors.param.par.continuousMaxTimeBin; + } + cl.setTime(t); + } + } + GPUCommonAlgorithm::sort(buffer, buffer + outputAccess->nClusters[slice][row]); + } + +} + +GPUdii() void GPUTPCDecompressionKernels::decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, ClusterNative* clusterNativeBuffer){ + unsigned int time = 0; + unsigned short pad = 0; + for (unsigned int k = start; k < end; k++) { + if (cmprClusters.nComppressionModes & GPUSettings::CompressionDifferences) { + unsigned int timeTmp = cmprClusters.timeDiffU[k]; + if (timeTmp & 0x800000) { + timeTmp |= 0xFF000000; + } + time += timeTmp; + pad += cmprClusters.padDiffU[k]; + } else { + time = cmprClusters.timeDiffU[k]; + pad = cmprClusters.padDiffU[k]; + } + *(clusterNativeBuffer++) = ClusterNative(time, cmprClusters.flagsU[k], pad, cmprClusters.sigmaTimeU[k], cmprClusters.sigmaPadU[k], cmprClusters.qMaxU[k], cmprClusters.qTotU[k]); + } +} + +/* +template <> +GPUdii() void GPUTPCDecompressionKernels::Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors){ + for (unsigned int i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)){ + GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor; + CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU; + o2::tpc::ClusterNativeAccess& clustersNative = *decompressor.mClusterNativeAccess; + unsigned int 
slice = i / GPUCA_ROW_COUNT; + unsigned int row = i % GPUCA_ROW_COUNT; + unsigned int unattachedOffset = (i >= cmprClusters.nSliceRows) ? 0 : cmprClusters.nSliceRowClusters[i]; + (clustersNative.nClusters)[slice][row] = decompressor.mNativeClustersIndex[i] + unattachedOffset; + for(unsigned int j = i+1; j < GPUCA_NSLICES * GPUCA_ROW_COUNT; j++){ + CAMath::AtomicAdd(decompressor.mUnattachedClustersOffsets + j,unattachedOffset); + } + } +}*/ diff --git a/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h new file mode 100644 index 0000000000000..82041d05fcb7c --- /dev/null +++ b/GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h @@ -0,0 +1,59 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCDecompressionKernels.h +/// \author Gabriele Cimador + +#ifndef GPUTPCDECOMPRESSIONKERNELS_H +#define GPUTPCDECOMPRESSIONKERNELS_H + +#include "GPUGeneralKernels.h" +#include "GPUO2DataTypes.h" +#include "GPUParam.h" +#include "GPUConstantMem.h" + +#ifdef GPUCA_HAVE_O2HEADERS +#include "DataFormatsTPC/CompressedClusters.h" +#else +namespace o2::tpc +{ +struct CompressedClusters { +}; +} // namespace o2::tpc +#endif + +namespace GPUCA_NAMESPACE::gpu +{ + +class GPUTPCDecompressionKernels : public GPUKernelTemplate +{ + public: + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCDecompression; } + + enum K : int { + step0attached = 0, + step1unattached = 1, + //prepareAccess = 2 + }; + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors); + GPUd() static void decompressTrack(o2::tpc::CompressedClusters& cmprClusters, const GPUParam& param, const unsigned int maxTime, const unsigned int trackIndex, unsigned int& clusterOffset, GPUTPCDecompression& decompressor); + GPUdi() static o2::tpc::ClusterNative decompressTrackStore(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int clusterOffset, unsigned int slice, unsigned int row, unsigned int pad, unsigned int time, GPUTPCDecompression& decompressor, bool& stored); + GPUdi() static void decompressHits(const o2::tpc::CompressedClusters& cmprClusters, const unsigned int start, const unsigned int end, o2::tpc::ClusterNative* clusterNativeBuffer); + + GPUd() static unsigned int computeLinearTmpBufferIndex(unsigned int slice, unsigned int row, unsigned int maxClustersPerBuffer){ + return slice * (GPUCA_ROW_COUNT * maxClustersPerBuffer) + row * maxClustersPerBuffer; + } +}; + +} +#endif // GPUTPCDECOMPRESSIONKERNELS_H diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h 
index 07f9035329847..395bdec043331 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -71,6 +71,8 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -135,6 +137,8 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 192, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 192, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 512 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 512 @@ -198,7 +202,9 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 64, 2 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 64, 2 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 3 + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 3 #define GPUCA_LB_GPUTPCCFCheckPadBaseline 64,8 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap 448 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits 448 @@ -262,7 +268,9 @@ #define GPUCA_LB_GPUTPCGMMergerFinalize_1 256 #define GPUCA_LB_GPUTPCGMMergerFinalize_2 256 #define GPUCA_LB_GPUTPCCompressionKernels_step0attached 128 + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 128 #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 512, 2 + #define 
GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 512, 2 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -322,6 +330,12 @@ #ifndef GPUCA_LB_GPUTPCCompressionKernels_step1unattached #define GPUCA_LB_GPUTPCCompressionKernels_step1unattached 256 #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step0attached + #define GPUCA_LB_GPUTPCDecompressionKernels_step0attached 256 + #endif + #ifndef GPUCA_LB_GPUTPCDecompressionKernels_step1unattached + #define GPUCA_LB_GPUTPCDecompressionKernels_step1unattached 256 + #endif #ifndef GPUCA_LB_GPUTPCCFDecodeZS #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 403aa0dae2800..1a0aba3533b4f 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -105,6 +105,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors() if (GetRecoSteps() & RecoStep::TPCCompression) { mRec->RegisterGPUProcessor(&processors()->tpcCompressor, GetRecoStepsGPU() & RecoStep::TPCCompression); } + if (GetRecoSteps() & RecoStep::TPCDecompression) { + mRec->RegisterGPUProcessor(&processors()->tpcDecompressor, GetRecoStepsGPU() & RecoStep::TPCDecompression); + } if (GetRecoSteps() & RecoStep::TPCClusterFinding) { for (unsigned int i = 0; i < NSLICES; i++) { mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); @@ -149,6 +152,9 @@ void GPUChainTracking::RegisterGPUProcessors() if (GetRecoStepsGPU() & RecoStep::TPCCompression) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcCompressor, &processors()->tpcCompressor); } + if (GetRecoStepsGPU() & RecoStep::TPCDecompression) { + mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcDecompressor, &processors()->tpcDecompressor); + } if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) { for 
(unsigned int i = 0; i < NSLICES; i++) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index 3c622751acad1..e0a4ba3ccd8a2 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -44,7 +44,6 @@ int GPUChainTracking::RunTPCCompression() } SetupGPUProcessor(&Compressor, true); new (Compressor.mMemory) GPUTPCCompression::memory; - WriteToConstantMemory(myStep, (char*)&processors()->tpcCompressor - (char*)processors(), &CompressorShadow, sizeof(CompressorShadow), 0); TransferMemoryResourcesToGPU(myStep, &Compressor, 0); runKernel(GetGridAutoStep(0, RecoStep::TPCCompression), krnlRunRangeNone, krnlEventNone, CompressorShadow.mClusterStatus, Compressor.mMaxClusters * sizeof(CompressorShadow.mClusterStatus[0])); @@ -84,7 +83,6 @@ int GPUChainTracking::RunTPCCompression() SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes. 
outputStream = OutputStream(); } - if (ProcessingSettings().tpcCompressionGatherMode >= 2) { if (ProcessingSettings().tpcCompressionGatherMode == 2) { void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat); @@ -208,11 +206,125 @@ int GPUChainTracking::RunTPCCompression() int GPUChainTracking::RunTPCDecompression() { #ifdef GPUCA_HAVE_O2HEADERS + // mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR")); + ClusterNativeAccess* original = new ClusterNativeAccess; + original->clustersLinear = new ClusterNative[mIOPtrs.clustersNative->nClustersTotal]; + memcpy((void*)original->clustersLinear,mIOPtrs.clustersNative->clustersLinear,mIOPtrs.clustersNative->nClustersTotal*sizeof(mIOPtrs.clustersNative->clustersLinear[0])); + memcpy((void*)original->nClusters,mIOPtrs.clustersNative->nClusters,NSLICES*GPUCA_ROW_COUNT*sizeof(mIOPtrs.clustersNative->nClusters[0][0])); + original->setOffsetPtrs(); + + RecoStep myStep = RecoStep::TPCDecompression; + bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression; // with -g gives true + GPUTPCDecompression& Decompressor = processors()->tpcDecompressor; + GPUTPCDecompression& DecompressorShadow = doGPU ? 
processorsShadow()->tpcDecompressor : Decompressor; const auto& threadContext = GetThreadContext(); + + CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters; + CompressedClusters& inputGPU = Decompressor.mInputGPU; + inputGPU.nAttachedClusters = cmprClsHost.nAttachedClusters; + inputGPU.nUnattachedClusters = cmprClsHost.nUnattachedClusters; + inputGPU.nTracks = cmprClsHost.nTracks; + inputGPU.nAttachedClustersReduced = inputGPU.nAttachedClusters - inputGPU.nTracks; + inputGPU.nSliceRows = NSLICES * GPUCA_ROW_COUNT; + inputGPU.nComppressionModes = param().rec.tpc.compressionTypeMask; + inputGPU.solenoidBz = param().bzkG; + inputGPU.maxTimeBin = param().par.continuousMaxTimeBin; + SetupGPUProcessor(&Decompressor, true); + + size_t copySize = AllocateRegisteredMemory(Decompressor.mMemoryResInputGPU); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); + + int outputStream = 0; + bool toGPU = true; + CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU; + SynchronizeStream(outputStream); + + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.nTrackClusters, cmprClsHost.nTrackClusters, cmprClsHost.nTracks * sizeof(cmprClsHost.nTrackClusters[0]), outputStream, toGPU); + + GPUMemCpyAlways(myStep, inputGPUShadow.qTotU, cmprClsHost.qTotU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qTotU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxU, cmprClsHost.qMaxU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.qMaxU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsU, cmprClsHost.flagsU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.flagsU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padDiffU, 
cmprClsHost.padDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.padDiffU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeDiffU, cmprClsHost.timeDiffU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.timeDiffU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.nSliceRowClusters, cmprClsHost.nSliceRowClusters, NSLICES * GPUCA_ROW_COUNT * sizeof(cmprClsHost.nSliceRowClusters[0]), outputStream, toGPU); + + GPUMemCpyAlways(myStep, inputGPUShadow.qTotA, cmprClsHost.qTotA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qTotA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.qMaxA, cmprClsHost.qMaxA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.qMaxA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.flagsA, cmprClsHost.flagsA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.flagsA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.rowDiffA, cmprClsHost.rowDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.rowDiffA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceLegDiffA, cmprClsHost.sliceLegDiffA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.sliceLegDiffA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padResA, cmprClsHost.padResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.padResA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeResA, cmprClsHost.timeResA, (cmprClsHost.nAttachedClusters - cmprClsHost.nTracks) * sizeof(cmprClsHost.timeResA[0]), outputStream, toGPU); + 
GPUMemCpyAlways(myStep, inputGPUShadow.sigmaPadA, cmprClsHost.sigmaPadA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaPadA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sigmaTimeA, cmprClsHost.sigmaTimeA, cmprClsHost.nAttachedClusters * sizeof(cmprClsHost.sigmaTimeA[0]), outputStream, toGPU); + + GPUMemCpyAlways(myStep, inputGPUShadow.qPtA, cmprClsHost.qPtA, cmprClsHost.nTracks * sizeof(cmprClsHost.qPtA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.rowA, cmprClsHost.rowA, cmprClsHost.nTracks * sizeof(cmprClsHost.rowA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.sliceA, cmprClsHost.sliceA, cmprClsHost.nTracks * sizeof(cmprClsHost.sliceA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.timeA, cmprClsHost.timeA, cmprClsHost.nTracks * sizeof(cmprClsHost.timeA[0]), outputStream, toGPU); + GPUMemCpyAlways(myStep, inputGPUShadow.padA, cmprClsHost.padA, cmprClsHost.nTracks * sizeof(cmprClsHost.padA[0]), outputStream, toGPU); + + runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone, DecompressorShadow.mNativeClustersIndex, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNativeClustersIndex[0])); + runKernel(GetGridAuto(0), krnlRunRangeNone, krnlEventNone); + + mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters; + + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput); + AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, 0); + SynchronizeStream(0); + + unsigned int offset = 0; + unsigned int decodedAttachedClusters = 0; + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + unsigned int linearIndex = i * GPUCA_ROW_COUNT + j; + unsigned int unattachedOffset = (linearIndex >= 
cmprClsHost.nSliceRows) ? 0 : cmprClsHost.nSliceRowClusters[linearIndex]; + (mClusterNativeAccess->nClusters)[i][j] = Decompressor.mNativeClustersIndex[linearIndex] + unattachedOffset; + Decompressor.mUnattachedClustersOffsets[linearIndex] = offset; + offset += unattachedOffset; + decodedAttachedClusters += Decompressor.mNativeClustersIndex[linearIndex]; + } + } + LOGP(info,"decoded = {}",decodedAttachedClusters); + TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression,Decompressor.mResourceTmpClustersOffsets,0); + if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) { + GPUWarning("My version: %u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters); + } else { + GPUInfo("My version: all attached clusters have been decoded"); + } + + Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput; + DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer; + mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer; + mClusterNativeAccess->setOffsetPtrs(); + mIOPtrs.clustersNative = mClusterNativeAccess.get(); + *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; + processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; + WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0); + WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), 0); + TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); + mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput; + mClusterNativeAccess->setOffsetPtrs(); + 
*mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; + processors()->ioPtrs.clustersNative = mInputsHost->mPclusterNativeAccess; + + runKernel(GetGridAutoStep(0, RecoStep::TPCDecompression), krnlRunRangeNone, krnlEventNone); + + ClusterNative* tmpBuffer = new ClusterNative[mInputsHost->mNClusterNative]; + ClusterNativeAccess gpuBuffer = *mInputsHost->mPclusterNativeAccess; + gpuBuffer.clustersLinear = tmpBuffer; + //GPUMemCpy(RecoStep::TPCDecompression,mInputsHost->mPclusterNativeOutput,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); + GPUMemCpy(RecoStep::TPCDecompression,tmpBuffer,mInputsShadow->mPclusterNativeBuffer, sizeof(mInputsShadow->mPclusterNativeBuffer[0]) * mIOPtrs.clustersNative->nClustersTotal,0,false); + gpuBuffer.setOffsetPtrs(); + TPCClusterDecompressor decomp; auto allocator = [this](size_t size) { this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size; - this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); + //this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); return this->mInputsHost->mPclusterNativeOutput; }; auto& gatherTimer = getTimer("TPCDecompression", 0); @@ -224,7 +336,7 @@ int GPUChainTracking::RunTPCDecompression() gatherTimer.Stop(); mIOPtrs.clustersNative = mClusterNativeAccess.get(); if (mRec->IsGPU()) { - AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); + //AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess; WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, 
sizeof(processorsShadow()->ioPtrs), 0); *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative; @@ -234,6 +346,50 @@ int GPUChainTracking::RunTPCDecompression() TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, 0); SynchronizeStream(0); } + + const ClusterNativeAccess* decoded = &gpuBuffer; //mIOPtrs.clustersNative; + // original = (ClusterNativeAccess*)mIOPtrs.clustersNative; + unsigned int decodingErrors = 0; + std::vector tmpClusters; + if (param().rec.tpc.rejectionStrategy == GPUSettings::RejectionNone) { // verification does not make sense if we reject clusters during compression + for (unsigned int i = 0; i < NSLICES; i++) { + for (unsigned int j = 0; j < GPUCA_ROW_COUNT; j++) { + if (original->nClusters[i][j] != decoded->nClusters[i][j]) { + GPUError("Number of clusters mismatch slice %u row %u: expected %d v.s. decoded %d", i, j, original->nClusters[i][j], decoded->nClusters[i][j]); + decodingErrors++; + continue; + } + tmpClusters.resize(original->nClusters[i][j]); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + tmpClusters[k] = original->clusters[i][j][k]; + if (param().rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) { + GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param()); + GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param()); + GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param()); + } + } + std::sort(tmpClusters.begin(), tmpClusters.end()); + for (unsigned int k = 0; k < original->nClusters[i][j]; k++) { + const o2::tpc::ClusterNative& c1 = tmpClusters[k]; + const o2::tpc::ClusterNative& c2 = decoded->clusters[i][j][k]; + if (!(c1 == c2)) { + if (decodingErrors++ < 100) { + //GPUWarning("Cluster mismatch: slice %2u row %3u hit %5u: %6d %3d %4d %3d %3d %4d %4d", i, j, k, 
(int)c1.getTimePacked(), (int)c1.getFlags(), (int)c1.padPacked, (int)c1.sigmaTimePacked, (int)c1.sigmaPadPacked, (int)c1.qMax, (int)c1.qTot); + //GPUWarning("%45s %6d %3d %4d %3d %3d %4d %4d", "", (int)c2.getTimePacked(), (int)c2.getFlags(), (int)c2.padPacked, (int)c2.sigmaTimePacked, (int)c2.sigmaPadPacked, (int)c2.qMax, (int)c2.qTot); + } + } + } + } + } + if (decodingErrors) { + GPUWarning("Errors during cluster decoding %u\n", decodingErrors); + } else { + GPUInfo("Cluster decoding verification: PASSED"); + } + } + delete[] tmpBuffer; + // mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCDCMPR")); #endif return 0; } diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 8df8094fe44d3..5805bc386ab2e 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -667,7 +667,7 @@ int RunBenchmark(GPUReconstruction* recUse, GPUChainTracking* chainTrackingUse, chainTrackingAsync->mIOPtrs.rawClusters[i] = nullptr; chainTrackingAsync->mIOPtrs.nRawClusters[i] = 0; } - chainTrackingAsync->mIOPtrs.clustersNative = nullptr; + chainTrackingAsync->mIOPtrs.clustersNative = chainTrackingUse->mIOPtrs.clustersNative; // TODO: revert to nullptr before merging recAsync->SetResetTimers(iRun < configStandalone.runsInit); tmpRetVal = recAsync->RunChains(); if (tmpRetVal == 0 || tmpRetVal == 2) { @@ -948,6 +948,7 @@ int main(int argc, char** argv) printf("Error unregistering memory\n"); } } + //exit(0); rec->Exit(); if (!configStandalone.noprompt) { diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index b06749010d2d9..d7302f17271d2 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -73,6 +73,8 @@ o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" LB simple)
o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" LB simple) o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" LB simple) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" LB simple) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" LB simple) o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" LB single) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" LB single) o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" LB single)