diff --git a/.gitignore b/.gitignore
index 451461d..94ff03d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ imgui.ini
 *.cpp
 *.c
 .*
+include/
diff --git a/README.md b/README.md
index a86c101..b5ea39f 100644
--- a/README.md
+++ b/README.md
@@ -2,39 +2,58 @@
 
 Python wrapper for the OptiX 7 raytracing engine.
 
-Python-OptiX wraps the OptiX C++ API using Cython and provides a simplified 
-interface to the original C-like API using mainly the 
-[CuPy](https://cupy.dev) package.
+Python-OptiX wraps the original OptiX C-like API using Cython while aiming to provide a more
+pythonic, object-oriented interface using the [CuPy](https://cupy.dev) package.
 
 ### Supported Platforms
 
-Only Linux is supported at the moment.
+Only Linux is officially supported at the moment. Experimental windows support is available.
 
 ### OptiX Versions
 
-Python-OptiX currently supports the OptiX releases 7.3.0 and 7.4.0
+Python-OptiX always supports the most recent version of the OptiX SDK. 
+The current version therefore supports OptiX 7.6.0
 
 ## Installation
 
 ### Dependencies
 
 Install a recent version of the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads)
-and the [OptiX 7.4.0 SDK](https://developer.nvidia.com/optix/downloads/7.4.0/linux64-x86_64)
+and the [OptiX 7.6.0 SDK](https://developer.nvidia.com/optix/downloads/7.6.0/linux64-x86_64)
 
-Note: The older [OptiX 7.3.0 SDK](https://developer.nvidia.com/optix/downloads/7.4.0/linux64-x86_64) version is supported as well.
+Make sure the CUDA header files are installed as well. 
 
-Make sure the CUDA header files are installed as well.
+Note, that for some variants of the CUDA Toolkit, 
+like the one installed by the `conda` package manager, these are not installed by default. 
+`conda`-environments require the additional `cudatoolkit-dev` package.
 
-Add the locations of CUDA and OptiX to the system `PATH` variable if necessary.
+### Environment
+
+`python-optix` requires both the OptiX as well as the CUDA include path during setup as well as runtime 
+to compile the CUDA kernels. Therefore, it is necessary to either add both locations to the system `PATH`
+or set the `CUDA_PATH` and `OPTIX_PATH` variables to the respective locations.
+
+The setup additionally has the option to embed the OptiX header files into the `python-optix` installation. 
+If the variable `OPTIX_EMBED_HEADERS` is set to `1`, the setup will copy the headers from the 
+OptiX SDK directory into the generated wheel.
+
+If this option was chosen during setup, setting the `OPTIX_PATH` is no longer required as the 
+embedded headers will be utilized then.
 
 ### Using pip
 ```
-pip install python-optix
+export OPTIX_PATH=/path/to/optix
+export CUDA_PATH=/path/to/cuda_toolkit
+export OPTIX_EMBED_HEADERS=1 # embed the optix headers into the package
+python -m pip install python-optix
 ```
 
 ### From source
 ```
 git clone https://github.com/mortacious/python-optix.git
 cd python-optix
-python setup.py install
+export OPTIX_PATH=/path/to/optix
+export CUDA_PATH=/path/to/cuda_toolkit
+export OPTIX_EMBED_HEADERS=1 # embed the optix headers into the package
+python -m pip install [-e] .
 ```
diff --git a/examples/compile_with_tasks.py b/examples/compile_with_tasks.py
index 9e967ff..f3b2a35 100644
--- a/examples/compile_with_tasks.py
+++ b/examples/compile_with_tasks.py
@@ -72,4 +72,4 @@
         tic = time.time()
         for i in range(args.num_iters):
             module = ox.Module(ctx, ptx, module_compile_options=compile_opts, pipeline_compile_options=pipeline_options)
-        print("Overall run time without tasks", time.time()-tic)
\ No newline at end of file
+        print("Overall run time without tasks", time.time()-tic)
diff --git a/examples/cuda/helpers.h b/examples/cuda/helpers.h
index 046cb47..af0995c 100644
--- a/examples/cuda/helpers.h
+++ b/examples/cuda/helpers.h
@@ -42,11 +42,6 @@ __forceinline__ __device__ float3 toSRGB( const float3& c )
         c.z < 0.0031308f ? 12.92f * c.z : 1.055f * powed.z - 0.055f );
 }
 
-//__forceinline__ __device__ float dequantizeUnsigned8Bits( const unsigned char i )
-//{
-//    enum { N = (1 << 8) - 1 };
-//    return min((float)i / (float)N), 1.f)
-//}
 __forceinline__ __device__ unsigned char quantizeUnsigned8Bits( float x )
 {
     x = clamp( x, 0.0f, 1.0f );
@@ -60,6 +55,7 @@ __forceinline__ __device__ uchar4 make_color( const float3& c )
     float3 srgb = toSRGB( clamp( c, 0.0f, 1.0f ) );
     return make_uchar4( quantizeUnsigned8Bits( srgb.x ), quantizeUnsigned8Bits( srgb.y ), quantizeUnsigned8Bits( srgb.z ), 255u );
 }
+
 __forceinline__ __device__ uchar4 make_color( const float4& c )
 {
     return make_color( make_float3( c.x, c.y, c.z ) );
diff --git a/examples/cuda/opacity_micromap.cu b/examples/cuda/opacity_micromap.cu
new file mode 100644
index 0000000..d523bfe
--- /dev/null
+++ b/examples/cuda/opacity_micromap.cu
@@ -0,0 +1,143 @@
+//
+// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include <optix.h>
+
+#include "opacity_micromap.h"
+#include "helpers.h"
+
+#include "vec_math.h"
+
+extern "C" {
+__constant__ Params params;
+}
+
+
+static __forceinline__ __device__ void setPayloadColor( float3 p )
+{
+    optixSetPayload_0( __float_as_uint( p.x ) );
+    optixSetPayload_1( __float_as_uint( p.y ) );
+    optixSetPayload_2( __float_as_uint( p.z ) );
+}
+
+static __forceinline__ __device__ void setPayloadAnyhit( unsigned int a )
+{
+    optixSetPayload_3( a );
+}
+
+
+static __forceinline__ __device__ void computeRay( uint3 idx, uint3 dim, float3& origin, float3& direction )
+{
+    const float3 U = params.cam_u;
+    const float3 V = params.cam_v;
+    const float3 W = params.cam_w;
+    const float2 d = 2.0f * make_float2(
+            static_cast<float>( idx.x ) / static_cast<float>( dim.x ),
+            static_cast<float>( idx.y ) / static_cast<float>( dim.y )
+            ) - 1.0f;
+
+    origin    = params.cam_eye;
+    direction = normalize( d.x * U + d.y * V + W );
+}
+
+
+extern "C" __global__ void __raygen__rg()
+{
+    // Lookup our location within the launch grid
+    const uint3 idx = optixGetLaunchIndex();
+    const uint3 dim = optixGetLaunchDimensions();
+
+    // Map our launch idx to a screen location and create a ray from the camera
+    // location through the screen
+    float3 ray_origin, ray_direction;
+    computeRay( idx, dim, ray_origin, ray_direction );
+
+    // Trace the ray against our scene hierarchy
+    unsigned int p0, p1, p2, p3=0;
+    optixTrace(
+            params.handle,
+            ray_origin,
+            ray_direction,
+            0.0f,                // Min intersection distance
+            1e16f,               // Max intersection distance
+            0.0f,                // rayTime -- used for motion blur
+            OptixVisibilityMask( 255 ), // Specify always visible
+            OPTIX_RAY_FLAG_NONE,
+            0,                   // SBT offset   -- See SBT discussion
+            1,                   // SBT stride   -- See SBT discussion
+            0,                   // missSBTIndex -- See SBT discussion
+            p0, p1, p2, p3 );
+    float3 result;
+    result.x = __uint_as_float( p0 );
+    result.y = __uint_as_float( p1 );
+    result.z = __uint_as_float( p2 );
+    unsigned int anyhit_executed = p3;
+
+    // If anyhit was executed, tint the pixel towards white
+    if( anyhit_executed )
+        result = lerp( result, make_float3( 1.0f), 0.075f );
+
+    // Record results in our output raster
+    params.image[idx.y * params.image_width + idx.x] = make_color( result );
+}
+
+
+extern "C" __global__ void __miss__ms()
+{
+    MissData* miss_data  = reinterpret_cast<MissData*>( optixGetSbtDataPointer() );
+    setPayloadColor(  miss_data->bg_color );
+}
+
+
+extern "C" __global__ void __closesthit__ch()
+{
+    // When built-in triangle intersection is used, a number of fundamental
+    // attributes are provided by the OptiX API, including barycentric coordinates.
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+
+    setPayloadColor( make_float3( barycentrics*0.5f, 0.5f ) );
+}
+
+
+extern "C" __global__ void __anyhit__opacity()
+{
+    setPayloadAnyhit( 1u ); // Register that anyhit was invoked
+
+    const HitGroupData* rt_data      = reinterpret_cast<HitGroupData*>( optixGetSbtDataPointer() );
+    const float2        barycentrics = optixGetTriangleBarycentrics();
+    const int           prim_idx     = optixGetPrimitiveIndex();
+
+    const float2 uv0 = rt_data->uvs[ prim_idx*3 + 0 ];
+    const float2 uv1 = rt_data->uvs[ prim_idx*3 + 1 ];
+    const float2 uv2 = rt_data->uvs[ prim_idx*3 + 2 ];
+    //printf("AH: uv0: (%f, %f) uv1: (%f, %f) uv2: (%f, %f)\n", uv0.x, uv0.y, uv1.x, uv1.y, uv2.x, uv2.y);
+    const float2 uv = computeUV( barycentrics, uv0, uv1, uv2 );
+
+    if( inCircle( uv ) )
+        optixIgnoreIntersection();
+}
diff --git a/examples/cuda/opacity_micromap.h b/examples/cuda/opacity_micromap.h
new file mode 100644
index 0000000..290048e
--- /dev/null
+++ b/examples/cuda/opacity_micromap.h
@@ -0,0 +1,82 @@
+//
+// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+#include "vec_math.h"
+
+constexpr float CIRCLE_RADIUS = 0.75f;
+
+//-----------------------------------------------------------------------------
+//
+// Helper functions to be used when pre-baking opacity into OMM and when
+// evaluating opacity within anyhit function
+//
+//-----------------------------------------------------------------------------
+
+static __host__ __device__ __inline__ float2 computeUV( float2 bary, float2 uv0, float2 uv1, float2 uv2 )
+{
+    return ( 1.0f - bary.x - bary.y )*uv0 + bary.x*uv1 + bary.y*uv2;
+}
+
+static __host__ __device__ __inline__ bool inCircle( const float2 uv )
+{
+    return ( uv.x * uv.x + uv.y * uv.y ) < ( CIRCLE_RADIUS * CIRCLE_RADIUS );
+};
+
+
+//-----------------------------------------------------------------------------
+//
+// Types
+//
+//-----------------------------------------------------------------------------
+struct Params
+{
+    uchar4*                image;
+    unsigned int           image_width;
+    unsigned int           image_height;
+    float3                 cam_eye;
+    float3                 cam_u, cam_v, cam_w;
+    OptixTraversableHandle handle;
+};
+
+
+struct RayGenData
+{
+    // No data needed
+};
+
+
+struct MissData
+{
+    float3 bg_color;
+};
+
+
+struct HitGroupData
+{
+    float2* uvs;
+};
diff --git a/examples/denolser.py b/examples/denoiser.py
similarity index 96%
rename from examples/denolser.py
rename to examples/denoiser.py
index 6e3d30f..2fcf1f0 100644
--- a/examples/denolser.py
+++ b/examples/denoiser.py
@@ -59,10 +59,10 @@
     ret = cp.asnumpy(ret)
 
     fig, axs = plt.subplots(nrows=2, sharex=True, sharey=True)
-    axs[0].imshow(np.clip(color_image, 0, 255).astype(np.uint8))
+    axs[0].imshow(np.clip(color_image, 0, 1).astype(np.float32))
     axs[0].set_title("original")
 
-    ret = np.clip(ret, 0, 255).astype(np.uint8)
+    ret = np.clip(ret, 0, 1).astype(np.float32)
     axs[1].imshow(ret)
     axs[1].set_title("denoised")
     plt.show()
\ No newline at end of file
diff --git a/examples/dynamic_geometry.py b/examples/dynamic_geometry.py
index 49f94df..f9344ae 100644
--- a/examples/dynamic_geometry.py
+++ b/examples/dynamic_geometry.py
@@ -18,6 +18,16 @@
 
 DEBUG=False
 
+if DEBUG:
+    exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW,
+    debug_level = ox.CompileDebugLevel.FULL
+    opt_level = ox.CompileOptimizationLevel.LEVEL_0
+else:
+    exception_flags=ox.ExceptionFlags.NONE
+    debug_level = ox.CompileDebugLevel.MINIMAL
+    opt_level = ox.CompileOptimizationLevel.LEVEL_3
+
+
 #------------------------------------------------------------------------------
 # Local types
 #------------------------------------------------------------------------------
@@ -41,7 +51,9 @@ def __init__(self):
 
     def __getattribute__(self, name):
         if name in Params._params.keys():
-            return self.__dict__['handle'][name]
+            item = self.__dict__['handle'][name]
+            if isinstance(item, np.ndarray) and item.shape in ((0,), (1,)):
+                return item.item()
         else:
             return super().__getattribute__(name)
 
@@ -336,15 +348,9 @@ def build_mesh_accel(state):
 
 
 def create_module(state):
-    if DEBUG:
-        exception_flags=ox.ExceptionFlags.DEBUG | ox.ExceptionFlags.TRACE_DEPTH | ox.ExceptionFlags.STACK_OVERFLOW,
-    else:
-        exception_flags=ox.ExceptionFlags.NONE
-
-    print("Triangle value", ox.PrimitiveTypeFlags.TRIANGLE.value)
     pipeline_opts = ox.PipelineCompileOptions(
             uses_motion_blur=False,
-            uses_primitive_type_flags =ox.PrimitiveTypeFlags.TRIANGLE,
+            uses_primitive_type_flags=ox.PrimitiveTypeFlags.TRIANGLE,
             traversable_graph_flags=ox.TraversableGraphFlags.ALLOW_SINGLE_LEVEL_INSTANCING,
             exception_flags=exception_flags,
             num_payload_values=3,
@@ -353,8 +359,7 @@ def create_module(state):
 
     compile_opts = ox.ModuleCompileOptions(
             max_register_count=ox.ModuleCompileOptions.DEFAULT_MAX_REGISTER_COUNT,
-            opt_level=ox.CompileOptimizationLevel.DEFAULT,
-            debug_level=ox.CompileDebugLevel.MODERATE)
+            opt_level=opt_level, debug_level=debug_level)
 
     cuda_source = os.path.join(script_dir, 'cuda', 'dynamic_geometry.cu')
     state.module = ox.Module(state.ctx, cuda_source, compile_opts, pipeline_opts)
@@ -371,7 +376,7 @@ def create_pipeline(state):
     program_grps = [state.raygen_grp, state.miss_grp, state.hit_grp]
 
     link_opts = ox.PipelineLinkOptions(max_trace_depth=1,
-                                       debug_level=ox.CompileDebugLevel.MODERATE)
+                                       debug_level=debug_level)
 
     pipeline = ox.Pipeline(state.ctx,
                            compile_options=state.pipeline_opts,
@@ -414,7 +419,7 @@ def create_sbt(state):
     animation_time = 1.0
 
     buffer_format = BufferImageFormat.UCHAR4
-    output_buffer_type = CudaOutputBufferType.CUDA_DEVICE
+    output_buffer_type = CudaOutputBufferType.enable_gl_interop()
 
     init_camera_state(state)
     create_context(state)
@@ -479,7 +484,7 @@ def create_sbt(state):
 
         glfw.swap_buffers(window)
 
-        state.params.subframe_index = state.params.subframe_index.item() + 1
+        state.params.subframe_index = state.params.subframe_index+ 1
 
     impl.shutdown()
     glfw.terminate()
diff --git a/examples/dynamic_materials.py b/examples/dynamic_materials.py
index 5619419..9303665 100644
--- a/examples/dynamic_materials.py
+++ b/examples/dynamic_materials.py
@@ -38,7 +38,9 @@ def __init__(self):
 
     def __getattribute__(self, name):
         if name in Params._params.keys():
-            return self.__dict__['handle'][name]
+            item = self.__dict__['handle'][name]
+            if isinstance(item, np.ndarray) and item.shape in ((0,), (1,)):
+                return item.item()
         else:
             return super().__getattribute__(name)
 
diff --git a/examples/hello.py b/examples/hello.py
index e79fdaa..a7f5422 100644
--- a/examples/hello.py
+++ b/examples/hello.py
@@ -1,15 +1,19 @@
+import os, sys, logging
 import optix as ox
 import cupy as cp
 import numpy as np
 from PIL import Image, ImageOps
-import logging
-import sys
+
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 log = logging.getLogger()
 
+script_dir = os.path.dirname(__file__)
+cuda_src = os.path.join(script_dir, "cuda", "hello.cu")
+
+
 def create_module(ctx, pipeline_opts):
     compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
-    module = ox.Module(ctx, 'cuda/hello.cu', compile_opts, pipeline_opts)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
     return module
 
 
diff --git a/examples/opacity_micromap.py b/examples/opacity_micromap.py
new file mode 100644
index 0000000..8478e93
--- /dev/null
+++ b/examples/opacity_micromap.py
@@ -0,0 +1,308 @@
+# 
+#  Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# 
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#   * Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer.
+#   * Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#   * Neither the name of NVIDIA CORPORATION nor the names of its
+#     contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission.
+# 
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+#  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# 
+# 
+# -----------------------------------------------------------------------------
+# 
+#  A simple demonstration of opacity micromaps.
+# 
+#  * A single quad, made of two triangles ABC and ACD is rendered with a
+#    transparent circular cutout at its center.
+#  * OMMs are applied to the two triangles to accelerate the evaluation of the
+#    opacity function during traversal.
+#  * As a preproces, OMM microtriangles are marked as either completely
+#    transparent, completely opaque, or unknown.
+#  * During traversal, rays that hit opaque or transparent regions of the OMM
+#    can skip the anyhit function.
+#  * Rays that hit 'unknown' regions of the OMM evaluate the anyhit to get
+#    accurate evaluation of the opacity function.
+#  * Regions of the micromap which are unknown are tinted a lighter color to
+#    visualize the regions which required anyhit evaluation.
+# 
+# -----------------------------------------------------------------------------
+
+
+import os, sys, logging, collections
+
+import cupy as cp
+import numpy as np
+import optix as ox
+from sutil.camera import Camera
+from PIL import Image, ImageOps
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+cuda_src = os.path.join(script_dir, "cuda", "opacity_micromap.cu")
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+log = logging.getLogger()
+
+DEBUG = False
+
+OMM_SUBDIV_LEVEL = 4
+NUM_TRIS         = 2
+DEFAULT_WIDTH    = 1024
+DEFAULT_HEIGHT   = 768
+CIRCLE_RADIUS = 0.75
+
+g_uvs = np.array([[[1.0, -1.0], [-1.0, -1.0], [-1.0, 1.0]],
+                  [[1.0, -1.0], [-1.0, 1.0], [1.0, 1.0]]], dtype=np.float32)
+
+d_uvs = cp.asarray(g_uvs)
+
+vertices = np.array([[-0.5, -0.5, 0.0],
+                     [ 0.5, -0.5, 0.0],
+                     [ 0.5,  0.5, 0.0],
+
+                     [-0.5, -0.5, 0.0],
+                     [ 0.5,  0.5, 0.0],
+                     [-0.5,  0.5, 0.0]], dtype=np.float32)
+
+
+class Params:
+    _params = collections.OrderedDict([
+            ('image',        'u8'),
+            ('image_width',  'u4'),
+            ('image_height', 'u4'),
+            ('cam_eye',      '3f4'),
+            ('camera_u',     '3f4'),
+            ('camera_v',     '3f4'),
+            ('camera_w',     '3f4'),
+            ('trav_handle', 'u8'),
+        ])
+
+    def __init__(self):
+        self.handle = ox.LaunchParamsRecord(names=tuple(self._params.keys()),
+                                            formats=tuple(self._params.values()))
+
+    def __getattribute__(self, name):
+        if name in Params._params.keys():
+            return self.__dict__['handle'][name]
+        else:
+            return super().__getattribute__(name)
+
+    def __setattr__(self, name, value):
+        if name in Params._params.keys():
+            self.handle[name] = value
+        elif name in {'handle'}:
+            super().__setattr__(name, value)
+        else:
+            raise AttributeError(name)
+
+    def __str__(self):
+        return '\n'.join(f'{k}:  {self.handle[k]}' for k in self._params)
+
+
+##------------------------------------------------------------------------------
+##
+## Helper Functions
+##
+##------------------------------------------------------------------------------
+
+
+def init_camera(params):
+    camera = Camera()
+    camera.eye = (0, 0, 1.5)
+    camera.look_at = (0, 0, 0)
+    camera.up = (0, 1, 3)
+    camera.fov_y = 45
+    camera.aspect_ratio = params.image_width / params.image_height
+
+    u, v, w = camera.uvw_frame()
+    params.camera_u = u
+    params.camera_v = v
+    params.camera_w = w
+    params.cam_eye = camera.eye
+
+
+def compute_uv(bary, uv0, uv1, uv2):
+    bary = bary[np.newaxis, :, np.newaxis]
+    return (1.0 - bary[..., 0] - bary[..., 1]) * uv0[:, np.newaxis] + bary[..., 0] * uv1[:, np.newaxis] + bary[..., 1] * uv2[:, np.newaxis]
+
+
+def in_circle(uv, radius):
+    return (uv[..., 0] * uv[..., 0] + uv[..., 1] * uv[..., 1]) < (radius * radius)
+
+
+def evaluate_opacity(bary0, bary1, bary2, uvs, radius):
+    """
+    Calculate the texture coordinate at the micromesh vertices of the triangle and
+    determine if the triangle is inside, outside, or spanning the boundary of the circle.
+    Note that the tex coords are in [-1, 1] and the circle is centered at uv=(0,0).
+    """
+
+    uv0 = compute_uv(bary0, uvs[:, 0, :], uvs[:, 1, :], uvs[:, 2, :])
+    uv1 = compute_uv(bary1, uvs[:, 0, :], uvs[:, 1, :], uvs[:, 2, :])
+    uv2 = compute_uv(bary2, uvs[:, 0, :], uvs[:, 1, :], uvs[:, 2, :])
+
+    in_circle0 = in_circle(uv0, radius)
+    in_circle1 = in_circle(uv1, radius)
+    in_circle2 = in_circle(uv2, radius)
+
+    opacity = np.full_like(in_circle0, dtype=np.uint8, fill_value=ox.OpacityMicromapState.UNKNOWN_OPAQUE)
+
+    transparent = np.logical_and.reduce((in_circle0, in_circle1, in_circle2))
+    opaque = np.logical_and.reduce((~in_circle0, ~in_circle1, ~in_circle2))
+    opacity[transparent] = ox.OpacityMicromapState.TRANSPARENT
+    opacity[opaque] = ox.OpacityMicromapState.OPAQUE
+
+    return opacity
+
+
+def create_opacity_micromap(ctx):
+    NUM_MICRO_TRIS = 1 << (OMM_SUBDIV_LEVEL * 2)
+
+    # this has to be compressed later
+    uTriIs = np.arange(0, NUM_MICRO_TRIS, dtype=np.uint32)
+
+    bary0, bary1, bary2 = ox.micromap_indices_to_base_barycentrics(uTriIs, OMM_SUBDIV_LEVEL)
+    omm_input_data = evaluate_opacity(bary0, bary1, bary2, g_uvs, CIRCLE_RADIUS)
+
+    # construct the omm input from the data array (this will also bake the data into the format required by optix)
+    omm_input = ox.OpacityMicromapInput(omm_input_data, format=ox.OpacityMicromapFormat.FOUR_STATE)
+    omm = ox.OpacityMicromapArray(ctx, omm_input)
+    return omm
+
+
+def create_acceleration_structure(ctx, vertices, omm):
+    usage_counts = [2]
+    index_buffer = np.array([0, 1], dtype=np.uint16)
+    omm_build_input = ox.BuildInputOpacityMicromap(omm, usage_counts,
+                                                   ox.OpacityMicromapArrayIndexingMode.INDEXED,
+                                                   index_buffer=index_buffer)
+    triangle_input = ox.BuildInputTriangleArray(vertices,
+                                                flags=[ox.GeometryFlags.NONE],
+                                                opacity_micromap=omm_build_input)
+    gas = ox.AccelerationStructure(ctx, triangle_input, compact=True)
+    return gas
+
+
+def create_module(ctx, pipeline_opts):
+    compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.NONE, opt_level=ox.CompileOptimizationLevel.LEVEL_3)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
+    return module
+
+
+def create_program_groups(ctx, module):
+    raygen_grp = ox.ProgramGroup.create_raygen(ctx, module, "__raygen__rg")
+    miss_grp = ox.ProgramGroup.create_miss(ctx, module, "__miss__ms")
+    hit_grp = ox.ProgramGroup.create_hitgroup(ctx, module,
+                                              entry_function_CH="__closesthit__ch",
+                                              entry_function_AH="__anyhit__opacity")
+    return raygen_grp, miss_grp, hit_grp
+
+
+def create_pipeline(ctx, program_grps, pipeline_options):
+    link_opts = ox.PipelineLinkOptions(max_trace_depth=1,
+                                       debug_level=ox.CompileDebugLevel.NONE)
+
+    pipeline = ox.Pipeline(ctx,
+                           compile_options=pipeline_options,
+                           link_options=link_opts,
+                           program_groups=program_grps)
+    pipeline.compute_stack_sizes(1,  # max_trace_depth
+                                 0,  # max_cc_depth
+                                 0)  # max_dc_depth
+
+    return pipeline
+
+
+def create_sbt(program_grps):
+    raygen_grp, miss_grp, hit_grp = program_grps
+
+    raygen_sbt = ox.SbtRecord(raygen_grp)
+    miss_sbt = ox.SbtRecord(miss_grp, names=('bg_color',), formats=('3f4',))
+    miss_sbt['bg_color'] = [0.01, 0.01, 0.01]
+
+    hit_sbt = ox.SbtRecord(hit_grp, names=('uvs',), formats=('u8',))
+    hit_sbt["uvs"] = d_uvs.data.ptr
+
+    sbt = ox.ShaderBindingTable(raygen_record=raygen_sbt, miss_records=miss_sbt, hitgroup_records=hit_sbt)
+
+    return sbt
+
+
+def launch_pipeline(pipeline: ox.Pipeline, sbt, params):
+    img_size = (params.image_width[0], params.image_height[0])
+    output_image = np.zeros(img_size + (4, ), 'B')
+    output_image = cp.asarray(output_image)
+
+    params.image = output_image.data.ptr
+
+    stream = cp.cuda.Stream()
+    pipeline.launch(sbt, dimensions=img_size, params=params.handle, stream=stream)
+
+    stream.synchronize()
+    return cp.asnumpy(output_image)
+
+
+if __name__ == "__main__":
+    logger = ox.Logger(log)
+    ctx = ox.DeviceContext(validation_mode=True,
+                           log_callback_function=logger,
+                           log_callback_level=4)
+    ctx.cache_enabled = False
+
+    params = Params()
+    params.image_width = DEFAULT_WIDTH
+    params.image_height = DEFAULT_HEIGHT
+
+    init_camera(params)
+
+    opacity_micromap = create_opacity_micromap(ctx)
+    gas = create_acceleration_structure(ctx, vertices, opacity_micromap)
+    params.trav_handle = gas.handle
+
+    pipeline_options = ox.PipelineCompileOptions(traversable_graph_flags=ox.TraversableGraphFlags.ALLOW_SINGLE_GAS,
+                                                 num_payload_values=4,
+                                                 num_attribute_values=2,
+                                                 uses_motion_blur=False,
+                                                 exception_flags=ox.ExceptionFlags.NONE,
+                                                 uses_primitive_type_flags=ox.PrimitiveTypeFlags.TRIANGLE,
+                                                 pipeline_launch_params_variable_name="params",
+                                                 allow_opacity_micromaps=True)
+
+    module = create_module(ctx, pipeline_options)
+    program_grps = create_program_groups(ctx, module)
+    pipeline = create_pipeline(ctx, program_grps, pipeline_options)
+
+    sbt = create_sbt(program_grps)
+
+    img = launch_pipeline(pipeline, sbt, params)
+
+    img = img.reshape(params.image_height[0], params.image_width[0], 4)
+    img = ImageOps.flip(Image.fromarray(img, 'RGBA'))
+    img.show()
+
+
+
+
+
+
+
+
+
diff --git a/examples/spheres.py b/examples/spheres.py
index a7776e9..c2b55d3 100644
--- a/examples/spheres.py
+++ b/examples/spheres.py
@@ -1,11 +1,15 @@
+import os, sys, logging
 import optix as ox
 import cupy as cp
 import numpy as np
 from PIL import Image, ImageOps
-import logging
-import sys
+
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 log = logging.getLogger()
+
+script_dir = os.path.dirname(__file__)
+cuda_src = os.path.join(script_dir, "cuda", "spheres.cu")
+
 img_size = (1024, 768)
 
 def compute_spheres_bbox(centers, radii):
@@ -23,7 +27,7 @@ def create_acceleration_structure(ctx, bboxes):
 
 def create_module(ctx, pipeline_opts):
     compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
-    module = ox.Module(ctx, 'cuda/spheres.cu', compile_opts, pipeline_opts)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
     return module
 
 
diff --git a/examples/sutil/cuda_output_buffer.py b/examples/sutil/cuda_output_buffer.py
index 059fa6d..d09353f 100644
--- a/examples/sutil/cuda_output_buffer.py
+++ b/examples/sutil/cuda_output_buffer.py
@@ -1,4 +1,5 @@
-import enum
+import sys, os, enum
+from packaging import version
 
 import numpy as np
 import cupy as cp
@@ -7,6 +8,49 @@
 
 from .vecmath import vtype_to_dtype
 
+try:
+    import cuda as _cuda
+    from cuda import cudart
+    has_cudart = True
+    has_gl_interop = version.parse(_cuda.__version__) >= version.parse("11.6.0")
+except ImportError:
+    cudart = None
+    has_cudart = False
+    has_gl_interop = False
+
+_cuda_opengl_interop_msg = (
+    "Cuda Python low level bindings v11.6.0 or later are required to enable "
+   f"Cuda/OpenGL interoperability.{os.linesep}You can install the missing package with:"
+   f"{os.linesep}  {sys.executable} -m pip install --upgrade --user cuda-python"
+)
+
+if has_cudart:
+    def format_cudart_err(err):
+        return (
+            f"{cudart.cudaGetErrorName(err)[1].decode('utf-8')}({int(err)}): "
+            f"{cudart.cudaGetErrorString(err)[1].decode('utf-8')}"
+        )
+
+
+    def check_cudart_err(args):
+        if isinstance(args, tuple):
+            assert len(args) >= 1
+            err = args[0]
+            if len(args) == 1:
+                ret = None
+            elif len(args) == 2:
+                ret = args[1]
+            else:
+                ret = args[1:]
+        else:
+            ret = None
+
+        assert isinstance(err, cudart.cudaError_t), type(err)
+        if err != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError(format_cudart_err(err))
+
+        return ret
+
 
 class BufferImageFormat(enum.Enum):
     UCHAR4=0
@@ -35,11 +79,22 @@ class CudaOutputBufferType(enum.Enum):
     ZERO_COPY   = 2,  # general case, preferred for multi-gpu if not fully nvlink connected
     CUDA_P2P    = 3,  # fully connected only, preferred for fully nvlink connected
 
+    @classmethod
+    def enable_gl_interop(cls, fallback=True):
+        if has_gl_interop:
+            return cls.GL_INTEROP
+        elif fallback:
+            msg = _cuda_opengl_interop_msg + f"{os.linesep}Falling back to slower CUDA_DEVICE output buffer."
+            print(msg)
+            return cls.CUDA_DEVICE
+        else:
+            raise RuntimeError(_cuda_opengl_interop_msg)
+
 
 class CudaOutputBuffer:
     __slots__ = ['_pixel_format', '_buffer_type', '_width', '_height',
             '_device', '_device_idx', '_device', '_stream',
-            '_host_buffer', '_device_buffer', '_pbo']
+            '_host_buffer', '_device_buffer', '_cuda_gfx_ressource', '_pbo']
 
     def __init__(self, buffer_type, pixel_format, width, height, device_idx=0):
         for attr in self.__slots__:
@@ -50,6 +105,16 @@ def __init__(self, buffer_type, pixel_format, width, height, device_idx=0):
         self.buffer_type = buffer_type
         self.resize(width, height)
         self.stream = None
+        
+        if buffer_type is CudaOutputBufferType.GL_INTEROP:
+            if not has_gl_interop:
+                raise RuntimeError(_cuda_opengl_interop_msg)
+            device_count, device_ids = check_cudart_err( cudart.cudaGLGetDevices(1, cudart.cudaGLDeviceList.cudaGLDeviceListAll) )
+            if device_count <= 0:
+                raise RuntimeError("No OpenGL device found, cannot enable GL_INTEROP.")
+            elif device_ids[0] != device_idx:
+                raise RuntimeError(f"OpenGL device id {device_ids[0]} does not match requested "
+                                   f"device index {device_idx} for Cuda/OpenGL interop.")
 
         self._reallocate_buffers()
 
@@ -69,13 +134,29 @@ def map(self):
         self._make_current()
         if (self._host_buffer is None) or (self._device_buffer is None):
             self._reallocate_buffers()
-        return self._device_buffer.data.ptr
+        if self.buffer_type is CudaOutputBufferType.CUDA_DEVICE:
+            return self._device_buffer.data.ptr
+        elif self.buffer_type is CudaOutputBufferType.GL_INTEROP:
+            check_cudart_err(
+                cudart.cudaGraphicsMapResources(1, self._cuda_gfx_ressource, self._stream.ptr)
+            )
+            ptr, size = check_cudart_err(
+                cudart.cudaGraphicsResourceGetMappedPointer(self._cuda_gfx_ressource)
+            )
+            return ptr
+        else:
+            msg = f'Buffer type {self.buffer_type} has not been implemented yet.'
+            raise NotImplementedError(msg)
 
     def unmap(self):
         self._make_current()
         buffer_type = self.buffer_type
         if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
             self._stream.synchronize()
+        elif buffer_type is CudaOutputBufferType.GL_INTEROP:
+            check_cudart_err(
+                cudart.cudaGraphicsUnmapResources(1, self._cuda_gfx_ressource, self._stream.ptr)
+            )
         else:
             msg = f'Buffer type {buffer_type} has not been implemented yet.'
             raise NotImplementedError(msg)
@@ -85,12 +166,13 @@ def get_pbo(self):
 
         self._make_current()
 
-        if self._pbo is None:
-            self._pbo = gl.glGenBuffers(1)
-
         if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
+            if self._pbo is None:
+                self._pbo = gl.glGenBuffers(1)
             self.copy_device_to_host()
             self.copy_host_to_pbo()
+        elif buffer_type is CudaOutputBufferType.GL_INTEROP:
+            assert self._pbo is not None
         else:
             msg = f'Buffer type {buffer_type} has not been implemented yet.'
             raise NotImplementedError(msg)
@@ -121,14 +203,26 @@ def _reallocate_buffers(self):
 
         dtype = self.pixel_format
         shape = (self.height, self.width)
+        
+        self._host_buffer = np.empty(shape=shape, dtype=dtype)
 
         if buffer_type is CudaOutputBufferType.CUDA_DEVICE:
-            self._host_buffer = np.empty(shape=shape, dtype=dtype)
             self._device_buffer = cp.empty(shape=shape, dtype=dtype)
             if self._pbo is not None:
                 gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo)
                 gl.glBufferData(gl.GL_ARRAY_BUFFER, self._host_buffer, gl.GL_STREAM_DRAW)
                 gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+        elif buffer_type is CudaOutputBufferType.GL_INTEROP:
+            self._pbo = gl.glGenBuffers(1) if self._pbo is None else self._pbo
+            
+            gl.glBindBuffer(gl.GL_ARRAY_BUFFER, self._pbo)
+            gl.glBufferData(gl.GL_ARRAY_BUFFER, self.width*self.height*dtype.itemsize, None, gl.GL_STREAM_DRAW)
+            gl.glBindBuffer(gl.GL_ARRAY_BUFFER, 0)
+        
+            self.cuda_gfx_ressource = check_cudart_err(
+                cudart.cudaGraphicsGLRegisterBuffer(self._pbo,
+                    cudart.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsWriteDiscard)
+            )
         else:
             msg = f'Buffer type {buffer_type} has not been implemented yet.'
             raise NotImplementedError(msg)
@@ -215,3 +309,15 @@ def _set_stream(self, value):
         assert isinstance(value, cp.cuda.Stream), type(value)
         self._stream = value
     stream = property(_get_stream, _set_stream)
+    
+    def _get_cuda_gfx_ressource(self):
+        assert self._cuda_gfx_ressource is not None
+        return self._cuda_gfx_ressource
+    def _set_cuda_gfx_ressource(self, value):
+        if (self._cuda_gfx_ressource is not None) and (self._cuda_gfx_ressource != value):
+            check_cudart_err(
+                cudart.cudaGraphicsUnregisterResource(self._cuda_gfx_ressource)
+            )
+        self._cuda_gfx_ressource = value
+
+    cuda_gfx_ressource = property(_get_cuda_gfx_ressource, _set_cuda_gfx_ressource)
diff --git a/examples/sutil/gl_display.py b/examples/sutil/gl_display.py
index bee2939..280cf1e 100644
--- a/examples/sutil/gl_display.py
+++ b/examples/sutil/gl_display.py
@@ -49,7 +49,6 @@ class GLDisplay:
             '_quad_vertex_buffer', '_image_format']
 
     def __init__(self, image_format):
-        print(image_format, type(image_format), isinstance(BufferImageFormat.UCHAR4, BufferImageFormat))
         assert isinstance(image_format, BufferImageFormat)
 
         vertex_array = gl.glGenVertexArrays(1)
diff --git a/examples/triangle.py b/examples/triangle.py
index b9d7bb3..2b319fa 100644
--- a/examples/triangle.py
+++ b/examples/triangle.py
@@ -1,8 +1,12 @@
+import os
 import optix as ox
 import cupy as cp
 import numpy as np
 from PIL import Image, ImageOps
 
+script_dir = os.path.dirname(__file__)
+cuda_src = os.path.join(script_dir, "cuda", "triangle.cu")
+
 img_size = (1024, 768)
 
 # use a regular function for logging
@@ -19,7 +23,7 @@ def create_acceleration_structure(ctx, vertices):
 
 def create_module(ctx, pipeline_opts):
     compile_opts = ox.ModuleCompileOptions(debug_level=ox.CompileDebugLevel.FULL, opt_level=ox.CompileOptimizationLevel.LEVEL_0)
-    module = ox.Module(ctx, 'cuda/triangle.cu', compile_opts, pipeline_opts)
+    module = ox.Module(ctx, cuda_src, compile_opts, pipeline_opts)
     return module
 
 
diff --git a/optix/__init__.py b/optix/__init__.py
index 9a364ce..41d1015 100644
--- a/optix/__init__.py
+++ b/optix/__init__.py
@@ -1,12 +1,12 @@
-from .context import DeviceContext, optix_version
+from .context import *
 from .build import *
-from .module import Module, ModuleCompileOptions, CompileOptimizationLevel, CompileDebugLevel, PayloadSemantics, Task
-from .program_group import ProgramGroup
-from .struct import SbtRecord, LaunchParamsRecord
-from .shader_binding_table import ShaderBindingTable
-from .pipeline import CompileDebugLevel, ExceptionFlags, TraversableGraphFlags, \
-    PrimitiveTypeFlags, PipelineCompileOptions, PipelineLinkOptions, Pipeline
+from .module import *
+from .program_group import *
+from .struct import *
+from .shader_binding_table import *
+from .pipeline import *
 from .denoiser import *
+from .micromap import *
 from .logging_utility import Logger
 from ._version import __version__
 
diff --git a/optix/_version.py b/optix/_version.py
index 3dc1f76..5becc17 100644
--- a/optix/_version.py
+++ b/optix/_version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "1.0.0"
diff --git a/optix/base.pyx b/optix/base.pyx
index 0173530..d9fc39a 100644
--- a/optix/base.pyx
+++ b/optix/base.pyx
@@ -9,4 +9,4 @@ cdef class OptixObject:
         return ""
 
     def __repr__(self):
-        return f"<optix.{self.__class__.__name__}({self._repr_details()})>"
\ No newline at end of file
+        return f"optix.{self.__class__.__name__}({self._repr_details()})"
\ No newline at end of file
diff --git a/optix/build.pxd b/optix/build.pxd
index 895d613..74570bb 100644
--- a/optix/build.pxd
+++ b/optix/build.pxd
@@ -2,7 +2,8 @@ from .common cimport OptixResult, CUstream, CUdeviceptr
 from .context cimport OptixDeviceContext, OptixContextObject
 from libcpp.vector cimport vector
 from .base cimport OptixObject
-from libc.stdint cimport uintptr_t
+from libc.stdint cimport uintptr_t, uint32_t
+from .micromap cimport OptixBuildInputOpacityMicromap, BuildInputOpacityMicromap
 
 
 cdef extern from "optix.h" nogil:
@@ -14,6 +15,8 @@ cdef extern from "optix.h" nogil:
         OPTIX_BUILD_FLAG_PREFER_FAST_BUILD,
         OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS,
         OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS,
+        OPTIX_BUILD_FLAG_ALLOW_OPACITY_MICROMAP_UPDATE,
+        OPTIX_BUILD_FLAG_ALLOW_DISABLE_OPACITY_MICROMAPS
 
 
     cdef enum OptixBuildOperation:
@@ -33,24 +36,27 @@ cdef extern from "optix.h" nogil:
         float timeBegin
         float timeEnd
 
+
     cdef struct OptixAccelBuildOptions:
         unsigned int buildFlags
         OptixBuildOperation operation
         OptixMotionOptions motionOptions
 
+
     cdef enum OptixBuildInputType:
         OPTIX_BUILD_INPUT_TYPE_TRIANGLES,
         OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES,
         OPTIX_BUILD_INPUT_TYPE_INSTANCES,
         OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS,
-        OPTIX_BUILD_INPUT_TYPE_CURVES
+        OPTIX_BUILD_INPUT_TYPE_CURVES,
+        OPTIX_BUILD_INPUT_TYPE_SPHERES
 
-    #ctypedef uintptr_t CUdeviceptr
 
     cdef struct OptixBuildInputInstanceArray:
         CUdeviceptr instances
         unsigned int numInstances
 
+
     cdef struct OptixAabb:
         float minX
         float minY
@@ -59,6 +65,7 @@ cdef extern from "optix.h" nogil:
         float maxY
         float maxZ
 
+
     cdef struct OptixBuildInputCustomPrimitiveArray:
         const CUdeviceptr * aabbBuffers
         unsigned int numPrimitives
@@ -70,62 +77,45 @@ cdef extern from "optix.h" nogil:
         unsigned int sbtIndexOffsetStrideInBytes
         unsigned int primitiveIndexOffset
 
-    IF _OPTIX_VERSION_MAJOR == 7 and _OPTIX_VERSION_MINOR > 3:  # switch to new instance flags
-        cdef enum OptixPrimitiveType:
-            OPTIX_PRIMITIVE_TYPE_CUSTOM,
-            OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR,
-            OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM,
-            OPTIX_PRIMITIVE_TYPE_TRIANGLE,
-
-        cdef enum OptixCurveEndcapFlags:
-            OPTIX_CURVE_ENDCAP_DEFAULT,
-            OPTIX_CURVE_ENDCAP_ON
-
-        cdef struct OptixBuildInputCurveArray:
-            OptixPrimitiveType curveType
-            unsigned int numPrimitives
-            const CUdeviceptr * vertexBuffers
-            unsigned int numVertices
-            unsigned int vertexStrideInBytes
-            const CUdeviceptr * widthBuffers
-            unsigned int widthStrideInBytes
-            const CUdeviceptr * normalBuffers
-            unsigned int normalStrideInBytes
-            CUdeviceptr indexBuffer
-            unsigned int indexStrideInBytes
-            unsigned int flag
-            unsigned int primitiveIndexOffset
-            unsigned int endcapFlags
-    ELSE:
-        cdef enum OptixPrimitiveType:
-            OPTIX_PRIMITIVE_TYPE_CUSTOM,
-            OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR,
-            OPTIX_PRIMITIVE_TYPE_TRIANGLE,
-
-        cdef struct OptixBuildInputCurveArray:
-            OptixPrimitiveType curveType
-            unsigned int numPrimitives
-            const CUdeviceptr * vertexBuffers
-            unsigned int numVertices
-            unsigned int vertexStrideInBytes
-            const CUdeviceptr * widthBuffers
-            unsigned int widthStrideInBytes
-            const CUdeviceptr * normalBuffers
-            unsigned int normalStrideInBytes
-            CUdeviceptr indexBuffer
-            unsigned int indexStrideInBytes
-            unsigned int flag
-            unsigned int primitiveIndexOffset
+
+    cdef enum OptixPrimitiveType:
+        OPTIX_PRIMITIVE_TYPE_CUSTOM,
+        OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE,
+        OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE,
+        OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR,
+        OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM,
+        OPTIX_PRIMITIVE_TYPE_SPHERE,
+        OPTIX_PRIMITIVE_TYPE_TRIANGLE,
+
+
+    cdef enum OptixCurveEndcapFlags:
+        OPTIX_CURVE_ENDCAP_DEFAULT,
+        OPTIX_CURVE_ENDCAP_ON
+
+
+    cdef struct OptixBuildInputCurveArray:
+        OptixPrimitiveType curveType
+        unsigned int numPrimitives
+        const CUdeviceptr * vertexBuffers
+        unsigned int numVertices
+        unsigned int vertexStrideInBytes
+        const CUdeviceptr * widthBuffers
+        unsigned int widthStrideInBytes
+        const CUdeviceptr * normalBuffers
+        unsigned int normalStrideInBytes
+        CUdeviceptr indexBuffer
+        unsigned int indexStrideInBytes
+        unsigned int flag
+        unsigned int primitiveIndexOffset
+        unsigned int endcapFlags
+
 
     cdef enum OptixIndicesFormat:
         OPTIX_INDICES_FORMAT_NONE,
         OPTIX_INDICES_FORMAT_UNSIGNED_SHORT3,
         OPTIX_INDICES_FORMAT_UNSIGNED_INT3
 
+
     cdef enum OptixVertexFormat:
         OPTIX_VERTEX_FORMAT_NONE,
         OPTIX_VERTEX_FORMAT_FLOAT3,
@@ -135,14 +125,18 @@ cdef extern from "optix.h" nogil:
         OPTIX_VERTEX_FORMAT_SNORM16_3,
         OPTIX_VERTEX_FORMAT_SNORM16_2
 
+
     cdef enum OptixTransformFormat:
         OPTIX_TRANSFORM_FORMAT_NONE,
         OPTIX_TRANSFORM_FORMAT_MATRIX_FLOAT12,
 
+
     cdef enum OptixGeometryFlags:
         OPTIX_GEOMETRY_FLAG_NONE,
         OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT,
         OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL
+        OPTIX_GEOMETRY_FLAG_DISABLE_TRIANGLE_FACE_CULLING
+
 
     cdef struct OptixBuildInputTriangleArray:
         const CUdeviceptr * vertexBuffers
@@ -161,33 +155,57 @@ cdef extern from "optix.h" nogil:
         unsigned int sbtIndexOffsetStrideInBytes
         unsigned int primitiveIndexOffset
         OptixTransformFormat transformFormat
+        OptixBuildInputOpacityMicromap opacityMicromap
+
+
+    cdef struct OptixBuildInputSphereArray:
+        const CUdeviceptr* vertexBuffers
+        unsigned int vertexStrideInBytes
+        unsigned int numVertices
+        const CUdeviceptr *radiusBuffers
+        unsigned int radiusStrideInBytes
+        int singleRadius
+        const unsigned int *flags
+        unsigned int numSbtRecords
+        CUdeviceptr sbtIndexOffsetBuffer
+        unsigned int sbtIndexOffsetSizeInBytes
+        unsigned int sbtIndexOffsetStrideInBytes
+        unsigned int primitiveIndexOffset
+
 
     cdef struct OptixBuildInput:
         OptixBuildInputType type
         # union
         OptixBuildInputTriangleArray triangleArray
         OptixBuildInputCurveArray curveArray
+        OptixBuildInputSphereArray   sphereArray
         OptixBuildInputCustomPrimitiveArray customPrimitiveArray
         OptixBuildInputInstanceArray instanceArray
 
+
     cdef struct OptixAccelBufferSizes:
         size_t outputSizeInBytes
         size_t tempSizeInBytes
         size_t tempUpdateSizeInBytes
 
+
     cdef enum OptixAccelPropertyType:
         OPTIX_PROPERTY_TYPE_COMPACTED_SIZE,
         OPTIX_PROPERTY_TYPE_AABBS,
 
+
     cdef struct OptixAccelEmitDesc:
         CUdeviceptr result
         OptixAccelPropertyType type
 
+
     ctypedef uintptr_t OptixTraversableHandle
 
-    cdef struct OptixAccelRelocationInfo:
+
+    cdef struct OptixRelocationInfo:
         unsigned long long info[4]
 
+
     cdef enum OptixTraversableType:
         OPTIX_TRAVERSABLE_TYPE_STATIC_TRANSFORM,
         OPTIX_TRAVERSABLE_TYPE_MATRIX_MOTION_TRANSFORM,
@@ -200,6 +218,8 @@ cdef extern from "optix.h" nogil:
         OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING
         OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT
         OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT
+        OPTIX_INSTANCE_FLAG_FORCE_OPACITY_MICROMAP_2_STATE
+        OPTIX_INSTANCE_FLAG_DISABLE_OPACITY_MICROMAPS
 
 
     cdef struct OptixInstance:
@@ -210,6 +230,27 @@ cdef extern from "optix.h" nogil:
         unsigned int flags
         OptixTraversableHandle traversableHandle
 
+
+    cdef struct OptixRelocateInputInstanceArray:
+        unsigned int numInstances
+        CUdeviceptr traversableHandles
+
+
+    cdef struct OptixRelocateInputOpacityMicromap:
+        CUdeviceptr opacityMicromapArray
+
+
+    cdef struct OptixRelocateInputTriangleArray:
+        unsigned int numSbtRecords
+        OptixRelocateInputOpacityMicromap opacityMicromap
+
+
+    cdef struct OptixRelocateInput:
+        OptixBuildInputType type
+        OptixRelocateInputInstanceArray instanceArray
+        OptixRelocateInputTriangleArray triangleArray
+
+
     OptixResult optixAccelComputeMemoryUsage(OptixDeviceContext context,
                                  const OptixAccelBuildOptions * accelOptions,
                                  const OptixBuildInput * buildInputs,
@@ -232,6 +273,7 @@ cdef extern from "optix.h" nogil:
                     unsigned int numEmittedProperties
                     )
 
+
     OptixResult optixAccelCompact(OptixDeviceContext context,
                                   CUstream stream,
                                   OptixTraversableHandle inputHandle,
@@ -240,33 +282,39 @@ cdef extern from "optix.h" nogil:
                                   OptixTraversableHandle * outputHandle
                                   )
 
+
     OptixResult optixAccelRelocate(OptixDeviceContext context,
                        CUstream stream,
-                       const OptixAccelRelocationInfo * info,
-                       CUdeviceptr instanceTraversableHandles,
-                       size_t numInstanceTraversableHandles,
+                       const OptixRelocationInfo * info,
+                       const OptixRelocateInput * relocateInputs,
+                       size_t numRelocateInputs,
                        CUdeviceptr targetAccel,
                        size_t targetAccelSizeInBytes,
                        OptixTraversableHandle * targetHandle
                        )
 
-    OptixResult optixAccelCheckRelocationCompatibility(OptixDeviceContext context,
-                                           const OptixAccelRelocationInfo * info,
+
+    OptixResult optixCheckRelocationCompatibility(OptixDeviceContext context,
+                                           const OptixRelocationInfo * info,
                                            int * compatible
                                            )
 
+
     OptixResult optixAccelGetRelocationInfo(OptixDeviceContext context,
                                 OptixTraversableHandle handle,
-                                OptixAccelRelocationInfo * info
+                                OptixRelocationInfo * info
                                 )
 
+
     OptixResult optixConvertPointerToTraversableHandle(OptixDeviceContext onDevice,
                                            CUdeviceptr pointer,
                                            OptixTraversableType traversableType,
                                            OptixTraversableHandle * traversableHandle
                                            )
 
+
 cdef class BuildInputArray(OptixObject):
+    cdef OptixBuildInputType build_input_type
     cdef void prepare_build_input(self, OptixBuildInput* build_input) except *
     cdef size_t num_elements(self)
 
@@ -279,6 +327,7 @@ cdef class BuildInputTriangleArray(BuildInputArray):
     cdef object _d_sbt_offset_buffer
     cdef object _d_pre_transform
     cdef vector[unsigned int] _flags
+    cdef BuildInputOpacityMicromap c_opacity_micromap
 
 
 cdef class BuildInputCustomPrimitiveArray(BuildInputArray):
@@ -301,6 +350,16 @@ cdef class BuildInputCurveArray(BuildInputArray):
     cdef object _d_index_buffer
 
 
+cdef class BuildInputSphereArray(BuildInputArray):
+    cdef OptixBuildInputSphereArray build_input
+    cdef list _d_vertex_buffers
+    cdef vector[CUdeviceptr] _d_vertex_buffer_ptrs
+    cdef list _d_radius_buffers
+    cdef vector[CUdeviceptr] _d_radius_buffer_ptrs
+    cdef object _d_sbt_offset_buffer
+    cdef vector[unsigned int] _flags
+
+
 cdef class Instance(OptixObject):
     cdef OptixInstance instance
     cdef AccelerationStructure _traversable
@@ -319,6 +378,8 @@ cdef class AccelerationStructure(OptixContextObject):
     cdef OptixAccelBufferSizes _buffer_sizes
     cdef object _instances
     cdef OptixTraversableHandle _handle
+    cdef list _relocate_deps
+
     cdef void _init_build_inputs(self, build_inputs, vector[OptixBuildInput]& ret)
     cdef void _init_accel_options(self, size_t num_build_inputs, unsigned int build_flags, OptixBuildOperation operation, vector[OptixAccelBuildOptions]& ret)
     cdef void build(self, build_inputs, stream=*)
diff --git a/optix/build.pyx b/optix/build.pyx
index 4cdb66a..a4d9fa0 100644
--- a/optix/build.pyx
+++ b/optix/build.pyx
@@ -8,6 +8,8 @@ from enum import IntEnum, IntFlag
 from libc.string cimport memcpy, memset
 from libcpp.vector cimport vector
 from .common import round_up, ensure_iterable
+import typing as typ
+from .micromap cimport BuildInputOpacityMicromap, OpacityMicromapArray
 
 optix_init()
 
@@ -17,12 +19,14 @@ __all__ = ['GeometryFlags',
            'BuildInputTriangleArray',
            'BuildInputCustomPrimitiveArray',
            'BuildInputCurveArray',
+           'BuildInputSphereArray',
            'BuildInputInstanceArray',
            'Instance',
            'AccelerationStructure',
-           'CurveEndcapFlags'
+           'CurveEndcapFlags',
            ]
 
+
 class GeometryFlags(IntEnum):
     """
     Wraps the OptixGeometryFlags enum.
@@ -30,6 +34,7 @@ class GeometryFlags(IntEnum):
     NONE = OPTIX_GEOMETRY_FLAG_NONE,
     DISABLE_ANYHIT = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT,
     REQUIRE_SINGLE_ANYHIT_CALL = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL
+    DISABLE_TRIANGLE_FACE_CULLING = OPTIX_GEOMETRY_FLAG_DISABLE_TRIANGLE_FACE_CULLING
 
 
 class BuildFlags(IntFlag):
@@ -45,34 +50,22 @@ class BuildFlags(IntFlag):
     ALLOW_RANDOM_INSTANCE_ACCESS = OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS,
 
 
-IF _OPTIX_VERSION > 70300:  # switch to new instance flags
-    class PrimitiveType(IntEnum):
-        """
-        Wraps the OptixPrimitiveType enum.
-        """
-        CUSTOM = OPTIX_PRIMITIVE_TYPE_CUSTOM,
-        ROUND_QUADRATIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE,
-        ROUND_CUBIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE,
-        ROUND_LINEAR = OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR,
-        ROUND_CATMULLROM = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM,
-        TRIANGLE = OPTIX_PRIMITIVE_TYPE_TRIANGLE
-
-    class CurveEndcapFlags(IntEnum):
-        DEFAULT = OPTIX_CURVE_ENDCAP_DEFAULT,
-        ON = OPTIX_CURVE_ENDCAP_ON
-ELSE:
-    class CurveEndcapFlags(IntEnum):
-        DEFAULT = 0 # only for interface. Ignored for Optix versions below 7.4
-
-    class PrimitiveType(IntEnum):
-        """
-        Wraps the OptixPrimitiveType enum.
-        """
-        CUSTOM = OPTIX_PRIMITIVE_TYPE_CUSTOM,
-        ROUND_QUADRATIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE,
-        ROUND_CUBIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE,
-        ROUND_LINEAR = OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR,
-        TRIANGLE = OPTIX_PRIMITIVE_TYPE_TRIANGLE
+class PrimitiveType(IntEnum):
+    """
+    Wraps the OptixPrimitiveType enum.
+    """
+    CUSTOM = OPTIX_PRIMITIVE_TYPE_CUSTOM,
+    ROUND_QUADRATIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_ROUND_QUADRATIC_BSPLINE,
+    ROUND_CUBIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE,
+    ROUND_LINEAR = OPTIX_PRIMITIVE_TYPE_ROUND_LINEAR
+    ROUND_CATMULLROM = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM
+    SPHERE = OPTIX_PRIMITIVE_TYPE_SPHERE
+    TRIANGLE = OPTIX_PRIMITIVE_TYPE_TRIANGLE
+
+
+class CurveEndcapFlags(IntEnum):
+    DEFAULT = OPTIX_CURVE_ENDCAP_DEFAULT,
+    ON = OPTIX_CURVE_ENDCAP_ON
 
 
 class InstanceFlags(IntFlag):
@@ -84,18 +77,36 @@ class InstanceFlags(IntFlag):
     FLIP_TRIANGLE_FACING = OPTIX_INSTANCE_FLAG_FLIP_TRIANGLE_FACING,
     DISABLE_ANYHIT = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT,
     ENFORCE_ANYHIT = OPTIX_INSTANCE_FLAG_ENFORCE_ANYHIT,
+    FORCE_OPACITY_MICROMAP_2_STATE = OPTIX_INSTANCE_FLAG_FORCE_OPACITY_MICROMAP_2_STATE,
+    DISABLE_OPACITY_MICROMAPS = OPTIX_INSTANCE_FLAG_DISABLE_OPACITY_MICROMAPS
+
+
+class BuildInputType(IntEnum):
+    TRIANGLES = OPTIX_BUILD_INPUT_TYPE_TRIANGLES,
+    CUSTOM_PRIMITIVES = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES,
+    INSTANCES = OPTIX_BUILD_INPUT_TYPE_INSTANCES,
+    INSTANCE_POINTERS = OPTIX_BUILD_INPUT_TYPE_INSTANCE_POINTERS,
+    CURVES = OPTIX_BUILD_INPUT_TYPE_CURVES,
+    SPHERES = OPTIX_BUILD_INPUT_TYPE_SPHERES
 
 
 cdef class BuildInputArray(OptixObject):
     """
     Base class for all BuildInput Arrays. This is an internal class.
     """
+    def __init__(self, type):
+        self.build_input_type = <OptixBuildInputType><int>(BuildInputType(type).value)
+
     cdef void prepare_build_input(self, OptixBuildInput* build_input) except *:
         pass
 
     cdef size_t num_elements(self):
         return 0
 
+    @property
+    def type(self):
+        return BuildInputType(self.build_input_type)
+
 
 cdef class BuildInputTriangleArray(BuildInputArray):
     """
@@ -128,9 +139,9 @@ cdef class BuildInputTriangleArray(BuildInputArray):
                  flags = None,
                  sbt_record_offset_buffer = None,
                  pre_transform = None,
-                 primitive_index_offset = 0
-                 ):
-
+                 primitive_index_offset = 0,
+                 opacity_micromap: typ.Optional[BuildInputOpacityMicromap] = None):
+        super().__init__(BuildInputType.TRIANGLES)
         self._d_vertex_buffers = [cp.asarray(vb) for vb in ensure_iterable(vertex_buffers)]
         self._d_vertex_buffer_ptrs.reserve(len(self._d_vertex_buffers))
 
@@ -164,6 +175,7 @@ cdef class BuildInputTriangleArray(BuildInputArray):
             self.build_input.indexStrideInBytes = 0
             self.build_input.numIndexTriplets = 0
             self.build_input.indexBuffer = 0
+            self._d_index_buffer = None
 
         self.build_input.numSbtRecords = num_sbt_records
 
@@ -201,11 +213,16 @@ cdef class BuildInputTriangleArray(BuildInputArray):
             self.build_input.preTransform = 0
             self.build_input.transformFormat = OPTIX_TRANSFORM_FORMAT_NONE
 
+        self.c_opacity_micromap = opacity_micromap
+        if self.c_opacity_micromap is not None:
+            self.build_input.opacityMicromap = self.c_opacity_micromap.build_input
+
+
     def __dealloc__(self):
         pass
 
     cdef void prepare_build_input(self, OptixBuildInput* build_input) except *:
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES
+        build_input.type = self.build_input_type
         build_input.triangleArray = self.build_input
 
     def _vertex_format(self, dtype, shape):
@@ -239,6 +256,19 @@ cdef class BuildInputTriangleArray(BuildInputArray):
     cdef size_t num_elements(self):
         return self.build_input.numVertices
 
+    @property
+    def micromap(self):
+        return self.c_opacity_micromap
+
+    @property
+    def num_sbt_records(self):
+        return self.build_input.numSbtRecords
+
+
+    def _repr_details(self):
+        return f"nvertices={self.num_elements()}, " \
+               f"ntriangles={self._d_index_buffer.shape[0] if self._d_index_buffer is not None else self.num_elements() // 3}, " \
+               f"n_sbt_records={self.build_input.numSbtRecords}"
 
 cdef class BuildInputCustomPrimitiveArray(BuildInputArray):
     """
@@ -268,6 +298,7 @@ cdef class BuildInputCustomPrimitiveArray(BuildInputArray):
                  sbt_record_offset_buffer = None,
                  primitive_index_offset = 0
                  ):
+        super().__init__(BuildInputType.CUSTOM_PRIMITIVES)
         self._d_aabb_buffers = [cp.asarray(ab, dtype=np.float32).reshape(-1, 6) for ab in aabb_buffers]
         self._d_aabb_buffer_ptrs.reserve(len(self._d_aabb_buffers))
 
@@ -318,14 +349,13 @@ cdef class BuildInputCustomPrimitiveArray(BuildInputArray):
         self.build_input.primitiveIndexOffset = primitive_index_offset
 
     cdef void prepare_build_input(self, OptixBuildInput * build_input) except *:
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES
+        build_input.type = self.build_input_type
         build_input.customPrimitiveArray = self.build_input
 
     cdef size_t num_elements(self):
         return self.build_input.numPrimitives
 
 
-
 cdef class BuildInputCurveArray(BuildInputArray):
     """
     BuildInputArray for curve inputs. This class wraps the OptixBuildInputCurveArray struct.
@@ -360,7 +390,7 @@ cdef class BuildInputCurveArray(BuildInputArray):
                  flags=None,
                  primitive_index_offset=0,
                  endcap_flags=CurveEndcapFlags.DEFAULT):
-
+        super().__init__(BuildInputType.CURVES)
         self.build_input.curveType = curve_type.value
         self._d_vertex_buffers = [cp.asarray(vb, np.float32) for vb in ensure_iterable(vertex_buffers)]
         self._d_vertex_buffer_ptrs.reserve(len(self._d_vertex_buffers))
@@ -419,17 +449,124 @@ cdef class BuildInputCurveArray(BuildInputArray):
 
         self.build_input.primitiveIndexOffset = primitive_index_offset
 
-        IF _OPTIX_VERSION > 70300:
-            self.build_input.endcapFlags = endcap_flags # only for Optix versions >= 7.4
+        self.build_input.endcapFlags = endcap_flags
 
     cdef void prepare_build_input(self, OptixBuildInput * build_input) except *:
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES
+        build_input.type = self.build_input_type
         build_input.curveArray = self.build_input
 
     cdef size_t num_elements(self):
         return self.build_input.numPrimitives
 
 
+cdef class BuildInputSphereArray(BuildInputArray):
+    """
+        BuildInputArray for a sphere. This class wraps the OptixBuildInputSphereArray struct.
+        In Contrast to the behavior of the Optix C++ API, this Python class will automatically convert all numpy.ndarrays
+        to cupy.ndarrays and keep track of them.
+
+        Parameters
+        ----------
+        vertex_buffers:
+            List of vertex buffers (one for each motion step) or a single array.
+            All arrays will be converted to cupy.ndarrays before any further processing.
+        index_buffer: ndarray, optional
+            A single 2d array containing the indices of all triangles or None
+        num_sbt_records: int
+            The number of records in the ShaderBindingTable for this geometry
+        flags: GeometryFlags
+            Flags to use in this input for each motionstep
+        sbt_record_offset_buffer: ndarray, optional
+            Offsets into the ShaderBindingTable record for each primitive (index) or None
+        pre_transform: ndarray(3,4) or None
+            A transform to apply prior to processing
+        primitive_index_offset: int
+            The offset applied to the primitive index in device code
+        """
+    def __init__(self,
+                 vertex_buffers,
+                 radius_buffers,
+                 num_sbt_records = 1,
+                 flags = None,
+                 sbt_record_offset_buffer = None,
+                 pre_transform = None,
+                 primitive_index_offset = 0
+                 ):
+        super().__init__(BuildInputType.SPHERES)
+        self._d_vertex_buffers = [cp.asarray(vb) for vb in ensure_iterable(vertex_buffers)]
+        self._d_vertex_buffer_ptrs.reserve(len(self._d_vertex_buffers))
+
+        self._d_radius_buffers = [cp.asarray(vb) for vb in ensure_iterable(radius_buffers)]
+        self._d_radius_buffer_ptrs.reserve(len(self._d_radius_buffers))
+
+        if len(self._d_radius_buffers) != len(self._d_vertex_buffers):
+            raise ValueError("Argument radius_buffers must have the same number of arrays as vertex_buffers.")
+
+        if len(self._d_vertex_buffers) == 0:
+            raise ValueError("BuildInputSphereArray cannot be empty.")
+
+        dtype = self._d_vertex_buffers[0].dtype
+        shape = self._d_vertex_buffers[0].shape
+        strides = self._d_vertex_buffers[0].strides
+
+        radius_dtype = self._d_radius_buffers[0].dtype
+        radius_shape = self._d_radius_buffers[0].shape
+        strides = self._d_radius_buffers[0].strides
+
+        for vb, rb in zip(self._d_vertex_buffers, self._d_radius_buffers):
+            if vb.dtype != dtype or vb.shape != shape or vb.strides != strides:
+                raise ValueError("All vertex buffers must have the same size and dtype.")
+            self._d_vertex_buffer_ptrs.push_back(vb.data.ptr)
+
+            if rb.dtype != dtype or rb.shape != shape or rb.strides != strides:
+                raise ValueError("All radius buffers must have the same size and dtype.")
+            self._d_radius_buffer_ptrs.push_back(rb.data.ptr)
+
+        self.build_input.vertexBuffers = self._d_vertex_buffer_ptrs.const_data()
+        self.build_input.radiusBuffers = self._d_radius_buffer_ptrs.const_data()
+
+        self.build_input.vertexStrideInBytes = self._d_vertex_buffers[0].strides[0]
+        self.build_input.radiusStrideInBytes = self._d_radius_buffers[0].strides[0]
+
+        self.build_input.numVertices = shape[0]
+        self.build_input.singleRadius = 1 if self._d_radius_buffers[0].shape[0] == 1 else 0
+
+        self.build_input.numSbtRecords = num_sbt_records
+        self._flags.resize(num_sbt_records)
+
+        if flags is None:
+            for i in range(num_sbt_records):
+                self._flags[i] = OPTIX_GEOMETRY_FLAG_NONE
+        else:
+            for i in range(num_sbt_records):
+                self._flags[i] = flags[i].value
+
+        self.build_input.flags = self._flags.data()
+
+
+        if sbt_record_offset_buffer is not None:
+            self._d_sbt_offset_buffer = cp.asarray(sbt_record_offset_buffer).ravel()
+            self.build_input.sbtIndexOffsetBuffer = self._d_sbt_offset_buffer.data.ptr
+            itemsize = self._d_sbt_offset_buffer.itemsize
+            if itemsize > 4:
+                raise ValueError("Only 32 bit allowed at max")
+            self.build_input.sbtIndexOffsetSizeInBytes = itemsize
+            self.build_input.sbtIndexOffsetStrideInBytes = self._d_sbt_offset_buffer.strides[0]
+        else:
+            self.build_input.sbtIndexOffsetBuffer = 0
+            self.build_input.sbtIndexOffsetStrideInBytes = 0
+            self.build_input.sbtIndexOffsetSizeInBytes = 0
+
+        self.build_input.primitiveIndexOffset = primitive_index_offset
+
+    cdef void prepare_build_input(self, OptixBuildInput * build_input) except *:
+        build_input.type = self.build_input_type
+        build_input.sphereArray = self.build_input
+
+    cdef size_t num_elements(self):
+        return self.build_input.numVertices
+
+
 cdef class Instance(OptixObject):
     """
     Class representing a single instance (another AccelerationStructure) for use in a Instance level AccelerationStructure.
@@ -483,14 +620,17 @@ cdef class Instance(OptixObject):
         # update the handle as well
         self.instance.traversableHandle = self.traversable.handle
 
-    def __deepcopy__(self, memodict={}):
-        from copy import deepcopy
+    def relocate(self,
+                 device: typ.Optional[DeviceContext] = None,
+                 stream: typ.Optional[cp.cuda.Stream] = None):
         cls = self.__class__
         result = cls.__new__(cls)
-        memodict[id(self)] = result
         result.instance = self.instance
-        result.traversable = deepcopy(self.traversable)
+        result.traversable = self.traversable.relocate(device=device, stream=stream)
 
+    def __deepcopy__(self, memo):
+        result = self.relocate()
+        memo[id(self)] = result
         return result
 
     @property
@@ -516,6 +656,7 @@ cdef class BuildInputInstanceArray(BuildInputArray):
         A list of the Instances to use as input
     """
     def __init__(self, instances):
+        super().__init__(BuildInputType.INSTANCES)
         instances = ensure_iterable(instances)
         self.instances = instances
 
@@ -532,7 +673,7 @@ cdef class BuildInputInstanceArray(BuildInputArray):
         self.build_input.numInstances = len(instances)
 
     cdef void prepare_build_input(self, OptixBuildInput * build_input) except *:
-        build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES
+        build_input.type = self.build_input_type
         build_input.instanceArray = self.build_input
 
     cdef size_t num_elements(self):
@@ -583,6 +724,92 @@ cdef class BuildInputInstanceArray(BuildInputArray):
         return cp.ndarray(shape=(3,4), dtype=np.float32, memptr=device_ptr)
 
 
+cdef class RelocationDependency:
+    cdef OptixBuildInputType _type
+
+    def __init__(self, type):
+        self._type = <OptixBuildInputType><int>(BuildInputType(type).value)
+
+    @property
+    def type(self):
+        return BuildInputType(self._type)
+
+    cdef RelocationDependency relocate(self, device, stream):
+        return self
+
+    cdef void fill_relocation_input(self, OptixRelocateInput& input):
+        input.type = self._type
+
+    cdef void finalize_relocation_input(self):
+        pass
+
+cdef class RelocationInstanceDependency(RelocationDependency):
+    cdef object instances
+    cdef object d_instances
+
+    def __init__(self, instances):
+        super().__init__(BuildInputType.INSTANCES)
+        self.instances = instances
+        self.d_instances = None
+
+    cdef RelocationInstanceDependency relocate(self, device, stream):
+        relocated_instances = [inst.relocate(device=device, stream=stream) for inst in self.instances]
+        result = self.__class__(relocated_instances)
+        return result
+
+    cdef void fill_relocation_input(self, OptixRelocateInput& input):
+        cdef vector[OptixTraversableHandle] c_instance_handles
+        cdef ssize_t c_instance_handles_size = 0
+        cdef object d_instances
+        cdef size_t i
+        cdef CUdeviceptr d_instances_ptr = 0
+        cdef vector[OptixRelocateInput] c_relocate_inputs
+        cdef size_t num_relocate_inputs
+
+        input.type = self.type
+        input.instanceArray.numInstances = len(self.instances)
+
+        # prepare the new instance handles for relocation by copiing them into a temporary device buffer
+        c_instance_handles.resize(len(self.instances))
+        c_instance_handles_size = sizeof(OptixTraversableHandle) * c_instance_handles.size()
+
+        self.d_instances = cp.cuda.alloc(c_instance_handles_size)
+        for i in range(c_instance_handles.size()):
+            c_instance_handles[i] = self.instances[i].traversable.handle
+
+        d_instances_ptr = self.d_instances.ptr
+        cp.cuda.runtime.memcpy(d_instances_ptr, <size_t> c_instance_handles.data(), c_instance_handles_size,
+                               cp.cuda.runtime.memcpyHostToDevice)
+        c_relocate_inputs[0].instanceArray.traversableHandles = d_instances_ptr
+        input.instanceArray.traversableHandles = d_instances_ptr
+
+    cdef void finalize_relocation_input(self):
+        self.d_instances = None # remove the temporary cuda buffer again
+
+
+cdef class RelocationTriangleDependency(RelocationDependency):
+    cdef unsigned int num_sbt_records
+    cdef OpacityMicromapArray micromap
+
+    def __init__(self, num_sbt_records, micromap=None):
+        super().__init__(BuildInputType.INSTANCES)
+        self.num_sbt_records = num_sbt_records
+        self.micromap = micromap
+
+    cdef RelocationInstanceDependency relocate(self, device, stream):
+        if self.micromap is not None:
+            relocated_micromap = self.micromap.relocate(device=device, stream=stream)
+        else:
+            relocated_micromap = None
+        result = self.__class__(self.num_sbt_records, relocated_micromap)
+        return result
+
+    cdef void fill_relocation_input(self, OptixRelocateInput& input):
+        input.type = self.type
+        input.triangleArray.numSbtRecords = self.num_sbt_records
+        input.triangleArray.opacityMicromap.opacityMicromapArray = self.micromap.d_micromap_array_buffer.ptr
+
+
 cdef class AccelerationStructure(OptixContextObject):
     """
     Class representing a Geometry Acceleration Structure (GAS) or Instance Acceleration Structure (IAS). This wraps the OptixTraversableHandle internally and manages the ressources like
@@ -605,6 +832,10 @@ cdef class AccelerationStructure(OptixContextObject):
         Allow for random access of the vertices in triangle geometry
     random_instance_access: bool
         Allow for random access of the instances if an IAS is built
+    allow_opacity_micromap_update: bool
+        Allows to update the opacity micromaps in this structure
+    allow_disable_opacity_micromaps: bool
+        Allows to disable the opacity micromaps for instances in this structure
     stream: cupy.cuda.Stream, optional
         Cuda stream to use. If None the default stream is used
     """
@@ -616,6 +847,8 @@ cdef class AccelerationStructure(OptixContextObject):
                  prefer_fast_build=False,
                  random_vertex_access=False,
                  random_instance_access=False,
+                 allow_opacity_micromap_update=False,
+                 allow_disable_opacity_micromaps=False,
                  stream=None):
 
         super().__init__(context)
@@ -634,9 +867,15 @@ cdef class AccelerationStructure(OptixContextObject):
             self._build_flags |= OPTIX_BUILD_FLAG_ALLOW_RANDOM_VERTEX_ACCESS
         if random_instance_access:
             self._build_flags |= OPTIX_BUILD_FLAG_ALLOW_RANDOM_INSTANCE_ACCESS
+        if allow_opacity_micromap_update:
+            self._build_flags |= OPTIX_BUILD_FLAG_ALLOW_OPACITY_MICROMAP_UPDATE
+        if allow_disable_opacity_micromaps:
+            self._build_flags |= OPTIX_BUILD_FLAG_ALLOW_DISABLE_OPACITY_MICROMAPS
+
 
         self._gas_buffer = None
         self._instances = None
+        self._relocate_deps = []
         build_inputs = ensure_iterable(build_inputs)
         self.build(build_inputs, stream=stream)
 
@@ -677,14 +916,23 @@ cdef class AccelerationStructure(OptixContextObject):
     cdef void build(self, build_inputs, stream=None):
         # build a single vector from all the build inputs
         cdef size_t inputs_size = len(build_inputs)
-        cdef vector[OptixBuildInput] inputs #= vector[OptixBuildInput](inputs_size)
-
-        self._init_build_inputs(build_inputs, inputs)
+        cdef vector[OptixBuildInput] c_inputs #= vector[OptixBuildInput](inputs_size)
+        cdef size_t i
 
-        if isinstance(build_inputs[0], BuildInputInstanceArray):
-            if inputs_size > 1:
-                raise ValueError("Only a single build input allowed for instance builds")
-            self._instances = (<BuildInputInstanceArray>build_inputs[0]).instances # keep the instances so the buffers do not get deleted
+        self._init_build_inputs(build_inputs, c_inputs)
+
+        for build_input in build_inputs:
+            if isinstance(build_input, BuildInputInstanceArray):
+                relocation_dep = RelocationInstanceDependency((<BuildInputInstanceArray>build_input).instances)
+                if inputs_size > 1:
+                    raise ValueError("Only a single build input allowed for instance builds")
+            elif isinstance(build_input, BuildInputTriangleArray):
+                micromap = <BuildInputTriangleArray>build_input.micromap
+                micromap_array = micromap.micromap_array if micromap is not None else None
+                relocation_dep = RelocationTriangleDependency(build_input.num_sbt_records, micromap=micromap_array)
+            else:
+                relocation_dep = RelocationDependency(build_input.type)
+            self._relocate_deps.append(relocation_dep)
 
         cdef vector[OptixAccelBuildOptions] accel_options# = vector[OptixAccelBuildOptions](inputs_size)
         self._init_accel_options(inputs_size, self._build_flags, OPTIX_BUILD_OPERATION_BUILD, accel_options)
@@ -697,7 +945,7 @@ cdef class AccelerationStructure(OptixContextObject):
 
         optix_check_return(optixAccelComputeMemoryUsage(self.context.c_context,
                                                         accel_options.data(),
-                                                        inputs.data(),
+                                                        c_inputs.data(),
                                                         inputs_size,
                                                         &self._buffer_sizes))
 
@@ -726,7 +974,7 @@ cdef class AccelerationStructure(OptixContextObject):
             optix_check_return(optixAccelBuild(self.context.c_context,
                                                <CUstream>c_stream,
                                                accel_options.data(),
-                                               inputs.data(),
+                                               c_inputs.data(),
                                                inputs_size,
                                                tmp_gas_buffer_ptr,
                                                self._buffer_sizes.tempSizeInBytes,
@@ -770,6 +1018,8 @@ cdef class AccelerationStructure(OptixContextObject):
 
         cdef size_t inputs_size = len(build_inputs)
 
+        if inputs_size != len(self._relocate_deps):
+            raise ValueError("Number of build inputs given to update() must be the same as the one used to build this GAS")
         cdef vector[OptixBuildInput] inputs #= vector[OptixBuildInput](inputs_size)
         self._init_build_inputs(build_inputs, inputs)
 
@@ -802,30 +1052,50 @@ cdef class AccelerationStructure(OptixContextObject):
                                                NULL,
                                                0))
 
-    def __deepcopy__(self, memodict={}):
+    def __deepcopy__(self, memo):
+        """
+        Perform a deep copy of the AccelerationStructure by using the standard python copy.deepcopy function.
         """
-        Perform a deep copy of the AccelerationStructure by using the standard python copy.deepcopy function. This method
-        will also handle any necessary relocation tasks required by optiX on a copy.
+        # relocate on the same device to perform a regular deep copy
+        result = self.relocate(device=None)
+        memo[id(self)] = result
+        return result
+
+
+    def relocate(self,
+                 device: typ.Optional[DeviceContext] = None,
+                 stream: typ.Optional[cp.cuda.Stream] = None):
+        """
+        Relocate this acceleration structure into another copy. Usually this is equivalent to a deep copy.
+        Additionally, the accleration structure can be copied to a different defice by specifying the device
+        parameter with a different DeviceContext.
 
         Parameters
         ----------
-        memodict
-
+        device:
+            An optional DeviceContext. The resulting copy of the acceleration structure will be copied
+            to this device. If None, the acceleration structure's current device is used.
+        stream:
+            The stream to use for the relocation. If None, the default stream (0) is used.
         Returns
         -------
         copy: AccelerationStructure
-            The copy of the AccelerationStructure
+            The copy of the AccelerationStructure on the new device
 
         """
         from copy import deepcopy
         # relocate the optix structure
-        cdef OptixAccelRelocationInfo gas_info
-        memset(&gas_info, 0, sizeof(OptixAccelRelocationInfo)) # init struct to 0
+        cdef OptixRelocationInfo gas_info
+        memset(&gas_info, 0, sizeof(OptixRelocationInfo)) # init struct to 0
 
         optix_check_return(optixAccelGetRelocationInfo(self.context.c_context, self._handle, &gas_info))
 
+        if device is None:
+            device = self.context
+
+        # check if the new device is compatible with this acceleration structure
         cdef int compatible = 0
-        optix_check_return(optixAccelCheckRelocationCompatibility(self.context.c_context,
+        optix_check_return(optixCheckRelocationCompatibility(<OptixDeviceContext>(<DeviceContext>device).c_context,
                                                                   &gas_info,
                                                                   &compatible))
         if compatible != 1:
@@ -835,47 +1105,53 @@ cdef class AccelerationStructure(OptixContextObject):
         cls = self.__class__
         cdef AccelerationStructure result = cls.__new__(cls)
 
-        memodict[id(self)] = result
-
-        result.context = self.context
+        result.context = device
         result._build_flags = self._build_flags
         result._buffer_sizes = self._buffer_sizes
-        result._instances = deepcopy(self._instances) # copy all instances and their AccelerationStructures first
-    
+
+        #if self._instances is not None:
+        #    result._instances = [inst.relocate(device=device, stream=stream) for inst in self._instances] # copy all instances and their AccelerationStructures first
+
         buffer_size = round_up(self._buffer_sizes.outputSizeInBytes, 8) + 8
         result._gas_buffer = cp.cuda.alloc(buffer_size)
         cp.cuda.runtime.memcpy(result._gas_buffer.ptr, self._gas_buffer.ptr, buffer_size, cp.cuda.runtime.memcpyDeviceToDevice)
 
-        cdef vector[OptixTraversableHandle] c_instance_handles
-        cdef ssize_t c_instance_handles_size = 0
-        cdef object d_instances
-        cdef size_t i
-        cdef CUdeviceptr d_instances_ptr = 0
-
-        if result._instances is not None:
-            # prepare the new instance handles for relocation
-            c_instance_handles.resize(len(result._instances))
-            c_instance_handles_size = sizeof(OptixTraversableHandle) * c_instance_handles.size()
-            d_instances = cp.cuda.alloc(c_instance_handles_size)
-            for i in range(c_instance_handles.size()):
-                c_instance_handles[i] = result._instances[i].traversable.handle
-            d_instances_ptr = d_instances.ptr
-            cp.cuda.runtime.memcpy(d_instances_ptr, <size_t>c_instance_handles.data(), c_instance_handles_size, cp.cuda.runtime.memcpyHostToDevice)
+        #cdef vector[OptixTraversableHandle] c_instance_handles
+        #cdef ssize_t c_instance_handles_size = 0
+        #cdef object d_instances
+        #cdef size_t i
+        #cdef CUdeviceptr d_instances_ptr = 0
+        cdef vector[OptixRelocateInput] c_relocate_inputs
+        #cdef size_t num_relocate_inputs
+        c_relocate_inputs.resize(len(self._relocate_deps))
+
+        # prepare to relocate the dependencies (micromaps and instances)
+        result._relocate_deps = []
+        for i, dep in enumerate(self._relocate_deps):
+            relocated_dep = dep.relocate(device, stream)
+            result._relocate_deps.append(relocated_dep)
+            relocated_dep.fill_relocation_input(c_relocate_inputs[i])
 
         result._handle = 0
-
         cdef uintptr_t c_stream = 0
+
+        if stream is not None:
+            c_stream = stream.ptr
+
         cdef OptixTraversableHandle c_handle = 0
         optix_check_return(optixAccelRelocate(result.context.c_context,
                                               <CUstream>c_stream,
                                               &gas_info,
-                                              d_instances_ptr,
-                                              c_instance_handles_size,
+                                              &c_relocate_inputs[0],
+                                              c_relocate_inputs.size(),
                                               result._gas_buffer,
                                               self._buffer_sizes.outputSizeInBytes,
                                               &c_handle))
         result._handle = c_handle
 
+        for dep in result._relocate_deps:
+            dep.finalize_relocation_input()
+
         return result
 
     @property
diff --git a/optix/context.pyx b/optix/context.pyx
index 0049179..a2d484a 100644
--- a/optix/context.pyx
+++ b/optix/context.pyx
@@ -4,9 +4,9 @@ from .common cimport optix_check_return, optix_init
 from libc.stdint cimport uintptr_t, int32_t
 import cupy as cp
 
-optix_init()
+__all__ = ['optix_version', 'DeviceContext']
 
-OPTIX_VERSION = _OPTIX_VERSION
+optix_init()
 
 def optix_version():
     return _OPTIX_VERSION_MAJOR, _OPTIX_VERSION_MINOR, _OPTIX_VERSION_MICRO
@@ -144,13 +144,13 @@ cdef class DeviceContext(OptixObject):
         """
         The callback function for logging
         """
-        return self._log_callback_function
+        return self._log_callback
 
     @log_callback.setter
     def log_callback(self, object log_callback_function):
-        self._log_callback_function = log_callback_function
-        if self._log_callback_function is not None:
-            optix_check_return(optixDeviceContextSetLogCallback(self.c_context, context_log_cb, <void*>self._log_callback_function, self._log_callback_level))
+        self._log_callback = log_callback_function
+        if self._log_callback is not None:
+            optix_check_return(optixDeviceContextSetLogCallback(self.c_context, context_log_cb, <void*>self._log_callback, self._log_callback_level))
 
     @property
     def log_callback_level(self):
diff --git a/optix/denoiser.pxd b/optix/denoiser.pxd
index 5172fce..661da14 100644
--- a/optix/denoiser.pxd
+++ b/optix/denoiser.pxd
@@ -6,20 +6,14 @@ from libc.stdint cimport uintptr_t
 from libcpp cimport bool
 
 cdef extern from "optix_includes.h" nogil:
-    IF _OPTIX_VERSION > 70300:
-        cdef enum OptixDenoiserModelKind:
-            OPTIX_DENOISER_MODEL_KIND_LDR
-            OPTIX_DENOISER_MODEL_KIND_HDR
-            OPTIX_DENOISER_MODEL_KIND_AOV
-            OPTIX_DENOISER_MODEL_KIND_TEMPORAL
-            OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV
-    ELSE:
-        cdef enum OptixDenoiserModelKind:
-            OPTIX_DENOISER_MODEL_KIND_LDR
-            OPTIX_DENOISER_MODEL_KIND_HDR
-            OPTIX_DENOISER_MODEL_KIND_AOV
-            OPTIX_DENOISER_MODEL_KIND_TEMPORAL
-
+    cdef enum OptixDenoiserModelKind:
+        OPTIX_DENOISER_MODEL_KIND_LDR
+        OPTIX_DENOISER_MODEL_KIND_HDR
+        OPTIX_DENOISER_MODEL_KIND_AOV
+        OPTIX_DENOISER_MODEL_KIND_TEMPORAL
+        OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV
+        OPTIX_DENOISER_MODEL_KIND_UPSCALE2X
+        OPTIX_DENOISER_MODEL_KIND_TEMPORAL_UPSCALE2X
 
     cdef struct OptixDenoiserOptions:
         unsigned int guideAlbedo
@@ -30,12 +24,22 @@ cdef extern from "optix_includes.h" nogil:
         size_t  withOverlapScratchSizeInBytes
         size_t  withoutOverlapScratchSizeInBytes
         unsigned int overlapWindowSizeInPixels
+        size_t    computeAverageColorSizeInBytes
+        size_t    computeIntensitySizeInBytes
+        size_t    internalGuideLayerPixelSizeInBytes
+
+    cdef enum OptixDenoiserAlphaMode:
+        OPTIX_DENOISER_ALPHA_MODE_COPY,
+        OPTIX_DENOISER_ALPHA_MODE_ALPHA_AS_AOV,
+        OPTIX_DENOISER_ALPHA_MODE_FULL_DENOISE_PASS
 
     cdef struct OptixDenoiserParams:
-        unsigned int denoiseAlpha
+        OptixDenoiserAlphaMode denoiseAlpha
         CUdeviceptr  hdrIntensity
         float        blendFactor
         CUdeviceptr  hdrAverageColor
+        unsigned int temporalModeUsePreviousLayers
+
 
     cdef enum OptixPixelFormat:
         OPTIX_PIXEL_FORMAT_HALF2
@@ -64,6 +68,8 @@ cdef extern from "optix_includes.h" nogil:
         OptixImage2D albedo
         OptixImage2D normal
         OptixImage2D flow
+        OptixImage2D previousOutputInternalGuideLayer
+        OptixImage2D outputInternalGuideLayer
 
     ctypedef struct OptixDenoiser:
         pass
@@ -185,6 +191,9 @@ cdef class Denoiser(OptixContextObject):
     cdef size_t _state_size
     cdef object _d_scratch
     cdef size_t _scratch_size
+    cdef size_t _guide_layer_scratch_size
+    cdef size_t _intensity_scratch_size
+    cdef size_t _average_color_scratch_size
     cdef object _d_window
     cdef size_t _window_size
     cdef object _d_intensity
diff --git a/optix/denoiser.pyx b/optix/denoiser.pyx
index a751dea..7dc9f74 100644
--- a/optix/denoiser.pyx
+++ b/optix/denoiser.pyx
@@ -1,4 +1,5 @@
 # distutils: language = c++
+import enum
 
 from .common cimport optix_check_return, optix_init
 from .context cimport DeviceContext
@@ -7,13 +8,21 @@ import numpy as np
 from enum import IntEnum
 from libcpp.vector cimport vector
 from .common import ensure_iterable
+from typing import Optional
 
 optix_init()
 
 __all__ = ['DenoiserModelKind',
-           'Denoiser'
+           'Denoiser',
+           'DenoiserAlphaMode'
            ]
 
+class DenoiserAlphaMode(enum.IntEnum):
+    COPY = OPTIX_DENOISER_ALPHA_MODE_COPY
+    ALPHA_AS_AOV = OPTIX_DENOISER_ALPHA_MODE_ALPHA_AS_AOV
+    FULL_DENOISE_PASS = OPTIX_DENOISER_ALPHA_MODE_FULL_DENOISE_PASS
+
+
 class DenoiserModelKind(IntEnum):
     """
     Wraps the OptixDenoiserModelKind enum.
@@ -22,15 +31,14 @@ class DenoiserModelKind(IntEnum):
     HDR = OPTIX_DENOISER_MODEL_KIND_HDR
     AOV = OPTIX_DENOISER_MODEL_KIND_AOV
     TEMPORAL = OPTIX_DENOISER_MODEL_KIND_TEMPORAL
-
-    IF _OPTIX_VERSION > 70300:
-        TEMPORAL_AOV = OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV
+    TEMPORAL_AOV = OPTIX_DENOISER_MODEL_KIND_TEMPORAL_AOV
+    UPSCALE2X = OPTIX_DENOISER_MODEL_KIND_UPSCALE2X
+    TEMPORAL_UPSCALE2X = OPTIX_DENOISER_MODEL_KIND_TEMPORAL_UPSCALE2X
 
     def temporal_mode(self):
-        IF _OPTIX_VERSION > 70300:
-            return self == self.TEMPORAL or self == self.TEMPORAL_AOV
-        ELSE:
-            return self == self.TEMPORAL
+        return self == self.TEMPORAL or \
+               self==self.TEMPORAL_AOV or \
+               self == self.TEMPORAL_UPSCALE2X
 
 
 class PixelFormat(IntEnum):
@@ -41,8 +49,6 @@ class PixelFormat(IntEnum):
     FLOAT3 = OPTIX_PIXEL_FORMAT_FLOAT3
     FLOAT4 = OPTIX_PIXEL_FORMAT_FLOAT4
 
-
-
     @classmethod
     def from_dtype_size(cls, dtype, size):
         try:
@@ -144,6 +150,10 @@ cdef class Denoiser(OptixContextObject):
         self.kp_mode = kp_mode
         self.tile_size = tile_size
         self._scratch_size = 0
+
+        self._guide_layer_scratch_size = 0
+        self._intensity_scratch_size = 0
+        self._average_color_scratch_size = 0
         self._state_size = 0
 
         if model_kind is not None:
@@ -188,6 +198,9 @@ cdef class Denoiser(OptixContextObject):
             self._state_size = return_sizes.stateSizeInBytes
             self._d_state = cp.cuda.alloc(return_sizes.stateSizeInBytes)
 
+        self._intensity_scratch_size = return_sizes.computeIntensitySizeInBytes
+        self._average_color_scratch_size = return_sizes.computeAverageColorSizeInBytes
+
         cdef uintptr_t c_stream = 0
 
         if stream is not None:
@@ -203,8 +216,6 @@ cdef class Denoiser(OptixContextObject):
             self._d_scratch.ptr,
             self._scratch_size))
 
-
-
     @classmethod
     def create_with_user_model(cls, DeviceContext context, unsigned char[::1] user_model not None):
         obj = cls(context, model_kind=None)
@@ -221,9 +232,10 @@ cdef class Denoiser(OptixContextObject):
                normals=None,
                flow=None,
                outputs=None,
-               denoise_alpha=False,
+               denoise_alpha: DenoiserAlphaMode = DenoiserAlphaMode.COPY,
                blend_factor=0.0,
-               stream=None):
+               stream=None,
+               temporal_use_previous_layer=False):
 
         accepted_input_types = (PixelFormat.FLOAT3, PixelFormat.FLOAT3, PixelFormat.HALF3, PixelFormat.HALF4)
         inputs = [Image2D(inp, require_type=accepted_input_types) for inp in ensure_iterable(inputs)]
@@ -284,11 +296,12 @@ cdef class Denoiser(OptixContextObject):
         self._init_denoiser(len(inputs), input_size, stream=stream)
 
         cdef OptixDenoiserParams params
-        params.denoiseAlpha = 1 if denoise_alpha else 0
         params.hdrIntensity = <CUdeviceptr>self._d_intensity.ptr if self._d_intensity is not None else 0
         params.hdrAverageColor = <CUdeviceptr>self._d_avg_color.ptr if self._d_avg_color is not None else 0
         params.blendFactor = blend_factor
 
+        params.temporalModeUsePreviousLayers = 1 if temporal_use_previous_layer and temporal_mode else 0
+        params.denoiseAlpha = <OptixDenoiserAlphaMode>denoise_alpha.value
 
         cdef uintptr_t c_stream = 0
 
@@ -297,13 +310,14 @@ cdef class Denoiser(OptixContextObject):
 
         # determinhe intensity and avg color if needed
         if self._d_intensity is not None:
+
             optix_check_return(optixDenoiserComputeIntensity(
                 self.denoiser,
                 <CUstream>c_stream,
                 &layers[0].input,
                 self._d_intensity.ptr,
                 self._d_scratch.ptr,
-                self._scratch_size))
+                self._intensity_scratch_size))
 
         if self._d_avg_color is not None:
             optix_check_return(optixDenoiserComputeAverageColor(
@@ -312,7 +326,7 @@ cdef class Denoiser(OptixContextObject):
                 &layers[0].input,
                 self._d_avg_color,
                 self._d_scratch.ptr,
-                self._scratch_size))
+                self._average_color_scratch_size))
 
 
         if self.tile_size is None:
diff --git a/optix/micromap.pxd b/optix/micromap.pxd
new file mode 100644
index 0000000..157e485
--- /dev/null
+++ b/optix/micromap.pxd
@@ -0,0 +1,169 @@
+from libc.stdint cimport uint32_t
+from libcpp.vector cimport vector
+from libcpp.pair cimport pair
+from .base cimport OptixObject
+from .common cimport OptixResult, CUstream, CUdeviceptr
+from .context cimport OptixDeviceContext, OptixContextObject
+
+
+cdef extern from "optix_micromap.h" nogil:
+    cdef packed struct float2:
+        float x
+        float y
+
+    void optixMicromapIndexToBaseBarycentrics(uint32_t microTriangleIndex,
+                                              uint32_t subdivisionLevel,
+                                              float2&  baseBarycentrics0,
+                                              float2&  baseBarycentrics1,
+                                              float2&  baseBarycentrics2)
+
+
+cdef extern from "optix.h" nogil:
+    cdef enum OptixOpacityMicromapArrayIndexingMode:
+        OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_NONE,
+        OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_LINEAR,
+        OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_INDEXED
+
+
+    cdef enum OptixOpacityMicromapFlags:
+        OPTIX_OPACITY_MICROMAP_FLAG_NONE,
+        OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_TRACE,
+        OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_BUILD
+
+
+    cdef enum OptixOpacityMicromapFormat:
+        OPTIX_OPACITY_MICROMAP_FORMAT_NONE,
+        OPTIX_OPACITY_MICROMAP_FORMAT_2_STATE  # 0: Transparent, 1: Opaque
+        OPTIX_OPACITY_MICROMAP_FORMAT_4_STATE  # 0: Transparent, 1: Opaque, 2: Unknown-Transparent, 3: Unknown-Opaque
+
+
+    cdef struct OptixOpacityMicromapHistogramEntry:
+        unsigned int count
+        unsigned int subdivisionLevel
+        OptixOpacityMicromapFormat format
+
+
+    cdef struct OptixOpacityMicromapUsageCount:
+        unsigned int count
+        unsigned int subdivisionLevel
+        OptixOpacityMicromapFormat format
+
+
+    cdef struct OptixOpacityMicromapDesc:
+        unsigned int byteOffset
+        unsigned short subdivisionLevel
+        unsigned short format
+
+
+    # get the defines for the micromap state as constant variables to access them from cython
+    cdef const unsigned char OPTIX_OPACITY_MICROMAP_STATE_TRANSPARENT_DEFINE "OPTIX_OPACITY_MICROMAP_STATE_TRANSPARENT"  # = 0
+    cdef const unsigned char OPTIX_OPACITY_MICROMAP_STATE_OPAQUE_DEFINE "OPTIX_OPACITY_MICROMAP_STATE_OPAQUE"  # = 1
+    cdef const unsigned char OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_TRANSPARENT_DEFINE "OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_TRANSPARENT"  # = 2
+    cdef const unsigned char OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_OPAQUE_DEFINE "OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_OPAQUE"  # = 3
+
+    cdef const unsigned long long OPTIX_OPACITY_MICROMAP_ARRAY_BUFFER_BYTE_ALIGNMENT "OPTIX_OPACITY_MICROMAP_ARRAY_BUFFER_BYTE_ALIGNMENT"
+
+    cdef const int OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_TRANSPARENT_DEFINE "OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_TRANSPARENT"  # = 3
+    cdef const int OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_OPAQUE_DEFINE "OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_OPAQUE"  # = 3
+    cdef const int OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_TRANSPARENT_DEFINE "OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_TRANSPARENT"  # = 3
+    cdef const int OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_OPAQUE_DEFINE "OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_OPAQUE"  # = 3
+
+
+
+    cdef struct OptixOpacityMicromapArrayBuildInput:
+        OptixOpacityMicromapFlags flags
+        CUdeviceptr inputBuffer
+        CUdeviceptr perMicromapDescBuffer
+        unsigned int perMicromapDescStrideInBytes
+        unsigned int numMicromapHistogramEntries
+        const OptixOpacityMicromapHistogramEntry * micromapHistogramEntries
+
+
+    cdef struct OptixBuildInputOpacityMicromap:
+        OptixOpacityMicromapArrayIndexingMode indexingMode
+        CUdeviceptr opacityMicromapArray
+        CUdeviceptr indexBuffer
+        unsigned int indexSizeInBytes
+        unsigned int indexStrideInBytes
+        unsigned int indexOffset
+        unsigned int numMicromapUsageCounts
+        const OptixOpacityMicromapUsageCount * micromapUsageCounts
+
+
+    cdef struct OptixMicromapBufferSizes:
+        size_t outputSizeInBytes
+        size_t tempSizeInBytes
+
+
+    cdef struct OptixMicromapBuffers:
+        CUdeviceptr output
+        size_t outputSizeInBytes
+        CUdeviceptr temp
+        size_t tempSizeInBytes
+
+
+    cdef struct OptixRelocateInputOpacityMicromap:
+        CUdeviceptr opacityMicromapArray
+
+
+    cdef struct OptixRelocationInfo:
+        unsigned long long info[4]
+
+
+    OptixResult optixOpacityMicromapArrayBuild(OptixDeviceContext context,
+                                               CUstream stream,
+                                               const OptixOpacityMicromapArrayBuildInput * buildInput,
+                                               const OptixMicromapBuffers * buffers)
+
+
+    OptixResult optixOpacityMicromapArrayComputeMemoryUsage(OptixDeviceContext context,
+                                                            const OptixOpacityMicromapArrayBuildInput * buildInput,
+                                                            OptixMicromapBufferSizes * bufferSizes)
+
+
+    OptixResult optixOpacityMicromapArrayGetRelocationInfo(OptixDeviceContext context,
+                                                           CUdeviceptr opacityMicromapArray,
+                                                           OptixRelocationInfo * info)
+
+
+    OptixResult optixOpacityMicromapArrayRelocate(OptixDeviceContext context,
+                                                  CUstream stream,
+                                                  const OptixRelocationInfo * info,
+                                                  CUdeviceptr targetOpacityMicromapArray,
+                                                  size_t targetOpacityMicromapArraySizeInBytes)
+
+
+    OptixResult optixCheckRelocationCompatibility(OptixDeviceContext context,
+                                           const OptixRelocationInfo * info,
+                                           int * compatible
+                                           )
+# cdef extern from "<functional>" namespace "std" nogil:
+#     cdef cppclass std_hash "hash"[T]:
+#         function() except +
+#         bint operator()(const T&) const
+
+
+
+cdef class OpacityMicromapInput(OptixObject):
+    cdef object buffer
+    cdef OptixOpacityMicromapFormat c_format
+    cdef unsigned int c_subdivision_level
+
+
+cdef class OpacityMicromapArray(OptixContextObject):
+    cdef object d_micromap_array_buffer
+    cdef OptixOpacityMicromapFlags _build_flags
+    cdef size_t _buffer_size
+    cdef unsigned int c_num_micromaps
+    cdef tuple _micromap_types
+
+    cdef void build(self, inputs, stream=*)
+
+
+cdef class BuildInputOpacityMicromap(OptixObject):
+    cdef OptixBuildInputOpacityMicromap build_input
+    cdef OpacityMicromapArray c_micromap_array
+    cdef object _d_index_buffer
+    cdef object _usage_counts
+    cdef vector[OptixOpacityMicromapUsageCount] c_usage_counts
+
diff --git a/optix/micromap.pyx b/optix/micromap.pyx
new file mode 100644
index 0000000..fa237cd
--- /dev/null
+++ b/optix/micromap.pyx
@@ -0,0 +1,541 @@
+# distutils: language = c++
+
+import numpy as np
+cimport numpy as np
+np.import_array()
+
+import cupy as cp
+from .common cimport optix_check_return, optix_init
+
+cimport cython
+from cython.operator import dereference
+from libc.stdint cimport uint8_t, uint16_t, uint32_t, uintptr_t
+from libcpp cimport bool
+from libcpp.vector cimport vector
+from libc.string cimport memset
+from collections import defaultdict, namedtuple
+from collections.abc import Mapping, Sequence
+from enum import IntEnum, IntFlag
+import typing as typ
+from .common import ensure_iterable
+from .context cimport DeviceContext
+
+optix_init()
+
+__all__ = ['micromap_indices_to_base_barycentrics',
+           'OpacityMicromapFormat',
+           'OpacityMicromapState',
+           'OpacityMicromapInput',
+           'OpacityMicromapArray',
+           'OpacityMicromapArrayIndexingMode',
+           'BuildInputOpacityMicromap']
+
+
+cdef bool valid_subdivision_level(uint8_t[:, :] opacity):
+    return (np.log2(opacity.shape[1]) / 2).is_integer()
+
+cdef bool is_baked(uint8_t[:, :] opacity):
+    return opacity[0, 0] > 3
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)   # Deactivate negative indexing.
+def micromap_indices_to_base_barycentrics(uint32_t[:] indices, uint32_t subdivision_level = 0):
+    """
+    Converts micromap triangle indices to three base-triangle barycentric coordinates of the micro triangle vertices.
+    The base-triangle is the triangle that the micromap is applied to.
+
+    Parameters
+    ----------
+    indices: Indices of the micro triangles within a micromap.
+    subdivision_level: Subdivision level of the micromap.
+
+    Returns
+    -------
+    base_barycentrics_0: Barycentric coordinates in the space of the base triangle of vertex 0 of the micro triangle.
+    base_barycentrics_1: Barycentric coordinates in the space of the base triangle of vertex 1 of the micro triangle.
+    base_barycentrics_2: Barycentric coordinates in the space of the base triangle of vertex 2 of the micro triangle.
+    """
+    cdef Py_ssize_t num_indices = indices.shape[0]
+
+    barycentrics_0 = np.empty((num_indices, 2), dtype=np.float32)
+    barycentrics_1 = np.empty((num_indices, 2), dtype=np.float32)
+    barycentrics_2 = np.empty((num_indices, 2), dtype=np.float32)
+
+    cdef float[:, ::1] barycentrics_0_view = barycentrics_0
+    cdef float[:, ::1] barycentrics_1_view = barycentrics_1
+    cdef float[:, ::1] barycentrics_2_view = barycentrics_2
+
+    cdef unsigned int i
+    with nogil:
+        for i in range(num_indices):
+            optixMicromapIndexToBaseBarycentrics(indices[i],
+                                                 subdivision_level,
+                                                 dereference(<float2*>&barycentrics_0_view[i, 0]),
+                                                 dereference(<float2*>&barycentrics_1_view[i, 0]),
+                                                 dereference(<float2*>&barycentrics_2_view[i, 0]))
+
+    return barycentrics_0, barycentrics_1, barycentrics_2
+
+
+@cython.boundscheck(False)  # Deactivate bounds checking
+@cython.wraparound(False)   # Deactivate negative indexing.
+@cython.cdivision(True)
+def bake_opacity_micromap(uint8_t[:, :] opacity, format = None):
+    cdef Py_ssize_t num_tris = opacity.shape[0]
+    cdef Py_ssize_t num_micro_tris = opacity.shape[1]
+
+    if not valid_subdivision_level(opacity):
+        raise ValueError(f"Shape of input ({opacity.shape[1]}) does "
+                         f"not correspond to a valid subdivision level")
+
+    cdef uint8_t bits_per_state
+
+    if format is None:
+        if np.any(np.ravel(opacity) > 1):
+            bits_per_state = 2
+        else:
+            bits_per_state = 1
+    else:
+        bits_per_state = OpacityMicromapFormat(format).value
+
+    # create the array to bake the opacities into
+    opacity_baked = np.zeros((opacity.shape[0], opacity.shape[1] // 16 * bits_per_state), dtype=np.uint16)
+    cdef unsigned int bake_stride = 16 // bits_per_state
+
+    cdef uint16_t[:, :] opacity_baked_view = opacity_baked
+    cdef unsigned int baked_index
+    cdef unsigned int i, j
+    with nogil:
+        for i in range(num_tris):
+            for j in range(0, num_micro_tris):
+                baked_index = j // bake_stride
+                opacity_baked_view[i, baked_index] |= opacity[i, j] << ((j%bake_stride) * bits_per_state)
+
+    return opacity_baked, OpacityMicromapFormat(bits_per_state)
+
+
+
+
+class OpacityMicromapArrayIndexingMode(IntEnum):
+    NONE = OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_NONE,
+    LINEAR = OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_LINEAR,
+    INDEXED = OPTIX_OPACITY_MICROMAP_ARRAY_INDEXING_MODE_INDEXED
+
+
+class OpacityMicromapFormat(IntEnum):
+    NONE = OPTIX_OPACITY_MICROMAP_FORMAT_NONE, # invalid format
+    TWO_STATE = OPTIX_OPACITY_MICROMAP_FORMAT_2_STATE  # 0: Transparent, 1: Opaque
+    FOUR_STATE = OPTIX_OPACITY_MICROMAP_FORMAT_4_STATE  # 0: Transparent, 1: Opaque, 2: Unknown-Transparent, 3: Unknown-Opaque
+
+
+class OpacityMicromapState(IntEnum):
+    """
+    This enum wraps the OPTIX_OPACITY_MICROMAP_STATE_* defines from optix
+    """
+    TRANSPARENT = OPTIX_OPACITY_MICROMAP_STATE_TRANSPARENT_DEFINE
+    OPAQUE = OPTIX_OPACITY_MICROMAP_STATE_OPAQUE_DEFINE
+    UNKNOWN_TRANSPARENT = OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_TRANSPARENT_DEFINE
+    UNKNOWN_OPAQUE = OPTIX_OPACITY_MICROMAP_STATE_UNKNOWN_OPAQUE_DEFINE
+
+    def bits_per_state(self):
+        """
+        Returns the number of bits (either 1 or 2) required to encode this micromap state.
+        """
+        if self < 2:
+            return 1
+        return 2
+
+    def format(self):
+        """
+        Returns the OpacityMicromapFormat to encode this state. This is either
+        OpacityMicromapFormat.TWO_STATE or OpacityMicromapFormat.FOUR_STATE.
+        """
+        if self < 2:
+            return OpacityMicromapFormat.TWO_STATE
+        return OpacityMicromapFormat.FOUR_STATE
+
+
+class OpacityMicromapPredefinedIndex(IntEnum):
+    """
+    This enum wraps the OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_* defines from optix.
+    It provides the indices required to mark a triangle as either fully opaque or transparent in a
+    OpacityMicromapBuildInput.
+    """
+    FULLY_TRANSPARENT = OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_TRANSPARENT_DEFINE
+    FULLY_OPAQUE = OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_OPAQUE_DEFINE
+    FULLY_UNKNOWN_TRANSPARENT = OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_TRANSPARENT_DEFINE
+    FULLY_UNKNOWN_OPAQUE = OPTIX_OPACITY_MICROMAP_PREDEFINED_INDEX_FULLY_UNKNOWN_OPAQUE_DEFINE
+
+
+OpacityMicromapType = namedtuple("OpacityMicromapType", ["format", "subdivision_level"])
+
+
+cdef class OpacityMicromapInput(OptixObject):
+    """
+    This class is a simple wrapper around an uint8-numpy array that will convert convert it into
+    the format required by the optix opacity micromaps while keeping track of the bit encoding and subdivision
+    level. The class supports inputs in an unbaked format (uint8 array with the values in OpacityMicromapFormat) as
+    well as baked formats (values encoded in either 1 or 2 bits).
+
+    Parameters
+    ----------
+    opacity: The input array in either unbaked or baked format.
+    format: Optional format specifier. Required for baked format, optional for unbaked. Invalid formats are not checked
+            for unbaked inputs.
+    """
+    def __init__(self,
+                 opacity,
+                 format: typ.Optional[OpacityMicromapFormat] = None):
+        buffer = np.atleast_2d(opacity)
+
+        format = OpacityMicromapFormat(format) if format is not None else None
+        if is_baked(buffer):
+            if not format:
+                raise ValueError("Baked input requires a format specification")
+            shape_unbaked = (buffer.dtype.itemsize * 8 * buffer.shape[1]) / format.value
+            subdivision_level = (np.log2(shape_unbaked) / 2)
+
+            if not subdivision_level.is_integer():
+                ValueError(f"Shape of baked input ({opacity.shape[1]}) does "
+                           f"not correspond to a valid subdivision level in given format ({format}).")
+        else:
+            subdivision_level = (np.log2(buffer.shape[1]) / 2)
+            buffer, fmt = bake_opacity_micromap(buffer, format)
+
+            # elif format != fmt:
+            #     raise ValueError(f"Attempt to bake the micromap input resulted in a different format than the given one. "
+            #                      f"{format} != {fmt}.")
+
+        self.buffer = buffer
+        self.c_format = <OptixOpacityMicromapFormat>format.value
+        self.c_subdivision_level = subdivision_level
+
+    @property
+    def format(self):
+        return OpacityMicromapFormat(self.c_format)
+
+    @property
+    def subdivision_level(self):
+        return self.c_subdivision_level
+
+    @property
+    def ntriangles(self):
+        return self.buffer.shape[0]
+
+    @property
+    def nbytes(self):
+        return self.buffer.size * self.buffer.itemsize
+
+    def _repr_details(self):
+        return f"ntriangles={self.ntriangles}, format={self.format.name}, subdivision_level={self.subdivision_level}"
+# ctypedef pair[OptixOpacityMicromapFormat, int] histogram_entry
+#
+# cdef bint histogram_entry_hash(const histogram_entry& s) nogil:
+#     return std_hash[OptixOpacityMicromapFormat]()(s.first) ^ std_hash[int]()(s.second)
+
+
+
+cdef class OpacityMicromapArray(OptixContextObject):
+    """
+    Class representing an array of opacity micromaps on the optix device.
+    This class wraps the internal GPU buffer containing the micromap data and serves to build the structure from
+    one or multiple OpactiyMircomap inputs
+
+    Parameters
+    ----------
+    context:
+        The device context to use.
+    inputs:
+        An iterable of OpacityMicromapInput or numpy ndarrays. All numpy arrays will be converted into
+        OpacityMicroMapInput automatically in this class.
+    flags:
+        A set of OpacityMicromapFlags to use for building this array. If None, the default OpacityMicromapFlags.NONE
+        is used.
+    stream:
+        Cuda stream to use for building this micromap array. If None the default stream is used.
+    """
+    def __init__(self,
+                 context: DeviceContext,
+                 inputs: typ.Iterable[typ.Union[np.ndarray, OpacityMicromapInput]],
+                 prefer_fast_build: bool = False,
+                 stream: typ.Optional[cp.cuda.Stream] = None):
+        super().__init__(context)
+        self.d_micromap_array_buffer = None
+        self._micromap_types = None
+        self._build_flags = OPTIX_OPACITY_MICROMAP_FLAG_NONE
+        if prefer_fast_build:
+            self._build_flags = OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_BUILD
+        else:
+            self._build_flags = OPTIX_OPACITY_MICROMAP_FLAG_PREFER_FAST_TRACE
+        self.build(inputs, stream=stream)
+
+    cdef void build(self, inputs, stream=None):
+        # convert all inputs into the correct format first
+        inputs = [OpacityMicromapInput(inp) if not isinstance(inp, OpacityMicromapInput)
+                  else inp for inp in ensure_iterable(inputs)]
+
+        cdef OptixOpacityMicromapArrayBuildInput build_input
+        build_input.flags = self._build_flags
+
+        cdef size_t inputs_size_in_bytes = 0
+        micromap_counts = defaultdict(lambda: 0)
+        micromap_types = []
+
+        self.c_num_micromaps = 0
+        # build the histogram from the input specifications and convert it into a cpp vector to pass it to the build input
+        for i in inputs:
+            omm_type = OpacityMicromapType(i.format, i.subdivision_level)
+            micromap_counts[omm_type] += i.ntriangles
+            micromap_types.append(omm_type)
+            inputs_size_in_bytes += i.nbytes
+            self.c_num_micromaps += i.ntriangles
+        self._micromap_types = tuple(micromap_types)
+
+        cdef vector[OptixOpacityMicromapHistogramEntry] histogram_entries
+        histogram_entries.resize(len(micromap_counts))
+        build_input.numMicromapHistogramEntries = histogram_entries.size()
+
+        for i, (k, v) in enumerate(micromap_counts.items()):
+            histogram_entries[i].count = v
+            histogram_entries[i].format = <OptixOpacityMicromapFormat>k.format.value
+            histogram_entries[i].subdivisionLevel = k.subdivision_level
+
+        build_input.micromapHistogramEntries = histogram_entries.data()
+        del micromap_counts
+
+        # allocate a buffer to hold all input micromaps and put it's pointer in the build input
+        d_input_buffer = cp.cuda.alloc(inputs_size_in_bytes)
+        build_input.inputBuffer = d_input_buffer.ptr
+
+        cdef unsigned int offset = 0
+        cdef vector[OptixOpacityMicromapDesc] descs
+        cdef uint16_t[:, :] buffer_view
+        cdef unsigned int t
+        cdef unsigned int desc_i = 0;
+
+        descs.resize(self.c_num_micromaps)
+        #TODO use the actual triangles in the input array here!
+        # copy all input data into the device buffer
+        for i, inp in enumerate(inputs):
+            buffer_view = (<OpacityMicromapInput>inp).buffer
+            num_bytes = inp.nbytes
+            cp.cuda.runtime.memcpy(d_input_buffer.ptr + offset,
+                                   <uintptr_t>&buffer_view[0,0],
+                                   <size_t>num_bytes,
+                                   cp.cuda.runtime.memcpyHostToDevice)
+            for t in range(inp.ntriangles):
+                # fill the descriptor array at the same time with to information in input
+                descs[desc_i].byteOffset = offset
+                offset += buffer_view.shape[1] * sizeof(uint16_t)
+                descs[desc_i].subdivisionLevel = <unsigned short>inp.subdivision_level
+                descs[desc_i].format = <unsigned short>inp.format.value
+                desc_i += 1
+
+        # copy the descriptor array onto the device
+        cdef size_t desc_size_in_bytes = descs.size() * sizeof(OptixOpacityMicromapDesc)
+
+        d_desc_buffer = cp.cuda.alloc(desc_size_in_bytes)
+        cp.cuda.runtime.memcpy(d_desc_buffer.ptr, <uintptr_t>descs.data(), desc_size_in_bytes, cp.cuda.runtime.memcpyHostToDevice)
+
+        build_input.perMicromapDescBuffer = d_desc_buffer.ptr
+        build_input.perMicromapDescStrideInBytes = 0
+
+        cdef OptixMicromapBufferSizes build_sizes
+
+        optix_check_return(optixOpacityMicromapArrayComputeMemoryUsage(self.context.c_context,
+                                                                       &build_input,
+                                                                       &build_sizes))
+        # TODO: do we have to align this buffer?
+        self.d_micromap_array_buffer = cp.cuda.alloc(build_sizes.outputSizeInBytes)
+        self._buffer_size = build_sizes.outputSizeInBytes
+
+        d_temp_buffer = cp.cuda.alloc(build_sizes.tempSizeInBytes)
+
+        cdef OptixMicromapBuffers micromap_buffers
+
+        micromap_buffers.tempSizeInBytes = build_sizes.tempSizeInBytes
+        micromap_buffers.temp = d_temp_buffer.ptr
+
+        micromap_buffers.outputSizeInBytes = build_sizes.outputSizeInBytes
+        micromap_buffers.output = self.d_micromap_array_buffer.ptr
+
+        cdef uintptr_t c_stream = 0
+
+        if stream is not None:
+            c_stream = stream.ptr
+        with nogil:
+            optix_check_return(optixOpacityMicromapArrayBuild(self.context.c_context,
+                                                              <CUstream>c_stream,
+                                                              &build_input,
+                                                              &micromap_buffers))
+        # all temporary buffers will be freed automatically here
+
+    @property
+    def types(self):
+        return self._micromap_types
+
+    def __deepcopy__(self, memo):
+        """
+        Perform a deep copy of the OpactiyMicromap by using the standard python copy.deepcopy function.
+        """
+        # relocate on the same device to perform a regular deep copy
+        result = self.relocate(device=None)
+        memo[id(self)] = result
+        return result
+
+    def _repr_details(self):
+        return f"size={self._buffer_size}, nmicromaps={self.c_num_micromaps}"
+
+    def relocate(self,
+                 device: typ.Optional[DeviceContext] = None,
+                 stream: typ.Optional[cp.cuda.Stream] = None) -> OpacityMicromapArray:
+        """
+        Relocate this opacity micromap array into another copy. Usually this is equivalent to a deep copy.
+        Additionally, the micromap array can be copied to a different device by specifying the device
+        parameter with a different DeviceContext.
+
+        Parameters
+        ----------
+        device:
+            An optional DeviceContext. The resulting copy of the micromap array will be copied
+            to this device. If None, the micromap array's current device is used.
+        stream:
+            The stream to use for the relocation. If None, the default stream (0) is used.
+
+        Returns
+        -------
+        copy: OpacityMicromapArray
+            The copy of the OpacityMicromapArray on the new device
+        """
+        # first determine the relocation info for this micromap array
+        cdef OptixRelocationInfo micromap_info
+        memset(&micromap_info, 0, sizeof(OptixRelocationInfo))  # init struct to 0
+
+        optix_check_return(optixOpacityMicromapArrayGetRelocationInfo(self.context.c_context,
+                                                                      self.d_micromap_array_buffer, &micromap_info))
+
+        if device is None:
+            device = self.context
+
+        # check if the new device is compatible with this micromap array
+        cdef int compatible = 0
+        optix_check_return(optixCheckRelocationCompatibility(<OptixDeviceContext>(<DeviceContext>device).c_context,
+                                                                  &micromap_info,
+                                                                  &compatible))
+
+        if compatible != 1:
+            raise RuntimeError("Device is not compatible for relocation of opacity micromap array")
+
+        # do the relocation
+        cls = self.__class__
+        cdef OpacityMicromapArray result = cls.__new__(cls)
+
+        result.context = device
+        result._build_flags = self._build_flags
+        result._buffer_size = self._buffer_size
+        # TODO: align this?
+        result._gas_buffer = cp.cuda.alloc(result._buffer_size)
+        cp.cuda.runtime.memcpy(result.d_micromap_array_buffer.ptr,
+                               self.d_micromap_array_buffer.ptr,
+                               result._buffer_size,
+                               cp.cuda.runtime.memcpyDeviceToDevice)
+
+        cdef uintptr_t c_stream = 0
+        if stream is not None:
+            c_stream = stream.ptr
+
+        optix_check_return(optixOpacityMicromapArrayRelocate(result.context.c_context,
+                                                             <CUstream>c_stream,
+                                                             &micromap_info,
+                                                             result.d_micromap_array_buffer.ptr,
+                                                             result._buffer_size))
+
+        return result
+
+
+cdef class BuildInputOpacityMicromap(OptixObject):
+    """
+    Build input for an array of micromaps. Inputs of this type can optionally be passed to a
+    BuildInputTriangleArray to use micromaps for it's triangles. Additionally an array of the usage_counts
+    for the OMMs in the Array needs to be passed as a list.
+    If the indexing mode is specified as INDEXED, an additional index buffer containing an index into the omm array or one of
+    the values in OpacityMicromapPredefinedIndex is required.
+
+    Parameters
+    ----------
+    omm_array:
+        The OpacityMicromapArray to use by the triangles.
+    usage_counts:
+        The number of times each omm in the OpacityMicromapArray is used.
+    indexing_mode:
+        The indexing mode the omms should use. Must be one of the values in OpacityMicromapArrayIndexingMode.
+        By default NONE is used, disabling the use of OMMs
+    index_buffer:
+        If the indexing_mode is INDEXED, this additional index buffer, specifiing an omm to use or the default value
+        for each triangle in the geometry is required.
+    """
+    def __init__(self,
+                 OpacityMicromapArray omm_array,
+                 usage_counts: Sequence[int],
+                 indexing_mode: OpacityMicromapArrayIndexingMode = OpacityMicromapArrayIndexingMode.NONE,
+                 index_buffer = None):
+        indexing_mode = OpacityMicromapArrayIndexingMode(indexing_mode)
+
+        if indexing_mode == OpacityMicromapArrayIndexingMode.INDEXED:
+            if index_buffer is None:
+                raise ValueError("index_buffer is required for indexing_mode=INDEXED.")
+            if not any(np.issubdtype(index_buffer.dtype, dt) for dt in (np.uint16, np.uint32)):
+                raise ValueError("index_buffer must be of dtype np.uint16 or np.uint32")
+            self._d_index_buffer = cp.asarray(index_buffer).ravel()
+            # enable the index buffer in the build_input struct
+            self.build_input.indexBuffer = self._d_index_buffer.data.ptr
+            self.build_input.indexOffset = 0
+            self.build_input.indexSizeInBytes = self._d_index_buffer.itemsize
+            self.build_input.indexStrideInBytes = self._d_index_buffer.strides[0]
+        else:
+            self.build_input.indexBuffer = 0
+            self.build_input.indexOffset = 0
+            self.build_input.indexSizeInBytes = 0
+            self.build_input.indexStrideInBytes = 0
+
+        self.build_input.indexingMode = <OptixOpacityMicromapArrayIndexingMode>indexing_mode.value
+        self.c_micromap_array = omm_array
+        self.build_input.opacityMicromapArray = self.c_micromap_array.d_micromap_array_buffer.ptr
+
+        # fill the usage counts vector from the specified usage array
+        # TODO: is there a way to determine the usage count automatically?
+        micromap_types = self.c_micromap_array.types
+        if len(usage_counts) != len(micromap_types):
+            raise IndexError(f"Number of entries in usage_count must be equal to the number of omms in micromap_array. "
+                             f"Expected {len(micromap_types)}), got ({len(usage_counts)})")
+        usage_count_hist = defaultdict(lambda: 0)
+
+        for type, usage_count in zip(micromap_types, usage_counts):
+            usage_count_hist[type] += usage_count
+
+        self._usage_counts = usage_count_hist
+        self.c_usage_counts.resize(len(usage_counts))
+        for i, (k, v) in enumerate(usage_count_hist.items()):
+            self.c_usage_counts[i].count = v
+            self.c_usage_counts[i].format = <OptixOpacityMicromapFormat>k.format.value
+            self.c_usage_counts[i].subdivisionLevel = k.subdivision_level
+
+        self.build_input.micromapUsageCounts = self.c_usage_counts.data()
+        self.build_input.numMicromapUsageCounts = self.c_usage_counts.size()
+
+    @property
+    def usage_counts(self):
+        return self._usage_counts
+
+    @property
+    def types(self):
+        return self.c_micromap_array.types
+
+    @property
+    def micromap_array(self):
+        return self.c_micromap_array
+
+    @property
+    def index_buffer(self):
+        return self._d_index_buffer
\ No newline at end of file
diff --git a/optix/module.pxd b/optix/module.pxd
index 94bb927..e5ba795 100644
--- a/optix/module.pxd
+++ b/optix/module.pxd
@@ -28,60 +28,48 @@ cdef extern from "optix_includes.h" nogil:
         const char* annotation
 
 
-    IF _OPTIX_VERSION > 70300:  # switch to new version
-        cdef size_t OPTIX_COMPILE_DEFAULT_MAX_PAYLOAD_TYPE_COUNT
-
-        cdef enum OptixPayloadSemantics:
-            OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_NONE,
-            OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ,
-            OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_CH_NONE,
-            OPTIX_PAYLOAD_SEMANTICS_CH_READ,
-            OPTIX_PAYLOAD_SEMANTICS_CH_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_MS_NONE,
-            OPTIX_PAYLOAD_SEMANTICS_MS_READ,
-            OPTIX_PAYLOAD_SEMANTICS_MS_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_AH_NONE,
-            OPTIX_PAYLOAD_SEMANTICS_AH_READ,
-            OPTIX_PAYLOAD_SEMANTICS_AH_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_IS_NONE,
-            OPTIX_PAYLOAD_SEMANTICS_IS_READ,
-            OPTIX_PAYLOAD_SEMANTICS_IS_WRITE,
-            OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE,
-
-        cdef struct OptixPayloadType:
-            unsigned int numPayloadValues
-            const unsigned int* payloadSemantics
-
-        cdef struct OptixModuleCompileOptions:
-            int maxRegisterCount
-            OptixCompileOptimizationLevel optLevel
-            OptixCompileDebugLevel 	debugLevel
-            const OptixModuleCompileBoundValueEntry* boundValues
-            unsigned int numBoundValues
-            unsigned int numPayloadTypes
-            OptixPayloadType* payloadTypes
-
-        cdef struct OptixBuiltinISOptions:
-            OptixPrimitiveType builtinISModuleType
-            int usesMotionBlur
-            unsigned int buildFlags
-            unsigned int curveEndcapFlags
-    ELSE:
-        cdef struct OptixModuleCompileOptions:
-            int maxRegisterCount
-            OptixCompileOptimizationLevel optLevel
-            OptixCompileDebugLevel 	debugLevel
-            const OptixModuleCompileBoundValueEntry* boundValues
-            unsigned int numBoundValues
-
-        cdef struct OptixBuiltinISOptions:
-            OptixPrimitiveType builtinISModuleType
-            int usesMotionBlur
+    cdef size_t OPTIX_COMPILE_DEFAULT_MAX_PAYLOAD_TYPE_COUNT
+
+    cdef enum OptixPayloadSemantics:
+        OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_NONE,
+        OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ,
+        OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_CH_NONE,
+        OPTIX_PAYLOAD_SEMANTICS_CH_READ,
+        OPTIX_PAYLOAD_SEMANTICS_CH_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_MS_NONE,
+        OPTIX_PAYLOAD_SEMANTICS_MS_READ,
+        OPTIX_PAYLOAD_SEMANTICS_MS_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_AH_NONE,
+        OPTIX_PAYLOAD_SEMANTICS_AH_READ,
+        OPTIX_PAYLOAD_SEMANTICS_AH_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_IS_NONE,
+        OPTIX_PAYLOAD_SEMANTICS_IS_READ,
+        OPTIX_PAYLOAD_SEMANTICS_IS_WRITE,
+        OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE,
+
+    cdef struct OptixPayloadType:
+        unsigned int numPayloadValues
+        const unsigned int* payloadSemantics
+
+    cdef struct OptixModuleCompileOptions:
+        int maxRegisterCount
+        OptixCompileOptimizationLevel optLevel
+        OptixCompileDebugLevel 	debugLevel
+        const OptixModuleCompileBoundValueEntry* boundValues
+        unsigned int numBoundValues
+        unsigned int numPayloadTypes
+        OptixPayloadType* payloadTypes
+
+    cdef struct OptixBuiltinISOptions:
+        OptixPrimitiveType builtinISModuleType
+        int usesMotionBlur
+        unsigned int buildFlags
+        unsigned int curveEndcapFlags
 
 
     OptixResult optixModuleCreateFromPTX(OptixDeviceContext context,
@@ -104,52 +92,50 @@ cdef extern from "optix_includes.h" nogil:
                                         OptixModule *builtinModule)
 
 
-    IF _OPTIX_VERSION > 70300:  # switch to new version
-        ctypedef struct OptixTask:
-            pass
+    ctypedef struct OptixTask:
+        pass
 
-        cdef enum OptixModuleCompileState:
-            OPTIX_MODULE_COMPILE_STATE_NOT_STARTED
-            OPTIX_MODULE_COMPILE_STATE_STARTED
-            OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE
-            OPTIX_MODULE_COMPILE_STATE_FAILED
-            OPTIX_MODULE_COMPILE_STATE_COMPLETED
+    cdef enum OptixModuleCompileState:
+        OPTIX_MODULE_COMPILE_STATE_NOT_STARTED
+        OPTIX_MODULE_COMPILE_STATE_STARTED
+        OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE
+        OPTIX_MODULE_COMPILE_STATE_FAILED
+        OPTIX_MODULE_COMPILE_STATE_COMPLETED
 
-        cdef OptixResult optixModuleGetCompilationState(OptixModule module,
-                                                        OptixModuleCompileState * state)
+    cdef OptixResult optixModuleGetCompilationState(OptixModule module,
+                                                    OptixModuleCompileState * state)
 
-        cdef OptixResult optixModuleCreateFromPTXWithTasks(OptixDeviceContext context,
-                                                           const OptixModuleCompileOptions * moduleCompileOptions,
-                                                           const OptixPipelineCompileOptions * pipelineCompileOptions,
-                                                           const char * PTX,
-                                                           size_t PTXsize,
-                                                           char * logString,
-                                                           size_t * logStringSize,
-                                                           OptixModule * module,
-                                                           OptixTask * firstTask)
+    cdef OptixResult optixModuleCreateFromPTXWithTasks(OptixDeviceContext context,
+                                                       const OptixModuleCompileOptions * moduleCompileOptions,
+                                                       const OptixPipelineCompileOptions * pipelineCompileOptions,
+                                                       const char * PTX,
+                                                       size_t PTXsize,
+                                                       char * logString,
+                                                       size_t * logStringSize,
+                                                       OptixModule * module,
+                                                       OptixTask * firstTask)
+
+    cdef OptixResult optixTaskExecute(OptixTask task,
+                                 OptixTask * additionalTasks,
+                                 unsigned int maxNumAdditionalTasks,
+                                 unsigned int *numAdditionalTasksCreated)
 
-        cdef OptixResult optixTaskExecute(OptixTask task,
-                                     OptixTask * additionalTasks,
-                                     unsigned int maxNumAdditionalTasks,
-                                     unsigned int *numAdditionalTasksCreated)
 
 
 cdef class BuiltinISOptions(OptixObject):
     cdef OptixBuiltinISOptions options
 
+
 cdef class Module(OptixContextObject):
     cdef OptixModule module
     cdef list _compile_flags
 
-IF _OPTIX_VERSION > 70300:  # switch to new version
-    cdef class ModuleCompileOptions(OptixObject):
-        cdef OptixModuleCompileOptions compile_options
-        cdef vector[OptixPayloadType] payload_types
-        cdef vector[vector[unsigned int]] payload_values # WTF!
-
-    cdef class Task(OptixObject):
-        cdef OptixTask task
-        cdef Module module
-ELSE:
-    cdef class ModuleCompileOptions(OptixObject):
-        cdef OptixModuleCompileOptions compile_options
\ No newline at end of file
+
+cdef class ModuleCompileOptions(OptixObject):
+    cdef OptixModuleCompileOptions compile_options
+    cdef vector[OptixPayloadType] payload_types
+    cdef vector[vector[unsigned int]] payload_values # WTF!
+
+cdef class Task(OptixObject):
+    cdef OptixTask task
+    cdef Module module
\ No newline at end of file
diff --git a/optix/module.pyx b/optix/module.pyx
index e991838..a5ea10b 100644
--- a/optix/module.pyx
+++ b/optix/module.pyx
@@ -2,7 +2,8 @@
 
 from enum import IntEnum, IntFlag
 import os
-from .path_utility import get_cuda_include_path, get_optix_include_path
+import warnings
+from .path_utility import get_cuda_include_path, get_optix_include_path, get_local_optix_include_path
 from .common cimport optix_check_return, optix_init
 from .context cimport DeviceContext
 from .pipeline cimport PipelineCompileOptions
@@ -14,6 +15,9 @@ from libcpp.vector cimport vector
 
 optix_init()
 
+__all__ = ['Module', 'ModuleCompileOptions', 'CompileOptimizationLevel', 'CompileDebugLevel',
+           'PayloadSemantics', 'Task']
+
 class CompileOptimizationLevel(IntEnum):
     """
     Wraps the OptixCompileOptimizationLevel enum
@@ -24,93 +28,92 @@ class CompileOptimizationLevel(IntEnum):
     LEVEL_2 = OPTIX_COMPILE_OPTIMIZATION_LEVEL_2,
     LEVEL_3 = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3,
 
-IF _OPTIX_VERSION > 70300:
-    class PayloadSemantics(IntFlag):
-        """
-        Wraps the PayloadSemantics enum.
-        """
+class PayloadSemantics(IntFlag):
+    """
+    Wraps the PayloadSemantics enum.
+    """
+
+    DEFAULT = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE # allow everything as default
+    TRACE_CALLER_NONE = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_NONE,
+    TRACE_CALLER_READ = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ,
+    TRACE_CALLER_WRITE = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_WRITE,
+    TRACE_CALLER_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE,
+    CH_NONE = OPTIX_PAYLOAD_SEMANTICS_CH_NONE,
+    CH_READ = OPTIX_PAYLOAD_SEMANTICS_CH_READ,
+    CH_WRITE = OPTIX_PAYLOAD_SEMANTICS_CH_WRITE,
+    CH_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE,
+    MS_NONE = OPTIX_PAYLOAD_SEMANTICS_MS_NONE,
+    MS_READ = OPTIX_PAYLOAD_SEMANTICS_MS_READ,
+    MS_WRITE = OPTIX_PAYLOAD_SEMANTICS_MS_WRITE,
+    MS_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE,
+    AH_NONE = OPTIX_PAYLOAD_SEMANTICS_AH_NONE,
+    AH_READ = OPTIX_PAYLOAD_SEMANTICS_AH_READ,
+    AH_WRITE = OPTIX_PAYLOAD_SEMANTICS_AH_WRITE,
+    AH_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE,
+    IS_NONE = OPTIX_PAYLOAD_SEMANTICS_IS_NONE,
+    IS_READ = OPTIX_PAYLOAD_SEMANTICS_IS_READ,
+    IS_WRITE = OPTIX_PAYLOAD_SEMANTICS_IS_WRITE,
+    IS_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE
+
+class ModuleCompileState(IntFlag):
+    NOT_STARTED = OPTIX_MODULE_COMPILE_STATE_NOT_STARTED,
+    STARTED = OPTIX_MODULE_COMPILE_STATE_STARTED,
+    IMPENDING_FAILURE = OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE,
+    FAILED = OPTIX_MODULE_COMPILE_STATE_FAILED,
+    COMPLETED = OPTIX_MODULE_COMPILE_STATE_COMPLETED,
+
+
+cdef class Task(OptixObject):
+    """
+    Class to represent a parallel Task to compile an OptiX module.
+    A Task can be executed in parallel by e.g. a thread pool to handle lots of module compilations concurrently.
+    It is only valid as long as the corresponding module exists, therefore in this wrapper a reference to the module
+    if stored.
+
+    Note, that a Task is not supposed to be created by the user directly, but provided by the create_as_task method
+    of the Module class.
 
-        DEFAULT = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE | OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE # allow everything as default
-        TRACE_CALLER_NONE = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_NONE,
-        TRACE_CALLER_READ = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ,
-        TRACE_CALLER_WRITE = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_WRITE,
-        TRACE_CALLER_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_TRACE_CALLER_READ_WRITE,
-        CH_NONE = OPTIX_PAYLOAD_SEMANTICS_CH_NONE,
-        CH_READ = OPTIX_PAYLOAD_SEMANTICS_CH_READ,
-        CH_WRITE = OPTIX_PAYLOAD_SEMANTICS_CH_WRITE,
-        CH_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_CH_READ_WRITE,
-        MS_NONE = OPTIX_PAYLOAD_SEMANTICS_MS_NONE,
-        MS_READ = OPTIX_PAYLOAD_SEMANTICS_MS_READ,
-        MS_WRITE = OPTIX_PAYLOAD_SEMANTICS_MS_WRITE,
-        MS_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_MS_READ_WRITE,
-        AH_NONE = OPTIX_PAYLOAD_SEMANTICS_AH_NONE,
-        AH_READ = OPTIX_PAYLOAD_SEMANTICS_AH_READ,
-        AH_WRITE = OPTIX_PAYLOAD_SEMANTICS_AH_WRITE,
-        AH_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_AH_READ_WRITE,
-        IS_NONE = OPTIX_PAYLOAD_SEMANTICS_IS_NONE,
-        IS_READ = OPTIX_PAYLOAD_SEMANTICS_IS_READ,
-        IS_WRITE = OPTIX_PAYLOAD_SEMANTICS_IS_WRITE,
-        IS_READ_WRITE = OPTIX_PAYLOAD_SEMANTICS_IS_READ_WRITE
-
-    class ModuleCompileState(IntFlag):
-        NOT_STARTED = OPTIX_MODULE_COMPILE_STATE_NOT_STARTED,
-        STARTED = OPTIX_MODULE_COMPILE_STATE_STARTED,
-        IMPENDING_FAILURE = OPTIX_MODULE_COMPILE_STATE_IMPENDING_FAILURE,
-        FAILED = OPTIX_MODULE_COMPILE_STATE_FAILED,
-        COMPLETED = OPTIX_MODULE_COMPILE_STATE_COMPLETED,
-
-
-    cdef class Task(OptixObject):
+    Parameters
+    ----------
+    module: Module
+        The module this Task belongs to.
+    """
+    def __init__(self, Module module):
+        self.module = module
+        self.task = <OptixTask>NULL
+
+    def execute(self, max_additional_tasks=2):
         """
-        Class to represent a parallel Task to compile an OptiX module.
-        A Task can be executed in parallel by e.g. a thread pool to handle lots of module compilations concurrently.
-        It is only valid as long as the corresponding module exists, therefore in this wrapper a reference to the module
-        if stored.
+        Execute the Task. If more parallel work is found, it will be returned as a new list of Task objects.
+        The list has a maximum size of max_additional_tasks.
 
-        Note, that a Task is not supposed to be created by the user directly, but provided by the create_as_task method
-        of the Module class.
+        Node, that each Task can only be executed by a single thread.
 
         Parameters
         ----------
-        module: Module
-            The module this Task belongs to.
+        max_additional_tasks: int
+            The maximum number of new Tasks to create from this one
+
+        Returns
+        -------
+        tasks: List[Task]
+            The newly created tasks if any
         """
-        def __init__(self, Module module):
-            self.module = module
-            self.task = <OptixTask>NULL
-
-        def execute(self, max_additional_tasks=2):
-            """
-            Execute the Task. If more parallel work is found, it will be returned as a new list of Task objects.
-            The list has a maximum size of max_additional_tasks.
-
-            Node, that each Task can only be executed by a single thread.
-
-            Parameters
-            ----------
-            max_additional_tasks: int
-                The maximum number of new Tasks to create from this one
-
-            Returns
-            -------
-            tasks: List[Task]
-                The newly created tasks if any
-            """
-            cdef vector[OptixTask] additional_tasks
-            cdef unsigned int i
-            cdef unsigned int additional_tasks_created = 0
-            cdef unsigned int max_num_additional_tasks = max_additional_tasks
-
-            with nogil:
-                additional_tasks.resize(max_num_additional_tasks)
-                optix_check_return(optixTaskExecute(self.task, additional_tasks.data(), max_num_additional_tasks, &additional_tasks_created))
-
-            cdef list tasks = []
-            for i in range(additional_tasks_created):
-                t = Task(self.module)
-                t.task = additional_tasks[i]
-                tasks.append(t)
-            return tasks
+        cdef vector[OptixTask] additional_tasks
+        cdef unsigned int i
+        cdef unsigned int additional_tasks_created = 0
+        cdef unsigned int max_num_additional_tasks = max_additional_tasks
+
+        with nogil:
+            additional_tasks.resize(max_num_additional_tasks)
+            optix_check_return(optixTaskExecute(self.task, additional_tasks.data(), max_num_additional_tasks, &additional_tasks_created))
+
+        cdef list tasks = []
+        for i in range(additional_tasks_created):
+            t = Task(self.module)
+            t.task = additional_tasks[i]
+            tasks.append(t)
+        return tasks
 
 
 cdef class ModuleCompileOptions(OptixObject):
@@ -132,25 +135,22 @@ cdef class ModuleCompileOptions(OptixObject):
         self.compile_options.numBoundValues = 0
         self.compile_options.boundValues = NULL # currently not supported
 
-        IF _OPTIX_VERSION > 70300:
-            if payload_types is None:
-                self.compile_options.numPayloadTypes = 0
-                self.compile_options.payloadTypes = NULL
-            else:
-                # set the payload types for these compile options (this is horrible, i know ;))
-                payload_types = [ensure_iterable(pt) for pt in ensure_iterable(payload_types)] # list of lists
-                self.payload_types.resize(len(payload_types)) # the number of different payload types
-                self.payload_values.resize(self.payload_types.size()) # a vector of semantics for each payload type
-                self.compile_options.numPayloadTypes = self.payload_types.size()
-                for i, payload_values in enumerate(payload_types):
-                    self.payload_types[i].numPayloadValues = len(payload_values)
-                    self.payload_values[i].resize(self.payload_types[i].numPayloadValues)
-                    for j, payload_semantics in enumerate(payload_values):
-                        self.payload_values[i][j] = payload_semantics.value
-                    self.payload_types[i].payloadSemantics = self.payload_values[i].data()
-                self.compile_options.payloadTypes = self.payload_types.data()
-
-
+        if payload_types is None:
+            self.compile_options.numPayloadTypes = 0
+            self.compile_options.payloadTypes = NULL
+        else:
+            # set the payload types for these compile options (this is horrible, i know ;))
+            payload_types = [ensure_iterable(pt) for pt in ensure_iterable(payload_types)] # list of lists
+            self.payload_types.resize(len(payload_types)) # the number of different payload types
+            self.payload_values.resize(self.payload_types.size()) # a vector of semantics for each payload type
+            self.compile_options.numPayloadTypes = self.payload_types.size()
+            for i, payload_values in enumerate(payload_types):
+                self.payload_types[i].numPayloadValues = len(payload_values)
+                self.payload_values[i].resize(self.payload_types[i].numPayloadValues)
+                for j, payload_semantics in enumerate(payload_values):
+                    self.payload_values[i][j] = payload_semantics.value
+                self.payload_types[i].payloadSemantics = self.payload_values[i].data()
+            self.compile_options.payloadTypes = self.payload_types.data()
 
     @property
     def max_register_count(self):
@@ -177,7 +177,7 @@ cdef class ModuleCompileOptions(OptixObject):
         self.compile_options.debugLevel = level.value
 
 
-cdef tuple _nvrtc_compile_flags_default = ('-use_fast_math', '-lineinfo', '-default-device', '-std=c++11', '-rdc', 'true')
+cdef tuple _nvrtc_compile_flags_default = ('-use_fast_math', '-default-device', '-std=c++11', '-rdc', 'true')
 
 def get_default_nvrtc_compile_flags(std=None, rdc=False):
     flags = list(_nvrtc_compile_flags_default[:-3])
@@ -198,25 +198,24 @@ cdef _is_ptx(src):
 cdef class BuiltinISOptions(OptixObject):
     def __init__(self,
                  primitive_type,
-                 build_flags=None,
+                 build_flags,
                  uses_motion_blur=False,
                  curve_endcap_flags=None):
         self.options.builtinISModuleType = primitive_type.value
         self.options.usesMotionBlur = uses_motion_blur
 
-        IF _OPTIX_VERSION > 70300:
-            if build_flags is None:
-                raise ValueError("Parameter build_flags is required for OptiX versions >= 7.4.")
-            self.options.buildFlags = build_flags.value
-            if curve_endcap_flags is None:
-                curve_endcap_flags = CurveEndcapFlags.DEFAULT
-            self.options.curveEndcapFlags = curve_endcap_flags.value
+        self.options.buildFlags = build_flags.value
+        if curve_endcap_flags is None:
+            curve_endcap_flags = CurveEndcapFlags.DEFAULT
+        self.options.curveEndcapFlags = curve_endcap_flags.value
 
 
 cdef class Module(OptixContextObject):
     """
     Class representing a Optix Cuda program that will be called during pipeline execution. Wraps the OptixModule struct.
 
+    TODO: support creating modules through nvcc instead of nvrtc as well to support the new optix-ir format in 7.5
+
     Parameters
     ----------
     context: DeviceContext
@@ -237,19 +236,23 @@ cdef class Module(OptixContextObject):
                  src,
                  ModuleCompileOptions module_compile_options = ModuleCompileOptions(),
                  PipelineCompileOptions pipeline_compile_options = PipelineCompileOptions(),
-                 compile_flags=_nvrtc_compile_flags_default,
+                 compile_flags=None,
                  program_name=None):
         super().__init__(context)
         cdef const char * c_ptx
         cdef unsigned int pipeline_payload_values, i
+
+        if compile_flags is None:
+            compile_flags = _nvrtc_compile_flags_default
+
         self._compile_flags = list(compile_flags)
 
+        if module_compile_options.debug_level != CompileDebugLevel.NONE:
+            self._compile_flags.append("-G")
+            self._compile_flags.append("-lineinfo")
         if src is not None:
             ptx = self.compile_cuda_ptx(src, compile_flags, name=program_name)
             c_ptx = ptx
-            #IF _OPTIX_VERSION > 70300:
-            #    self._check_payload_values(module_compile_options, pipeline_compile_options)
-
             optix_check_return(optixModuleCreateFromPTX(self.context.c_context,
                                      &module_compile_options.compile_options,
                                      &pipeline_compile_options.compile_options,
@@ -258,89 +261,86 @@ cdef class Module(OptixContextObject):
                                      NULL,
                                      NULL,
                                      &self.module))
-
     def __dealloc__(self):
         if <uintptr_t> self.module != 0:
             optix_check_return(optixModuleDestroy(self.module))
 
-    IF _OPTIX_VERSION > 70300:
-        @property
-        def compile_state(self):
-            cdef OptixModuleCompileState state
-            with nogil:
-                optix_check_return(optixModuleGetCompilationState(self.module, &state))
-            return ModuleCompileState(state)
-
-        # @staticmethod
-        # def _check_payload_values(ModuleCompileOptions module_compile_options, PipelineCompileOptions pipeline_compile_options):
-        #     IF _OPTIX_VERSION > 70300:
-        #         # check if the payload values match between the module and pipeline compile options
-        #         pipeline_payload_values = <unsigned int> pipeline_compile_options.compile_options.numPayloadValues
-        #         if module_compile_options.payload_types.size() > 0:
-        #             for i in range(module_compile_options.compile_options.numPayloadTypes):
-        #                 if pipeline_payload_values != module_compile_options.compile_options.payloadTypes[
-        #                     i].numPayloadValues:
-        #                     raise ValueError(
-        #                         f"number of payload values in module compile options at index {i} does not match the num_payload_values in the pipeline_compile_options.")
-        #     return
-
-        @classmethod
-        def create_as_task(cls,
-                             DeviceContext context,
-                             src,
-                             ModuleCompileOptions module_compile_options = ModuleCompileOptions(),
-                             PipelineCompileOptions pipeline_compile_options = PipelineCompileOptions(),
-                             compile_flags=_nvrtc_compile_flags_default,
-                             program_name=None):
-            """
-            Create a module associated with a parallel task.
-            The function will perform just enough work to instantiate the module.
-            Everything else will be done by the task on request.
-
-            Parameters
-            ----------
-            context: DeviceContext
-                The current OptiX context
-            src: str
-                Either a string containing the module's source code or PTX or the path to a file containing it.
-            module_compile_options: ModuleCompileOptions
-                Compile options of this module
-            pipeline_compile_options: PipelineCompileOptions
-                Compile options of the pipeline the module will be used in
-            compile_flags: list[str], optional
-                List of compiler flags to use. If omitted, the default flags are used.
-            program_name: str, optional
-                The name the program is given internally. Of omitted either the filename is used if given or a default name is used.
-
-            Returns
-            -------
-
-            module: Module
-                The created module
-            task: Task
-                The task associated with this module
-
-            """
-            cdef Module module = Module(context, None, compile_flags=compile_flags)
-            cdef const char * c_ptx
-            cdef unsigned int pipeline_payload_values, i
-            #cls._check_payload_values(module_compile_options, pipeline_compile_options)
-
-            ptx = cls.compile_cuda_ptx(src, compile_flags, name=program_name)
-            c_ptx = ptx
+    @property
+    def compile_state(self):
+        cdef OptixModuleCompileState state
+        with nogil:
+            optix_check_return(optixModuleGetCompilationState(self.module, &state))
+        return ModuleCompileState(state)
+
+    # @staticmethod
+    # def _check_payload_values(ModuleCompileOptions module_compile_options, PipelineCompileOptions pipeline_compile_options):
+    #     # check if the payload values match between the module and pipeline compile options
+    #     pipeline_payload_values = <unsigned int> pipeline_compile_options.compile_options.numPayloadValues
+    #     if module_compile_options.payload_types.size() > 0:
+    #         for i in range(module_compile_options.compile_options.numPayloadTypes):
+    #             if pipeline_payload_values != module_compile_options.compile_options.payloadTypes[
+    #                 i].numPayloadValues:
+    #                 raise ValueError(
+    #                     f"number of payload values in module compile options at index {i} does not match the num_payload_values in the pipeline_compile_options.")
+    #     return
+
+    @classmethod
+    def create_as_task(cls,
+                         DeviceContext context,
+                         src,
+                         ModuleCompileOptions module_compile_options = ModuleCompileOptions(),
+                         PipelineCompileOptions pipeline_compile_options = PipelineCompileOptions(),
+                         compile_flags=_nvrtc_compile_flags_default,
+                         program_name=None):
+        """
+        Create a module associated with a parallel task.
+        The function will perform just enough work to instantiate the module.
+        Everything else will be done by the task on request.
+
+        Parameters
+        ----------
+        context: DeviceContext
+            The current OptiX context
+        src: str
+            Either a string containing the module's source code or PTX or the path to a file containing it.
+        module_compile_options: ModuleCompileOptions
+            Compile options of this module
+        pipeline_compile_options: PipelineCompileOptions
+            Compile options of the pipeline the module will be used in
+        compile_flags: list[str], optional
+            List of compiler flags to use. If omitted, the default flags are used.
+        program_name: str, optional
+            The name the program is given internally. Of omitted either the filename is used if given or a default name is used.
 
-            cdef Task task = Task(module)
+        Returns
+        -------
+
+        module: Module
+            The created module
+        task: Task
+            The task associated with this module
+
+        """
+        cdef Module module = Module(context, None, compile_flags=compile_flags)
+        cdef const char * c_ptx
+        cdef unsigned int pipeline_payload_values, i
+        #cls._check_payload_values(module_compile_options, pipeline_compile_options)
 
-            optix_check_return(optixModuleCreateFromPTXWithTasks(context.c_context,
-                                                                 &module_compile_options.compile_options,
-                                                                 &pipeline_compile_options.compile_options,
-                                                                 c_ptx,
-                                                                 len(ptx) + 1,
-                                                                 NULL,
-                                                                 NULL,
-                                                                 &module.module,
-                                                                 &task.task))
-            return module, task
+        ptx = module.compile_cuda_ptx(src, compile_flags, name=program_name)
+        c_ptx = ptx
+
+        cdef Task task = Task(module)
+
+        optix_check_return(optixModuleCreateFromPTXWithTasks(context.c_context,
+                                                             &module_compile_options.compile_options,
+                                                             &pipeline_compile_options.compile_options,
+                                                             c_ptx,
+                                                             len(ptx) + 1,
+                                                             NULL,
+                                                             NULL,
+                                                             &module.module,
+                                                             &task.task))
+        return module, task
 
 
     @classmethod
@@ -370,8 +370,7 @@ cdef class Module(OptixContextObject):
         """
         cdef Module module = cls(context, None)
 
-        IF _OPTIX_VERSION > 70300:
-            cls._check_payload_values(module_compile_options, pipeline_compile_options)
+        #cls._check_payload_values(module_compile_options, pipeline_compile_options)
         optix_check_return(optixBuiltinISModuleGet(context.c_context,
                                                    &module_compile_options.compile_options,
                                                    &pipeline_compile_options.compile_options,
@@ -379,26 +378,63 @@ cdef class Module(OptixContextObject):
         return module
 
     @staticmethod
-    def compile_cuda_ptx(src, compile_flags=_nvrtc_compile_flags_default, name=None, **kwargs):
-        if os.path.exists(src):
-            name = src
+    def get_default_nvrtc_compile_flags(std=None, rdc=False):
+        return get_default_nvrtc_compile_flags(std, rdc)
+
+
+    def compile_cuda_ptx(self, src, compile_flags=_nvrtc_compile_flags_default, name=None, **kwargs):
+        """
+        Compiles a valid source module into the ptx format. Accepts files containing either source code, ptx, or
+        optix-ir code, compiles the source code if necessary and returns valid ptx or optix-ir modules.
+
+        Parameters
+        ----------
+        src: A string containing either the file name of the module to be compiled or the CUDA source code directly.
+        compile_flags: The flags used for the call to the nvptx compiler. If src is compiled already, this is ignored.
+        name: The name of the compiled module. If src is not an inline string, this is ignored and the file name is used
+        kwargs: Additional kwargs passed to the NVRTC compiler. See the _NVRTCProgram in the cupy package for details.
+
+        Returns
+        -------
+        ptx: A compiled ptx string or a string in optix-ir format.
+        """
+        compiled = False
+        if os.path.isfile(src):
+            # if src is a file
+            name, ext = os.path.splitext(src)
+            if ext == '.ptx' or ext == '.optixir':
+                # if the file points to a compiled module (either in ptx or in optixir format) just return it's contents
+                compiled = True
+            # read the file contents
             with open(src, 'r') as f:
                 src = f.read()
-        if _is_ptx(src):
-            return src
+        elif _is_ptx(src):
+            # if the source is in ptx format already (e.g. as an inline string) just return it
+            compiled = True
 
-        elif name is None:
+        if name is None:
             name = "default_program"
 
-        # TODO is there a public API for that?
-        from cupy.cuda.compiler import _NVRTCProgram as NVRTCProgram
-        prog = NVRTCProgram(src, name, **kwargs)
-        flags = list(compile_flags)
-        # get cuda and optix_include_paths
-        cuda_include_path = get_cuda_include_path()
-        optix_include_path = get_optix_include_path()
-
-        flags.extend([f'-I{cuda_include_path}', f'-I{optix_include_path}'])
-        ptx, _ = prog.compile(flags)
-        return ptx
+        if not compiled:
+            # TODO is there a public API for that?
+            from cupy.cuda.compiler import _NVRTCProgram as NVRTCProgram
+            prog = NVRTCProgram(src, name, **kwargs)
+            flags = list(compile_flags)
+            # get cuda and optix_include_paths
+            cuda_include_path = get_cuda_include_path()
+            optix_include_path = get_local_optix_include_path()
+            if optix_include_path is None or not os.path.exists(optix_include_path):
+                # attempt to load the global path if the local path is not available
+                optix_include_path = get_optix_include_path()
+            if optix_include_path is None:
+                raise ValueError("Unable to locate the optix headers. Make sure that either the OPTIX_PATH environement variable is set"
+                                 "correctly or the optix headers are embedded into this package.")
+            if <Module>self.context.log_callback is not None:
+                # hook into the logging system for this output
+                <Module>self.context.log_callback(4, "build", f"Using optix include path: {optix_include_path}")
+            flags.extend([f'-I{cuda_include_path}', f'-I{optix_include_path}'])
+            ptx, _ = prog.compile(flags)
+            return ptx
+        else:
+            return src
 
diff --git a/optix/optix_includes.h b/optix/optix_includes.h
index 9750de6..c8589b5 100644
--- a/optix/optix_includes.h
+++ b/optix/optix_includes.h
@@ -1,5 +1,9 @@
 #pragma once
 
+#if defined(_MSC_VER)
+#define NOMINMAX
+#endif
+
 #include <optix.h>
 #include <optix_stubs.h>
 #include <optix_function_table_definition.h>
@@ -13,4 +17,4 @@ inline void optix_check_return(OptixResult result) {
             ss << ": " << optixGetErrorString(result);
             throw std::runtime_error(ss.str());
         }
-}
\ No newline at end of file
+}
diff --git a/optix/path_utility.py b/optix/path_utility.py
index 756a756..32f9c20 100644
--- a/optix/path_utility.py
+++ b/optix/path_utility.py
@@ -25,6 +25,7 @@
 
 import os
 from itertools import chain
+import pathlib
 
 _cuda_path_cache = 'NOT_INITIALIZED'
 _optix_path_cache = 'NOT_INITIALIZED'
@@ -38,7 +39,9 @@ def get_path(key):
         return tuple()
 
 
-def search_on_path(filenames, keys=('PATH',)):
+def search_on_path(filenames, keys=None):
+    if keys is None:
+        keys = ('PATH',)
     for p in chain(*[get_path(key) for key in keys]):
         for filename in filenames:
             full = os.path.abspath(os.path.join(p, filename))
@@ -47,7 +50,7 @@ def search_on_path(filenames, keys=('PATH',)):
     return None
 
 
-def get_cuda_path(environment_variable='CUDA_ROOT'):
+def get_cuda_path(environment_variable=None):
     global _cuda_path_cache
 
     # Use a magic word to represent the cache not filled because None is a
@@ -55,7 +58,8 @@ def get_cuda_path(environment_variable='CUDA_ROOT'):
     if _cuda_path_cache != 'NOT_INITIALIZED':
         return _cuda_path_cache
 
-    nvcc_path = search_on_path(('nvcc', 'nvcc.exe'), keys=(environment_variable, 'PATH'))
+    nvcc_path = search_on_path(('nvcc', 'nvcc.exe'), keys=(environment_variable, 'PATH') if environment_variable is not
+                                                                                            None else ('PATH',))
     cuda_path_default = None
     if nvcc_path is not None:
         cuda_path_default = os.path.normpath(
@@ -70,7 +74,7 @@ def get_cuda_path(environment_variable='CUDA_ROOT'):
     return _cuda_path_cache
 
 
-def get_cuda_include_path(environment_variable='CUDA_ROOT'):
+def get_cuda_include_path(environment_variable=None):
     cuda_path = get_cuda_path(environment_variable=environment_variable)
     if cuda_path is None:
         return None
@@ -81,7 +85,7 @@ def get_cuda_include_path(environment_variable='CUDA_ROOT'):
         return None
 
 
-def get_optix_path(environment_variable='OPTIX_PATH'):
+def get_optix_path(path_hint=None, environment_variable=None):
     global _optix_path_cache
 
     # Use a magic word to represent the cache not filled because None is a
@@ -89,14 +93,20 @@ def get_optix_path(environment_variable='OPTIX_PATH'):
     if _optix_path_cache != 'NOT_INITIALIZED':
         return _optix_path_cache
 
-    # prefer the dedicated environment variable
-    optix_header_path = search_on_path(('include/optix.h',), keys=(environment_variable,))
-    if optix_header_path is None:
-        # search on the default path
-        optix_header_path = search_on_path(('../optix/include/optix.h',), keys=('PATH',))
+    if path_hint is None:
+        # prefer the dedicated environment variable
+        optix_header_path = search_on_path(('include/optix.h',), keys=(environment_variable,) if environment_variable is not
+                                                                                                 None else None)
+        if optix_header_path is None:
+            # search on the default path
+            optix_header_path = search_on_path(('../optix/include/optix.h',), keys=('PATH', 'OPTIX_PATH'))
 
-    if optix_header_path is not None:
-        optix_header_path = os.path.normpath(os.path.join(os.path.dirname(optix_header_path), '..'))
+        if optix_header_path is not None:
+            optix_header_path = os.path.normpath(os.path.join(os.path.dirname(optix_header_path), '..'))
+    else:
+        optix_header_path = path_hint
+        if not os.path.exists(os.path.join(optix_header_path, "include/optix.h")):
+            raise ValueError(f"Path {optix_header_path} does not contain an optix installation.")
 
     if optix_header_path is not None:
         _optix_path_cache = optix_header_path
@@ -106,7 +116,12 @@ def get_optix_path(environment_variable='OPTIX_PATH'):
     return _optix_path_cache
 
 
-def get_optix_include_path(environment_variable='OPTIX_PATH'):
+def get_local_optix_include_path():
+    local_include_path = pathlib.Path(__file__).parent / "include"
+    return str(local_include_path) if local_include_path.exists() else None
+
+
+def get_optix_include_path(environment_variable=None):
     optix_path = get_optix_path(environment_variable=environment_variable)
     if optix_path is None:
         return None
@@ -115,3 +130,4 @@ def get_optix_include_path(environment_variable='OPTIX_PATH'):
         return optix_include_path
     else:
         return None
+
diff --git a/optix/pipeline.pxd b/optix/pipeline.pxd
index 0806015..5e072a2 100644
--- a/optix/pipeline.pxd
+++ b/optix/pipeline.pxd
@@ -23,35 +23,22 @@ cdef extern from "optix_includes.h" nogil:
         OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS,
         OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING
 
+    cdef enum OptixPrimitiveTypeFlags:
+        OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM,
+        OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE,
+        OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE,
+        OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR,
+        OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM,
+        OPTIX_PRIMITIVE_TYPE_FLAGS_SPHERE,
+        OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE,
 
-    IF _OPTIX_VERSION > 70300:  # switch to new instance flags
-        cdef enum OptixCompileDebugLevel:
-            OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT,
-            OPTIX_COMPILE_DEBUG_LEVEL_NONE,
-            OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL,
-            OPTIX_COMPILE_DEBUG_LEVEL_MODERATE,
-            OPTIX_COMPILE_DEBUG_LEVEL_FULL
-
-        cdef enum OptixPrimitiveTypeFlags:
-            OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE,
-    ELSE:
-        cdef enum OptixCompileDebugLevel:
-            OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT,
-            OPTIX_COMPILE_DEBUG_LEVEL_NONE,
-            OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO,
-            OPTIX_COMPILE_DEBUG_LEVEL_FULL
-
-        cdef enum OptixPrimitiveTypeFlags:
-            OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR,
-            OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE,
+
+    cdef enum OptixCompileDebugLevel:
+        OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT,
+        OPTIX_COMPILE_DEBUG_LEVEL_NONE,
+        OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL,
+        OPTIX_COMPILE_DEBUG_LEVEL_MODERATE,
+        OPTIX_COMPILE_DEBUG_LEVEL_FULL
 
 
     cdef struct OptixPipelineCompileOptions:
@@ -62,6 +49,7 @@ cdef extern from "optix_includes.h" nogil:
         unsigned int exceptionFlags
         const char * pipelineLaunchParamsVariableName
         unsigned int usesPrimitiveTypeFlags
+        int allowOpacityMicromaps
 
 
     cdef struct OptixPipelineLinkOptions:
diff --git a/optix/pipeline.pyx b/optix/pipeline.pyx
index c471b0d..3a30a93 100644
--- a/optix/pipeline.pyx
+++ b/optix/pipeline.pyx
@@ -15,6 +15,9 @@ from .shader_binding_table cimport ShaderBindingTable
 
 optix_init()
 
+__all__ = ['CompileDebugLevel', 'ExceptionFlags', 'TraversableGraphFlags', 'PrimitiveTypeFlags',
+           'PipelineCompileOptions', 'PipelineLinkOptions', 'Pipeline']
+
 
 class ExceptionFlags(IntFlag):
     """
@@ -36,48 +39,29 @@ class TraversableGraphFlags(IntFlag):
     ALLOW_SINGLE_LEVEL_INSTANCING = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING
 
 
-IF _OPTIX_VERSION > 70300:  # switch to new instance flags
-    class CompileDebugLevel(IntEnum):
-        """
-        Wraps the OptixCompileDebugLevel enum.
-        """
-        DEFAULT = OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT,
-        NONE = OPTIX_COMPILE_DEBUG_LEVEL_NONE,
-        MINIMAL = OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL,
-        MODERATE = OPTIX_COMPILE_DEBUG_LEVEL_MODERATE,
-        FULL = OPTIX_COMPILE_DEBUG_LEVEL_FULL
+class PrimitiveTypeFlags(IntFlag):
+    """
+    Wraps the OptixPrimitiveTypeFlags enum.
+    """
+    DEFAULT = 0, # corresponds to CUSTOM | TRIANGLE
+    CUSTOM = OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM,
+    ROUND_QUADRATIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE,
+    ROUND_CUBIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE,
+    ROUND_LINEAR = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR
+    ROUND_CATMULLROM = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM
+    SPHERE = OPTIX_PRIMITIVE_TYPE_FLAGS_SPHERE
+    TRIANGLE = <unsigned int>OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE # fixes negative number error
 
-    class PrimitiveTypeFlags(IntFlag):
-        """
-        Wraps the OptixPrimitiveTypeFlags enum.
-        """
-        DEFAULT = 0, # corresponds to CUSTOM | TRIANGLE
-        CUSTOM = OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM,
-        ROUND_QUADRATIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE,
-        ROUND_CUBIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE,
-        ROUND_LINEAR = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR,
-        ROUND_CATMULLROM = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM,
-        TRIANGLE = <unsigned int>OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE # fixes negative number error
-ELSE:
-    class CompileDebugLevel(IntEnum):
-        """
-        Wraps the OptixCompileDebugLevel enum.
-        """
-        DEFAULT = OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT,
-        NONE = OPTIX_COMPILE_DEBUG_LEVEL_NONE,
-        LINEINFO = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO,
-        FULL = OPTIX_COMPILE_DEBUG_LEVEL_FULL
 
-    class PrimitiveTypeFlags(IntFlag):
-        """
-        Wraps the OptixPrimitiveTypeFlags enum.
-        """
-        DEFAULT = 0, # corresponds to CUSTOM | TRIANGLE
-        CUSTOM = OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM,
-        ROUND_QUADRATIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_QUADRATIC_BSPLINE,
-        ROUND_CUBIC_BSPLINE = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE,
-        ROUND_LINEAR = OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_LINEAR,
-        TRIANGLE = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE
+class CompileDebugLevel(IntEnum):
+    """
+    Wraps the OptixCompileDebugLevel enum.
+    """
+    DEFAULT = OPTIX_COMPILE_DEBUG_LEVEL_DEFAULT,
+    NONE = OPTIX_COMPILE_DEBUG_LEVEL_NONE,
+    MINIMAL = OPTIX_COMPILE_DEBUG_LEVEL_MINIMAL,
+    MODERATE = OPTIX_COMPILE_DEBUG_LEVEL_MODERATE,
+    FULL = OPTIX_COMPILE_DEBUG_LEVEL_FULL
 
 
 cdef class PipelineCompileOptions(OptixObject):
@@ -93,7 +77,8 @@ cdef class PipelineCompileOptions(OptixObject):
                  num_attribute_values = 0,
                  exception_flags = ExceptionFlags.NONE,
                  pipeline_launch_params_variable_name = "params",
-                 uses_primitive_type_flags = PrimitiveTypeFlags.DEFAULT):
+                 uses_primitive_type_flags = PrimitiveTypeFlags.DEFAULT,
+                 allow_opacity_micromaps=False):
         self.uses_motion_blur = uses_motion_blur
         self.traversable_graph_flags = traversable_graph_flags
         self.num_payload_values = num_payload_values
@@ -101,6 +86,7 @@ cdef class PipelineCompileOptions(OptixObject):
         self.exception_flags = exception_flags
         self.pipeline_launch_params_variable_name = pipeline_launch_params_variable_name
         self.uses_primitive_type_flags = uses_primitive_type_flags
+        self.allow_opacity_micromaps = allow_opacity_micromaps
 
     @property
     def uses_motion_blur(self):
@@ -161,6 +147,14 @@ cdef class PipelineCompileOptions(OptixObject):
     def uses_primitive_type_flags(self, flags):
         self.compile_options.usesPrimitiveTypeFlags = flags.value
 
+    @property
+    def allow_opacity_micromaps(self):
+        return self.compile_options.allowOpacityMicromaps
+
+    @allow_opacity_micromaps.setter
+    def allow_opacity_micromaps(self, allow):
+        self.compile_options.allowOpacityMicromaps = allow
+
     @property
     def c_obj(self):
         return <size_t>&self.compile_options
diff --git a/optix/program_group.pyx b/optix/program_group.pyx
index 2d95c24..09523fc 100644
--- a/optix/program_group.pyx
+++ b/optix/program_group.pyx
@@ -7,6 +7,8 @@ from libc.string cimport memset
 from enum import IntEnum
 optix_init()
 
+__all__ = ['ProgramGroup', 'ProgramGroupKind']
+
 class ProgramGroupKind(IntEnum):
     """
     Wraps the OptixProgramGroupKind enum
diff --git a/optix/shader_binding_table.pyx b/optix/shader_binding_table.pyx
index 4577339..8d6cf35 100644
--- a/optix/shader_binding_table.pyx
+++ b/optix/shader_binding_table.pyx
@@ -8,6 +8,9 @@ import numpy as np
 
 optix_init()
 
+__all__ = ['ShaderBindingTable']
+
+
 cdef class ShaderBindingTable(OptixObject):
     """
     Represents a ShaderBindingTable, containing data used by the various programs in the OptiX Pipeline.
diff --git a/optix/struct.pyx b/optix/struct.pyx
index 7f1e53f..3c15488 100644
--- a/optix/struct.pyx
+++ b/optix/struct.pyx
@@ -10,6 +10,8 @@ from collections.abc import Mapping
 
 optix_init()
 
+__all__ = ['SbtRecord', 'LaunchParamsRecord']
+
 def  _aligned_itemsize( formats, alignment ):
     names = []
     for i in range( len(formats ) ):
@@ -18,8 +20,7 @@ def  _aligned_itemsize( formats, alignment ):
     temp_dtype = np.dtype( {
         'names'   : names,
         'formats' : formats,
-        'align'   : True
-        } )
+        }, align=True)
     return round_up( temp_dtype.itemsize, alignment )
 
 def array_to_device_memory(numpy_array, stream=None):
@@ -175,9 +176,8 @@ cdef class _StructHelper:
             'names': names,
             'formats': formats,
             'itemsize': itemsize,
-            'align': True
-        })
-
+        }, align=True)
+        assert dtype.isalignedstruct
         return dtype
 
     def _prepare_array(self, array):
diff --git a/pyproject.toml b/pyproject.toml
index c5ff92b..20f0314 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,3 @@
 [build-system]
-requires = ["setuptools", "wheel", "Cython>=0.29.22,<3"]
+requires = ["setuptools", "wheel", "Cython>=0.29.22,<3", "numpy"]
 build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2658567..f53caf9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,12 @@
 from setuptools import setup, Extension, find_packages
 from Cython.Build import cythonize
 import re
+import os
 from pathlib import Path
+import shutil
+import numpy
 
+OPTIX_COMPATIBLE_VERSION = (7, 6)
 
 # standalone import of a module (https://stackoverflow.com/a/58423785)
 def import_module_from_path(path):
@@ -26,10 +30,16 @@ def import_module_from_path(path):
 
 
 util = import_module_from_path('optix/path_utility.py')
-cuda_include_path = util.get_cuda_include_path()
-optix_include_path = util.get_optix_include_path()
-if cuda_include_path is None or optix_include_path is None:
-    raise RuntimeError("Cuda or optix not found in the system")
+cuda_include_path = util.get_cuda_include_path(environment_variable='CUDA_PATH')
+optix_include_path = util.get_optix_include_path(environment_variable='OPTIX_PATH')
+print("Found cuda includes at", cuda_include_path)
+print("Found optix includes at", optix_include_path)
+if cuda_include_path is None:
+    raise RuntimeError("CUDA not found in the system, but is required to build this package. Consider setting"
+                       "CUDA_PATH to the location of the local cuda toolkit installation.")
+if optix_include_path is None:
+    raise RuntimeError("OptiX not found in the system, but is required to build this package. Consider setting "
+                       "OPTIX_PATH to the location of the optix SDK.")
 
 optix_version_re = re.compile(r'.*OPTIX_VERSION +(\d{5})')  # get the optix version from the header
 with open(Path(optix_include_path) / "optix.h", 'r') as f:
@@ -40,25 +50,87 @@ def import_module_from_path(path):
 optix_version_minor = (optix_version % 10000) // 100
 optix_version_micro = optix_version % 100
 
-print(f"Found OptiX version {optix_version_major}.{optix_version_minor}.{optix_version_micro}.")
+if (optix_version_major, optix_version_minor) != OPTIX_COMPATIBLE_VERSION:
+    raise ValueError(f"Found unsupported optix version {optix_version_major}.{optix_version_minor}.{optix_version_micro}. This package"
+                     f"requires an optix version of {OPTIX_COMPATIBLE_VERSION[0]}.{OPTIX_COMPATIBLE_VERSION[1]}.x.")
 
 cython_compile_env = {
-    '_OPTIX_VERSION': optix_version,
     '_OPTIX_VERSION_MAJOR': optix_version_major,
     '_OPTIX_VERSION_MINOR': optix_version_minor,
     '_OPTIX_VERSION_MICRO': optix_version_micro
 }
 
+libraries=[]
+if os.name == 'nt':
+    # OptiX uses some Windows Registry API(e.g. RegCloseKey)
+    libraries.append('advapi32')
+
 extensions = [Extension("*", ["optix/*.pyx"],
-                        include_dirs=[cuda_include_path, optix_include_path])]
+                        include_dirs=[cuda_include_path, optix_include_path, numpy.get_include()], libraries=libraries)]
 extensions = cythonize(extensions, language_level="3",
-                        compile_time_env=cython_compile_env, build_dir="build")
+                        compile_time_env=cython_compile_env, build_dir="build", annotate=True)
 
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
 version = import_module_from_path('optix/_version.py').__version__
 
+package_data = {}
+
+
+def glob_fix(package_name, glob):
+    # this assumes setup.py lives in the folder that contains the package
+    package_path = Path(f'./{package_name}').resolve()
+    return [str(path.relative_to(package_path))
+            for path in package_path.glob(glob)]
+
+from setuptools.command.install import install as _install
+from setuptools.command.develop import develop as _develop
+
+class EmbeddHeadersCommandMixin:
+    def update_package_data(self):
+        self.distribution.package_data.update({
+            'optix': [*glob_fix('optix', 'include/**/*')]
+        })
+        print("embedding optix headers into package data",
+              self.distribution.package_data)
+
+    def run(self):
+        embedd = os.getenv("OPTIX_EMBED_HEADERS")
+        if embedd:
+            # create the path for the internal headers
+            # due to optix license restrictions those headers
+            # cannot be distributed on pypi directly so we will add this headers dynamically
+            # upon wheel construction to install them alongside the package
+
+            if not os.path.exists('optix/include/optix.h'):
+                shutil.copytree(optix_include_path, 'optix/include')
+
+            self.update_package_data()
+
+        super().run()
+
+
+class CustomInstallCommand(EmbeddHeadersCommandMixin, _install):
+    pass
+
+
+class CustomDevelopCommand(EmbeddHeadersCommandMixin, _develop):
+    pass
+
+
+cmd_classes = {'install': CustomInstallCommand,
+               'develop': CustomDevelopCommand}
+
+try:
+    from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
+
+    class CustomBdistWheelCommand(EmbeddHeadersCommandMixin, _bdist_wheel):
+        pass
+    cmd_classes['bdist_wheel'] = CustomBdistWheelCommand
+except ImportError:
+    CustomBdistWheel = None
+
 setup(
     name="python-optix",
     version=version,
@@ -81,8 +153,10 @@ def import_module_from_path(path):
     classifiers=[
         "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
         "License :: OSI Approved :: MIT License",
         "Operating System :: POSIX :: Linux",
+        "Operating System :: Microsoft :: Windows",
         "Environment :: GPU :: NVIDIA CUDA",
         "Development Status :: 4 - Beta",
         "Intended Audience :: Science/Research",
@@ -94,5 +168,7 @@ def import_module_from_path(path):
         'examples': ["pillow", "pyopengl", "pyglfw", "pyimgui"]
     },
     python_requires=">=3.8",
-    zip_safe=False
+    package_data=package_data,
+    zip_safe=False,
+    cmdclass=cmd_classes
 )