diff --git a/mlir/cuda-tile/.clang-format b/mlir/cuda-tile/.clang-format new file mode 100644 index 0000000..34ac64a --- /dev/null +++ b/mlir/cuda-tile/.clang-format @@ -0,0 +1,5 @@ +BasedOnStyle: LLVM +LineEnding: LF +IndentWidth: 2 +TabWidth: 2 +UseTab: Never diff --git a/mlir/cuda-tile/.devcontainer/Dockerfile b/mlir/cuda-tile/.devcontainer/Dockerfile new file mode 100644 index 0000000..5ffc399 --- /dev/null +++ b/mlir/cuda-tile/.devcontainer/Dockerfile @@ -0,0 +1,41 @@ +FROM alwaysproblem/fastdev-u2204:nv13.1.0 + +ARG UID=1000 +ARG GID=1000 + +RUN echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" > /etc/apt/sources.list.d/llvm.list \ + && echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "# 20" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "# 21" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-21 main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-21 main" >> /etc/apt/sources.list.d/llvm.list \ + && wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc \ + && apt update -y && \ + apt install -y \ + python3 python3-dev python3-setuptools python3-pip \ + libtinfo-dev zlib1g-dev \ + build-essential cmake ninja-build \ + clang-20 clang-tidy-20 clangd-20 cmake-format \ + clang-format-20 lldb-20 lld-20 libfmt-dev libspdlog-dev \ + && apt clean -y && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/clang clang /usr/bin/clang-20 100 \ + && update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-20 100 \ + && update-alternatives --install /usr/bin/clangd clangd 
/usr/bin/clangd-20 100 \ + && update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-20 100 \ + && update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-20 100 \ + && update-alternatives --install /usr/bin/lld lld /usr/bin/lld-20 100 \ + && update-alternatives --install /usr/bin/lldb lldb /usr/bin/lldb-20 100 + +RUN apt update -y && apt install -yq software-properties-common \ + && add-apt-repository -y ppa:ubuntu-toolchain-r/test \ + && apt update -yq \ + && apt install -yq gcc-13 g++-13 gdb \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 100 \ + && apt clean -y && rm -rf /var/lib/apt/lists/* + +RUN git config --global --add safe.directory '*' && \ + /root/.local/bin/setup_new_user ${UID} ${GID} && \ + python3 -m pip install pre-commit compdb diff --git a/mlir/cuda-tile/.devcontainer/devcontainer.json b/mlir/cuda-tile/.devcontainer/devcontainer.json new file mode 100644 index 0000000..927324f --- /dev/null +++ b/mlir/cuda-tile/.devcontainer/devcontainer.json @@ -0,0 +1,77 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda +{ + "remoteUser": "root", + "name": "mlir-example", + "workspaceMount": "source=${localWorkspaceFolder},target=${localWorkspaceFolder}/../../../MLcompiler-tutorial/mlir/${localWorkspaceFolderBasename},type=bind", + "workspaceFolder": "/root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/${localWorkspaceFolderBasename}", + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "Dockerfile", + "options": [ + "--network=host" + ], + "args": { + "UID": "1000", + "GID": "1000" + } + }, + // Features to add to the dev container. More info: https://containers.dev/features. 
+ // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "python --version", + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" + "privileged": true, + // "capAdd": ["SYS_PTRACE"], + "mounts": [ + { + "source": "${localWorkspaceFolder}/../../../", + "target": "/root/Desktop/dockerVolumn", + "type": "bind" + } + ], + "runArgs": [ + // "--cap-add=SYS_PTRACE", + // "--security-opt", + // "seccomp=unconfined", + "--name", + // "${localEnv:USER}-tvm", + "yyx-cuda-tile", + // "-v", + // "/data/rech/yongxiy/Desktop/dockerVolumn:/root/Desktop/dockerVolumn" + ], + "customizations": { + "vscode": { + "extensions": [ + "jeff-hykin.better-cpp-syntax", + "aaron-bond.better-comments", + "ms-vscode.cpptools-themes", + "revng.llvm-ir", + "jakob-erzar.llvm-tablegen", + "MomenAbdelkarim-WyattCalandro-LuisPrieto.mlir", + "ms-vscode.cpptools", + "ms-vscode.cpptools-extension-pack", + "twxs.cmake", + "josetr.cmake-language-support-vscode", + "ms-vscode.cmake-tools", + "cheshirekow.cmake-format", + "yzhang.markdown-all-in-one", + "bierner.markdown-preview-github-styles", + "bierner.markdown-mermaid", + "DavidAnson.vscode-markdownlint", + "llvm-vs-code-extensions.vscode-mlir", + "llvm-vs-code-extensions.vscode-clangd", + "llvm-vs-code-extensions.lldb-dap", + "mutantdino.resourcemonitor", + "hoovercj.vscode-power-mode", + "GitHub.copilot-chat", + "Codereviewforgithubcopilot.github-copilot-code-review" + ] + } + } +} diff --git a/mlir/cuda-tile/.devcontainer/noop.txt b/mlir/cuda-tile/.devcontainer/noop.txt new file mode 100644 index 0000000..49de88d --- /dev/null +++ b/mlir/cuda-tile/.devcontainer/noop.txt @@ -0,0 +1,3 @@ +This file copied into the container along with 
environment.yml* from the parent +folder. This file is included to prevent the Dockerfile COPY instruction from +failing if no environment.yml is found. diff --git a/mlir/cuda-tile/.envsetup.sh b/mlir/cuda-tile/.envsetup.sh new file mode 100644 index 0000000..b4f37a3 --- /dev/null +++ b/mlir/cuda-tile/.envsetup.sh @@ -0,0 +1 @@ +source /root/miniconda3/etc/profile.d/conda.sh && conda activate mlir diff --git a/mlir/cuda-tile/.gitignore b/mlir/cuda-tile/.gitignore new file mode 100644 index 0000000..288c272 --- /dev/null +++ b/mlir/cuda-tile/.gitignore @@ -0,0 +1,3 @@ +*.ptx +*.cubin +*.fatbin diff --git a/mlir/cuda-tile/.pre-commit-config.yaml b/mlir/cuda-tile/.pre-commit-config.yaml new file mode 100644 index 0000000..5736549 --- /dev/null +++ b/mlir/cuda-tile/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: check-yaml + - id: trailing-whitespace + - id: end-of-file-fixer + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: 'v14.0.6' + hooks: + - id: clang-format + types_or: [c++, c] + +- repo: https://github.com/cheshirekow/cmake-format-precommit + rev: v0.6.10 + hooks: + - id: cmake-format + - id: cmake-lint diff --git a/mlir/cuda-tile/CMakeLists.txt b/mlir/cuda-tile/CMakeLists.txt new file mode 100644 index 0000000..9eb4fb1 --- /dev/null +++ b/mlir/cuda-tile/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.10) + +# note: fix ztd terminfo not found +project(cuda-tile LANGUAGES C CXX) + +# ############## For conda users.################################ +find_package(LLVM CONFIG REQUIRED) +find_package(MLIR CONFIG REQUIRED) +# set(MLIR_TABLEGEN_EXE /root/anaconda3/envs/mlir/bin/mlir-tblgen) +# ############################################################################## + +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") +message(STATUS "Found MLIR ${MLIR_PACKAGE_VERSION}")
+message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") +message(STATUS "Found MLIRTableGen: ${MLIR_TABLEGEN_EXE}") +message(STATUS "LLVM_INCLUDE_DIR include dir: ${LLVM_INCLUDE_DIR}") +message(STATUS "MLIR_INCLUDE_DIR include dir: ${MLIR_INCLUDE_DIR}") + +# This is for non-conda users. +find_package(LLVM CONFIG PATHS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib/cmake/llvm) +find_package(MLIR CONFIG PATHS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib/cmake/mlir) +find_package(CUDAToolkit REQUIRED) +# set(MLIR_TABLEGEN_EXE ${CMAKE_CURRENT_SOURCE_DIR}/third_party/bin/mlir-tblgen) +message(STATUS "CUDA Toolkit found: ${CUDAToolkit_INCLUDE_DIRS}") +message(STATUS "CUDA_TILE_SOURCE_DIR include dir: ${CUDA_TILE_SOURCE_DIR}") +message(STATUS "CUDA_TILE_BINARY_DIR include dir: ${CUDA_TILE_BINARY_DIR}") + +include_directories(${LLVM_INCLUDE_DIR}) +include_directories(${MLIR_INCLUDE_DIR}) +include_directories(${CUDAToolkit_INCLUDE_DIRS}) +include_directories(${CUDA_TILE_SOURCE_DIR}/include) +include_directories(${CUDA_TILE_BINARY_DIR}/include) + +include(LLVMDistributionSupport) +include(TableGen) +include(AddMLIR) +include(AddLLVM) +# include(HandleLLVMOptions) + +# note: fix the llvm::cl undefined reference problem +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -fno-rtti") +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + +add_subdirectory(Toy) diff --git a/mlir/cuda-tile/README.md b/mlir/cuda-tile/README.md new file mode 100644 index 0000000..7ca5b87 --- /dev/null +++ b/mlir/cuda-tile/README.md @@ -0,0 +1,569 @@ +# Standalone environment for MLIR tutorial. + +**NB: The code of this tutorial is from the [mlir-Toy-Example-tutorial](https://mlir.llvm.org/docs/Tutorials/Toy/Ch-1/) and [mlir-transform-tutorial](https://mlir.llvm.org/docs/Tutorials/transform/). +This repo only provide a simple way to setting up the environment. 
The toy file used in mlir-example all be in [example directory](../example/) and `Ch1-Ch7` is the Toy tutorial example code `Ch8` is an naive example to add `toy.matmul` operation and `transform_Ch2-H` is for transform dialect tutorials** + +## Environment Setup + +### Environment Preparation with conda (Optional) + +- OS must be higher than ubuntu 22.04. +- install gcc-13 and g++-13 + +```bash +apt update -y && \ +apt install -yq gcc-13 g++-13 +# apt install -yq software-properties-common \ +# add-apt-repository -y ppa:ubuntu-toolchain-r/test \ +# apt update -y +# apt install -yq gcc-11 g++-11 +update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 20 +update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 20 +``` + +- install cmake and ninja you can choose one way you like. conda is best for me. + +```bash +conda create -n mlir -y +conda activate mlir +# conda install cmake ninja clang-format clang lld ncurses mlir llvm -c conda-forge +conda install cmake ninja clang-format clang clang-tools mlir zlib spdlog fmt lit llvm=19.* -c conda-forge -y +# create -n mlir cmake ninja clang-format clang mlir zlib spdlog fmt lit llvm -c conda-forge -y +``` + +- build example with conda + +```bash +cd example +bash build_with_conda.sh all +``` + +### Environment Preparation with dev containers + +Please choose the `Dev Containers: Open Folder in Container...` + +- build example with dev containers + +```bash +cd example +bash scripts/sync_deps.sh +bash scripts/build_deps.sh +bash scripts/build_cuda_tile.sh +bash build.sh all +``` + +## Configure the Clangd + +```bash +cd example +# after you configure the project with cmake, you can configure the clangd by run the following command +compdb -p build list > compile_commands.json +``` + +## Run These code and understand mlir + +### Toy Examples + +- Ch1 + +```bash +$./build/Ch1/mlir-example-ch1 Ch1/example.toy -emit=ast +# Module: +# Function +# Proto 'main' @Ch1/example.toy:1:1 +# Params: [] +# Block { +# VarDecl 
a<> @Ch1/example.toy:4:3 +# Literal: <2, 3>[ <3>[ 1.000000e+00, 2.000000e+00, 3.000000e+00], <3>[ 4.000000e+00, 5.000000e+00, 6.000000e+00]] @Ch1/example.toy:4:11 +# VarDecl b<2, 3> @Ch1/example.toy:8:3 +# Literal: <6>[ 1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00] @Ch1/example.toy:8:17 +# Print [ @Ch1/example.toy:12:3 +# BinOp: * @Ch1/example.toy:12:24 +# Call 'transpose' [ @Ch1/example.toy:12:9 +# var: a @Ch1/example.toy:12:19 +# ] +# Call 'transpose' [ @Ch1/example.toy:12:24 +# var: b @Ch1/example.toy:12:34 +# ] +# ] +# } // Block +``` + +- Ch2 + +```bash +$./build/Ch2/mlir-example-ch2 Ch2/codegen.toy -emit=mlir +# module { +# toy.func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.mul %0, %1 : tensor<*xf64> +# toy.return %2 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> +# %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# toy.print %5 : tensor<*xf64> +# toy.return +# } +# } +``` + +- Ch3 + +```bash +$./build/Ch3/mlir-example-ch3 Ch3/opt.toy -emit=mlir +# module { +# toy.func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%0 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.transpose(%1 : tensor<*xf64>) to 
tensor<*xf64> +# %3 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.return %4 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> +# %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %6 = toy.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64> +# %7 = toy.reshape(%6 : tensor<2xf64>) to tensor<2x1xf64> +# %8 = toy.reshape(%7 : tensor<2x1xf64>) to tensor<2x1xf64> +# %9 = toy.reshape(%8 : tensor<2x1xf64>) to tensor<2x1xf64> +# toy.print %5 : tensor<*xf64> +# toy.return +# } +# } +$./build/Ch3/mlir-example-ch3 Ch3/opt.toy -emit=mlir -opt +# module { +# toy.func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.mul %0, %1 : tensor<*xf64> +# toy.return %2 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.generic_call @multiply_transpose(%0, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %3 = toy.generic_call @multiply_transpose(%1, %0) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# toy.print %3 : tensor<*xf64> +# 
toy.return +# } +# } +``` + +- Ch4 + +```bash +$./build/Ch4/mlir-example-ch4 Ch4/opt.toy -emit=mlir +# module { +# toy.func private @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%0 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.transpose(%1 : tensor<*xf64>) to tensor<*xf64> +# %3 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.return %4 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> +# %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %6 = toy.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64> +# %7 = toy.reshape(%6 : tensor<2xf64>) to tensor<2x1xf64> +# %8 = toy.reshape(%7 : tensor<2x1xf64>) to tensor<2x1xf64> +# %9 = toy.reshape(%8 : tensor<2x1xf64>) to tensor<2x1xf64> +# toy.print %5 : tensor<*xf64> +# toy.return +# } +# } +$./build/Ch4/mlir-example-ch4 Ch4/opt.toy -emit=mlir -opt +# module { +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %2 = toy.mul %1, %1 : tensor<3x2xf64> +# toy.print %2 : tensor<3x2xf64> +# toy.return +# } +# } +``` + +- Ch5 + +```bash +$ ./build/Ch5/mlir-example-ch5 Ch5/example.toy -emit=mlir-affine +# module { +# func.func @main() { +# %cst = 
arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %2[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %2[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %2[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %2[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %2[1, 1] : memref<2x3xf64> +# affine.store %cst, %2[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = arith.mulf %3, %3 : f64 +# affine.store %4, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } +# } +$ ./build/Ch5/mlir-example-ch5 Ch5/example.toy -emit=mlir-affine -opt +# module { +# func.func @main() { +# %cst = arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %1[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %1[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %1[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %1[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %1[1, 1] : memref<2x3xf64> +# affine.store %cst, 
%1[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %2 = affine.load %1[%arg1, %arg0] : memref<2x3xf64> +# %3 = arith.mulf %2, %2 : f64 +# affine.store %3, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %1 : memref<2x3xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } +# } +``` + +- Ch6 + +```bash +$ ./build/Ch6/mlir-example-ch6 Ch6/example.toy -emit=jit +# 1.000000 16.000000 +# 4.000000 25.000000 +# 9.000000 36.000000 + +$ ./build/Ch6/mlir-example-ch6 Ch6/example.toy -emit=llvm --mlir-print-ir-after-all +# // -----// IR Dump After Canonicalizer (canonicalize) //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<*xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.print %4 : tensor<*xf64> +# toy.return +# } + +# // -----// IR Dump After Inliner (inline) //----- // +# module { +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<*xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.print %4 : tensor<*xf64> +# toy.return +# } +# } + + +# // -----// IR Dump After {anonymous}::ShapeInferencePass () //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# 
%1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<3x2xf64> +# %4 = toy.mul %2, %3 : tensor<3x2xf64> +# toy.print %4 : tensor<3x2xf64> +# toy.return +# } + +# // -----// IR Dump After Canonicalizer (canonicalize) //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<3x2xf64> +# %4 = toy.mul %2, %3 : tensor<3x2xf64> +# toy.print %4 : tensor<3x2xf64> +# toy.return +# } + +# // -----// IR Dump After CSE (cse) //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %2 = toy.mul %1, %1 : tensor<3x2xf64> +# toy.print %2 : tensor<3x2xf64> +# toy.return +# } + +# // -----// IR Dump After {anonymous}::ToyToAffineLoweringPass () //----- // +# module { +# func.func @main() { +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# %c0 = arith.constant 0 : index +# %c1 = arith.constant 1 : index +# %c2 = arith.constant 2 : index +# %cst = arith.constant 1.000000e+00 : f64 +# affine.store %cst, %2[%c0, %c0] : memref<2x3xf64> +# %cst_0 = arith.constant 2.000000e+00 : f64 +# affine.store %cst_0, %2[%c0, %c1] : memref<2x3xf64> +# %cst_1 = arith.constant 3.000000e+00 : f64 +# affine.store %cst_1, %2[%c0, %c2] : memref<2x3xf64> +# %cst_2 = arith.constant 4.000000e+00 : f64 +# 
affine.store %cst_2, %2[%c1, %c0] : memref<2x3xf64> +# %cst_3 = arith.constant 5.000000e+00 : f64 +# affine.store %cst_3, %2[%c1, %c1] : memref<2x3xf64> +# %cst_4 = arith.constant 6.000000e+00 : f64 +# affine.store %cst_4, %2[%c1, %c2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %5 = arith.mulf %3, %4 : f64 +# affine.store %5, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } +# } + + +# // -----// IR Dump After Canonicalizer (canonicalize) //----- // +# func.func @main() { +# %cst = arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %2[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %2[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %2[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %2[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %2[1, 1] : memref<2x3xf64> +# affine.store %cst, %2[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = affine.load 
%1[%arg0, %arg1] : memref<3x2xf64> +# %5 = arith.mulf %3, %4 : f64 +# affine.store %5, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } + +# // -----// IR Dump After CSE (cse) //----- // +# func.func @main() { +# %cst = arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %2[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %2[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %2[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %2[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %2[1, 1] : memref<2x3xf64> +# affine.store %cst, %2[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = arith.mulf %3, %3 : f64 +# affine.store %4, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } + +# // -----// IR Dump After {anonymous}::ToyToLLVMLoweringPass () //----- // +# module { +# llvm.func @free(!llvm.ptr) +# llvm.mlir.global internal constant @nl("\0A\00") +# llvm.mlir.global internal constant @frmt_spec("%f \00") +# llvm.func @printf(!llvm.ptr, ...) -> i32 +# llvm.func @malloc(i64) -> !llvm.ptr +# llvm.func @main() { +# ... 
+``` + +- Ch7 + +```bash +$ ./build/Ch7/mlir-example-ch7 Ch7/struct-codegen.toy -emit=jit +# 1.000000 16.000000 +# 4.000000 25.000000 +# 9.000000 36.000000 +``` + +- Ch8 + +```bash +$ ./build/Ch8/mlir-example-ch8 Ch8/matmul.toy.mlir -emit=mlir +# module { +# toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> +# toy.return %2 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> +# %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> +# toy.print %4 : tensor<*xf64> +# toy.return +# } +# } +``` + +```bash +$ ./build/Ch8/mlir-example-ch8 Ch8/matmul.toy -emit=jit +# 14.000000 32.000000 +# 32.000000 77.000000 +``` + +### Transform Dialect + +Please follow the [mlir-transform-tutorial](https://mlir.llvm.org/docs/Tutorials/transform/). If you have some questions about the way to run these examples, please check the top lines of each mlir file.
+ +- transform Ch2 + +```bash +$ ./build/transform_Ch2/transform-opt-ch2 --transform-interpreter transform_Ch2/ops.mlir +# module { +# func.func private @orig() +# func.func private @updated() +# func.func @test() { +# call @updated() : () -> () # <---- This will be changed to @updated from @orig +# return +# } +# module attributes {transform.with_named_sequence} { +# transform.named_sequence @__transform_main(%arg0: !transform.any_op) { +# %0 = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op +# transform.my.change_call_target %0, "updated" : !transform.any_op +# transform.yield +# } +# } +# } +``` + +- transform Ch3 + +```bash +$ ./build/transform_Ch3/transform-opt-ch3 --transform-interpreter transform_Ch3/ops.mlir --allow-unregistered-dialect --split-input-file +# module { +# func.func private @orig() +# func.func private @updated() +# func.func @test1() { +# call @updated() : () -> () +# return +# } +# module attributes {transform.with_named_sequence} { +# transform.named_sequence @__transform_main(%arg0: !transform.any_op) { +# %0 = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> +# transform.my.change_call_target %0, "updated" : !transform.op<"func.call"> +# transform.yield +# } +# } +# } + +# // ----- +# module { +# func.func private @orig() +# func.func @test2() { +# "my.mm4"() : () -> () +# return +# } +# module attributes {transform.with_named_sequence} { +# transform.named_sequence @__transform_main(%arg0: !transform.any_op) { +# %0 = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface +# %1 = transform.my.call_to_op %0 : (!transform.my.call_op_interface) -> !transform.any_op +# transform.yield +# } +# } +# } +``` diff --git a/mlir/cuda-tile/Toy/CMakeLists.txt b/mlir/cuda-tile/Toy/CMakeLists.txt new file mode 100644 index 0000000..0f4f6b9 --- /dev/null +++ 
b/mlir/cuda-tile/Toy/CMakeLists.txt @@ -0,0 +1,70 @@ +# For a better template to copy, see examples/standalone +include_directories(include) +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Core + Support + nativecodegen + OrcJIT + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) +mlir_tablegen(ToyCombine.inc -gen-rewriters) +add_public_tablegen_target(ToyCudaCombineIncGen) + +add_subdirectory(cuda_wrapper) + +add_executable( + toy-cuda + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + mlir/LowerToAffineLoops.cpp + mlir/LowerToLLVM.cpp + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp + mlir/LowerToGpu.cpp + mlir/LowerToCudaTile.cpp + mlir/EmitCudaTile.cpp + ) + +add_dependencies(toy-cuda + ToyCudaShapeInferenceInterfaceIncGen + ToyCudaOpsIncGen + ToyCudaCombineIncGen + ) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +target_link_directories(toy-cuda PRIVATE ${CUDA_TILE_BINARY_DIR}/lib) + +target_link_libraries(toy-cuda + PRIVATE + MLIRAnalysis + MLIRBuiltinToLLVMIRTranslation + MLIRCallInterfaces + MLIRCastInterfaces + MLIRExecutionEngine + MLIRFunctionInterfaces + MLIRIR + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRLLVMToLLVMIRTranslation + MLIRMemRefDialect + MLIRParser + MLIRPass + MLIRRegisterAllDialects + MLIRRegisterAllExtensions + MLIRRegisterAllPasses + MLIRSideEffectInterfaces + MLIRSupport + MLIRTargetLLVMIRExport + MLIRTransforms + CudaTileDialect + CudaTileTransforms + CudaTileBytecodeWriter + CudaTileBytecodeCommon + cuda_shim + ) diff --git a/mlir/cuda-tile/Toy/cuda_wrapper/CMakeLists.txt b/mlir/cuda-tile/Toy/cuda_wrapper/CMakeLists.txt new file mode 100644 index 0000000..39b0a8a --- /dev/null +++ b/mlir/cuda-tile/Toy/cuda_wrapper/CMakeLists.txt @@ -0,0 +1,3 @@ +add_library(cuda_shim STATIC cuda_shim.cpp) +target_include_directories(cuda_shim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) +target_link_libraries(cuda_shim PRIVATE CUDA::cuda_driver 
CUDA::cudart_static) diff --git a/mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp b/mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp new file mode 100644 index 0000000..3b5d351 --- /dev/null +++ b/mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp @@ -0,0 +1,528 @@ +//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements C wrappers around the CUDA library for easy linking in ORC jit. +// Also adds some debugging helpers that are helpful when writing MLIR code to +// run on GPUs. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "cuda.h" +#include "cuda_bf16.h" +#include "cuda_fp16.h" +#include + +// We assume the program runs on the linux platform if not on Windows. +// Copy from +// third_party/llvm-project/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp + +#if CUDA_VERSION >= 13000 + +#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default"))) + +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) + +thread_local static int32_t defaultDevice = 0; + +/// Helper method that checks environment value for debugging. +static bool isDebugEnabled() { + const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG"; + static bool isEnabled = getenv(kDebugEnvironmentVariable) != nullptr; + return isEnabled; +} + +#define debug_print(fmt, ...) 
\ + do { \ + if (isDebugEnabled()) \ + fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \ + __func__, __VA_ARGS__); \ + } while (0) + +// Returns default CUdevice +static CUdevice getDefaultCuDevice() { + CUdevice device; + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + return device; +} + +// Make the primary context of the current default device current for the +// duration +// of the instance and restore the previous context on destruction. +class ScopedContext { +public: + ScopedContext() { + // Static reference to CUDA primary context for device ordinal + // defaultDevice. + static CUcontext context = [] { + CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); + CUcontext ctx; + // Note: this does not affect the current context. + CUDA_REPORT_IF_ERROR( + cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice())); + return ctx; + }(); + + CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); + } + + ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } +}; + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { + ScopedContext scopedContext; + CUmodule module = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data, + int optLevel) { + ScopedContext scopedContext; + CUmodule module = nullptr; + char jitErrorBuffer[4096] = {0}; + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_OPTIMIZATION_LEVEL}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast(sizeof(jitErrorBuffer)), + reinterpret_cast(optLevel)}; + + CUresult result = + cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals); + if (result) { + fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer); + CUDA_REPORT_IF_ERROR(result); + } + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 
mgpuModuleUnload(CUmodule module) { + CUDA_REPORT_IF_ERROR(cuModuleUnload(module)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction +mgpuModuleGetFunction(CUmodule module, const char *name) { + CUfunction function = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); + return function; +} + +// The wrapper uses intptr_t instead of CUDA's unsigned int to match +// the type of MLIR's index type. This avoids the need for casts in the +// generated MLIR code. +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, + intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, CUstream stream, void **params, + void **extra, size_t /*paramsCount*/) { + ScopedContext scopedContext; + if (smem > 0) { + // Avoid checking driver as it's more expensive than if statement + int32_t maxShmem = 0; + CUdevice device = getDefaultCuDevice(); + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( + &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + if (maxShmem < smem) { + fprintf(stderr, + "Requested shared memory (%dkb) is larger than maximum allowed " + "shared memory (%dkb) for this device\n", + smem, maxShmem); + } + CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( + function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); + } + debug_print("Launching kernel, grid=%ld,%ld,%ld, " + "threads: %ld, %ld, %ld, " + "smem: %dkb\n", + gridX, gridY, gridZ, blockX, blockY, blockZ, smem); + CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, + blockY, blockZ, smem, stream, params, + extra)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { + ScopedContext scopedContext; + CUstream stream = nullptr; + CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + return stream; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 
mgpuStreamDestroy(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuStreamSynchronize(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, + CUevent event) { + CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { + ScopedContext scopedContext; + CUevent event = nullptr; + CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); + return event; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, + CUstream stream) { + CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) { + ScopedContext scopedContext; + CUdeviceptr ptr = 0; + if (sizeBytes == 0) + return reinterpret_cast(ptr); + + if (isHostShared) { + CUDA_REPORT_IF_ERROR( + cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL)); + return reinterpret_cast(ptr); + } + CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); + return reinterpret_cast(ptr); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr, + CUstream /*stream*/) { + CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(ptr))); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast(dst), + reinterpret_cast(src), + sizeBytes, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream 
stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { + defaultDevice = device; +} + +// ===----------------------------------------------------------------------===// + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCtxSynchronize() { + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuCtxSynchronize()); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyHtoD(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyHtoD(reinterpret_cast(dst), src, sizeBytes)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyDtoH(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyDtoH(dst, reinterpret_cast(src), sizeBytes)); +} + +//===----------------------------------------------------------------------===// + +static inline CUdeviceptr asDevPtr(uint64_t h) { + return static_cast(h); +} +static inline uint64_t asHandle(CUdeviceptr p) { + return static_cast(p); +} + +static inline CUstream asStream(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asStreamHandle(CUstream s) { + return static_cast(reinterpret_cast(s)); +} + +static inline CUevent asEvent(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asEventHandle(CUevent e) { + return static_cast(reinterpret_cast(e)); +} + +static inline void *asHostPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline const void *asHostCPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} + +// Align up helper +static inline uint64_t alignUp(uint64_t x, uint64_t a) { + return (x + (a - 1)) & ~(a - 1); +} + +// Load module from PTX 
or CUBIN image in memory. +// Driver API supports cuModuleLoadDataEx for both PTX and cubin (it +// auto-detects). +extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr, + uint64_t image_nbytes) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoad(data, image_nbytes); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr, + uint64_t image_nbytes, + int opt_level) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoadJIT(data, opt_level); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t +cuda_shim_load_module_from_file(uint64_t file_path_ptr, + uint64_t /*file_path_nbytes*/) { + auto file_path_cstr = + reinterpret_cast(asHostCPtr(file_path_ptr)); + // fprintf(stdout, "%s", file_path_cstr); + CUmodule module = nullptr; + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuModuleLoad(&module, file_path_cstr)); + return static_cast(reinterpret_cast(module)); +} + +extern "C" void cuda_shim_unload_module(uint64_t module_handle) { + CUmodule module = + reinterpret_cast(static_cast(module_handle)); + mgpuModuleUnload(module); +} + +extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream, + bool is_host_shared) { + CUstream cu_stream = asStream(stream); + if (stream == 0) + cu_stream = nullptr; + void *ptr = mgpuMemAlloc(nbytes, /*stream=*/cu_stream, + /*isHostShared=*/is_host_shared); + return static_cast(reinterpret_cast(ptr)); +} + +extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream) { + CUstream cu_stream = asStream(stream); + void *ptr = reinterpret_cast(static_cast(dptr)); + if (stream == 0) { + cu_stream = nullptr; + } + mgpuMemFree(ptr, /*stream=*/cu_stream); +} + +extern "C" void cuda_shim_memset32(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); 
+ CUstream cu_stream = asStream(stream); + mgpuMemset32(ptr, value, count_dwords, cu_stream); +} + +extern "C" void cuda_shim_memset16(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); + CUstream cu_stream = asStream(stream); + mgpuMemset16(ptr, value, count_dwords, cu_stream); +} + +extern "C" uint64_t cuda_shim_stream_create(void) { + CUstream stream = mgpuStreamCreate(); + return asStreamHandle(stream); +} + +extern "C" void cuda_shim_stream_destroy(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamDestroy(cu_stream); +} + +extern "C" void cuda_shim_stream_synchronize(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamSynchronize(cu_stream); +} + +extern "C" uint64_t cuda_shim_event_create(void) { + CUevent event = mgpuEventCreate(); + return asEventHandle(event); +} + +extern "C" void cuda_shim_event_destroy(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventDestroy(event); +} + +extern "C" void cuda_shim_event_record(uint64_t ev, uint64_t stream) { + CUevent event = asEvent(ev); + CUstream cu_stream = asStream(stream); + mgpuEventRecord(event, cu_stream); +} + +extern "C" void cuda_shim_event_synchronize(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventSynchronize(event); +} + +extern "C" void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev) { + CUstream cu_stream = asStream(stream); + CUevent event = asEvent(ev); + mgpuStreamWaitEvent(cu_stream, event); +} + +// ----------------------------- Memcpy (raw ABI) -------------------------- +// Host pointers are passed as uint64_t. This is the key of 2A. 
+ +extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_dptr); + auto src = asHostPtr(src_hptr); + mgpuMemcpyHtoD(dst, src, static_cast(nbytes)); +} + +extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_hptr); + auto src = asHostPtr(src_dptr); + mgpuMemcpyDtoH(dst, src, static_cast(nbytes)); +} + +// ----------------------------- Kernel launch ----------------------------- +// The hardest part is kernelParams (void**). +// We avoid building it in MLIR. Instead MLIR passes: +// - arg_data_ptr: host pointer to a packed buffer containing raw argument bytes +// - arg_sizes_ptr: host pointer to uint64_t[num_args], each is the byte-size of +// that argument The shim constructs kernelParams[i] = &arg_data[offset_i] with +// 8-byte alignment. This matches typical ABI expectations for scalar/pointer +// args. If you have special alignment requirements, extend this (e.g., per-arg +// alignment array). 
+ +extern "C" void cuda_shim_launch_packed( + uint64_t module_handle, uint64_t kernel_name_ptr, uint32_t gridX, + uint32_t gridY, uint32_t gridZ, uint32_t blockX, uint32_t blockY, + uint32_t blockZ, uint32_t sharedMemBytes, uint64_t stream, + uint64_t arg_data_ptr, uint64_t arg_sizes_ptr, uint32_t num_args) { + + auto mh = reinterpret_cast(static_cast(module_handle)); + if (!mh) { + fprintf(stderr, "[cuda_shim] launch_packed: invalid module handle\n"); + abort(); + } + + const char *kname = + reinterpret_cast(asHostCPtr(kernel_name_ptr)); + if (!kname) { + fprintf(stderr, "[cuda_shim] launch_packed: null kernel name\n"); + abort(); + } + + CUfunction fn = mgpuModuleGetFunction(mh, kname); + + auto *argData = reinterpret_cast(asHostPtr(arg_data_ptr)); + auto *argSizes = + reinterpret_cast(asHostCPtr(arg_sizes_ptr)); + + if (num_args > 0 && (!argData || !argSizes)) { + fprintf(stderr, "[cuda_shim] launch_packed: argData/argSizes null\n"); + abort(); + } + + // Build kernelParams array on heap (safe for large num_args). + std::vector params; + params.resize(num_args); + + uint64_t off = 0; + for (uint32_t i = 0; i < num_args; ++i) { + // 8-byte align each argument start (common safe default). 
+ off = alignUp(off, 8); + params[i] = argData + off; + off += argSizes[i]; + } + + auto cu_stream = asStream(stream); + + if (stream == 0) { + cu_stream = nullptr; + } + + mgpuLaunchKernel(fn, static_cast(gridX), + static_cast(gridY), static_cast(gridZ), + static_cast(blockX), static_cast(blockY), + static_cast(blockZ), + static_cast(sharedMemBytes), cu_stream, + params.data(), nullptr, static_cast(num_args)); +} + +// Convenience: 1D launch, shared=0, stream optional +extern "C" void +cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr, + uint32_t blockX, uint32_t blockY, uint32_t blockZ, + uint64_t stream, uint64_t arg_data_ptr, + uint64_t arg_sizes_ptr, uint32_t num_args) { + cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX, + blockY, blockZ, 0, stream, arg_data_ptr, + arg_sizes_ptr, num_args); +} + +// Optional: global sync (avoid in async pipeline; prefer event/stream sync) +extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); } + +// only for debugging +// extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) { +// auto *p = reinterpret_cast(static_cast(dptr)); +// for (uint32_t i = 0; i < n; ++i) { +// fprintf(stderr, "i=%u v=%f\n", i, p[i]); +// } +// } + +#endif diff --git a/mlir/cuda-tile/Toy/include/CMakeLists.txt b/mlir/cuda-tile/Toy/include/CMakeLists.txt new file mode 100644 index 0000000..37c89d0 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/cuda-tile/Toy/include/toy/AST.h b/mlir/cuda-tile/Toy/include/toy/AST.h new file mode 100644 index 0000000..d2ba101 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/AST.h @@ -0,0 +1,246 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_AST_H +#define TOY_AST_H + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(std::move(location)) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double val; + +public: + NumberExprAST(Location loc, double val) + : ExprAST(Expr_Num, std::move(loc)), val(val) {} + + double getValue() { return val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. 
+class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, std::move(loc)), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, std::move(loc)), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. +class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, std::move(loc)), name(name), + type(std::move(type)), initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + std::optional> expr; + +public: + ReturnExprAST(Location loc, std::optional> expr) + : ExprAST(Expr_Return, std::move(loc)), expr(std::move(expr)) {} + + std::optional getExpr() { + if (expr.has_value()) + return expr->get(); + return std::nullopt; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. 
+class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, std::move(loc)), op(op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. +class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, std::move(loc)), callee(callee), + args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, std::move(loc)), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). 
+class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(std::move(location)), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. +class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() { return functions.begin(); } + auto end() { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // TOY_AST_H diff --git a/mlir/cuda-tile/Toy/include/toy/CMakeLists.txt b/mlir/cuda-tile/Toy/include/toy/CMakeLists.txt new file mode 100644 index 0000000..58f7e8e --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/CMakeLists.txt @@ -0,0 +1,13 @@ +# Most dialects should use add_mlir_dialect(). See examples/standalone. +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls) +mlir_tablegen(Ops.cpp.inc -gen-op-defs) +mlir_tablegen(Dialect.h.inc -gen-dialect-decls) +mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) +add_public_tablegen_target(ToyCudaOpsIncGen) + +# Most dialects should use add_mlir_interfaces(). 
+set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) +mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(ToyCudaShapeInferenceInterfaceIncGen) diff --git a/mlir/cuda-tile/Toy/include/toy/Dialect.h b/mlir/cuda-tile/Toy/include/toy/Dialect.h new file mode 100644 index 0000000..5db325e --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Dialect.h @@ -0,0 +1,36 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. +// See docs/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/CastInterfaces.h" +#include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "toy/ShapeInferenceInterface.h" + +/// Include the auto-generated header file containing the declaration of the toy +/// dialect. +#include "toy/Dialect.h.inc" + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. 
+#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/cuda-tile/Toy/include/toy/Lexer.h b/mlir/cuda-tile/Toy/include/toy/Lexer.h new file mode 100644 index 0000000..22822cc --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Lexer.h @@ -0,0 +1,233 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_LEXER_H +#define TOY_LEXER_H + +#include "llvm/ADT/StringRef.h" + +#include +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. 
+class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. + Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. 
+ if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. + lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. 
+ std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. + Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. + llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // TOY_LEXER_H diff --git a/mlir/cuda-tile/Toy/include/toy/MLIRGen.h b/mlir/cuda-tile/Toy/include/toy/MLIRGen.h new file mode 100644 index 0000000..fe9dbe5 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/MLIRGen.h @@ -0,0 +1,35 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_MLIRGEN_H +#define TOY_MLIRGEN_H + +#include + +namespace mlir { +class MLIRContext; +template +class OwningOpRef; +class ModuleOp; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. +mlir::OwningOpRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST); +} // namespace toy + +#endif // TOY_MLIRGEN_H diff --git a/mlir/cuda-tile/Toy/include/toy/Ops.td b/mlir/cuda-tile/Toy/include/toy/Ops.td new file mode 100644 index 0000000..5aa524c --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Ops.td @@ -0,0 +1,498 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/Interfaces/FunctionInterfaces.td" +include "mlir/IR/SymbolInterfaces.td" +include "mlir/Interfaces/CallInterfaces.td" +include "mlir/Interfaces/CastInterfaces.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "toy/ShapeInferenceInterface.td" + +def F32ElementsAttr : FloatElementsAttr<32>; + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. 
+def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "::mlir::toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. +class Toy_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp +//===----------------------------------------------------------------------===// + +// We define a toy operation by inheriting from our base 'Toy_Op' class above. +// Here we provide the mnemonic and a list of traits for the operation. The +// constant operation is marked as 'Pure' as it is a pure operation +// and may be removed if dead. +def ConstantOp : Toy_Op<"constant", [Pure]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + ```mlir + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf32> + ``` + }]; + + // The constant operation takes an attribute as the only input. + let arguments = (ins F32ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + let results = (outs F32Tensor); + + // Indicate that the operation has a custom parser and printer method. + let hasCustomAssemblyFormat = 1; + + // Add custom build methods for the constant operation. These method populates + // the `state` that MLIR uses to create operations, i.e. 
these are used when + // using `ConstantOp::create(builder, ...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<(ins "DenseElementsAttr":$value), [{ + build($_builder, $_state, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. + OpBuilder<(ins "float":$value)> + ]; + + // Indicate that additional verification for this operation is necessary. + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// AddOp +//===----------------------------------------------------------------------===// + +def AddOp : Toy_Op<"add", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F32Tensor:$lhs, F32Tensor:$rhs); + let results = (outs F32Tensor); + + // Indicate that the operation has a custom parser and printer method. + let hasCustomAssemblyFormat = 1; + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<(ins "Value":$lhs, "Value":$rhs)> + ]; +} + +//===----------------------------------------------------------------------===// +// CastOp +//===----------------------------------------------------------------------===// + +def CastOp : Toy_Op<"cast", [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + Pure, + SameOperandsAndResultShape + ]> { + let summary = "shape cast operation"; + let description = [{ + The "cast" operation converts a tensor from one type to an equivalent type + without changing any data elements. The source and destination types must + both be tensor types with the same element type. If both are ranked, then + shape is required to match. The operation is invalid if converting to a + mismatching constant dimension. 
+ }]; + + let arguments = (ins F32Tensor:$input); + let results = (outs F32Tensor:$output); + + let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)"; +} + +//===----------------------------------------------------------------------===// +// FuncOp +//===----------------------------------------------------------------------===// + +def FuncOp : Toy_Op<"func", [ + FunctionOpInterface, IsolatedFromAbove + ]> { + let summary = "user defined function operation"; + let description = [{ + The "toy.func" operation represents a user defined function. These are + callable SSA-region operations that contain toy computations. + + Example: + + ```mlir + toy.func @main() { + %0 = toy.constant dense<5.500000e+00> : tensor + %1 = toy.reshape(%0 : tensor) to tensor<2x2xf32> + toy.print %1 : tensor<2x2xf32> + toy.return + } + ``` + }]; + + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttrOf:$function_type, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + let regions = (region AnyRegion:$body); + + let builders = [OpBuilder<(ins + "StringRef":$name, "FunctionType":$type, + CArg<"ArrayRef", "{}">:$attrs) + >]; + let extraClassDeclaration = [{ + //===------------------------------------------------------------------===// + // FunctionOpInterface Methods + //===------------------------------------------------------------------===// + + /// Returns the argument types of this function. + ArrayRef getArgumentTypes() { return getFunctionType().getInputs(); } + + /// Returns the result types of this function. + ArrayRef getResultTypes() { return getFunctionType().getResults(); } + + /// Returns the region on the function operation that is callable. 
+ Region *getCallableRegion() { return &getBody(); } + }]; + let hasCustomAssemblyFormat = 1; + let skipDefaultBuilders = 1; +} + +//===----------------------------------------------------------------------===// +// GenericCallOp +//===----------------------------------------------------------------------===// + +def GenericCallOp : Toy_Op<"generic_call", + [DeclareOpInterfaceMethods]> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. For example: + + ```mlir + %4 = toy.generic_call @my_func(%1, %3) + : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<*xf32> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. + }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$inputs, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + + // The generic call operation returns a single value of TensorType. + let results = (outs F32Tensor); + + // Specialize assembly printing and parsing using a declarative format. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + + // Add custom build methods for the generic call operation. 
+ let builders = [ + OpBuilder<(ins "StringRef":$callee, "ArrayRef":$arguments)> + ]; +} + +//===----------------------------------------------------------------------===// +// MulOp +//===----------------------------------------------------------------------===// + +def MulOp : Toy_Op<"mul", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F32Tensor:$lhs, F32Tensor:$rhs); + let results = (outs F32Tensor); + + // Indicate that the operation has a custom parser and printer method. + let hasCustomAssemblyFormat = 1; + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<(ins "Value":$lhs, "Value":$rhs)> + ]; +} + +//===----------------------------------------------------------------------===// +// PrintOp +//===----------------------------------------------------------------------===// + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + // We also allow a F32MemRef to enable interop during partial lowering. + let arguments = (ins AnyTypeOf<[F32Tensor, F32MemRef]>:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; +} + +//===----------------------------------------------------------------------===// +// ReshapeOp +//===----------------------------------------------------------------------===// + +def ReshapeOp : Toy_Op<"reshape", [Pure]> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. 
For example: + + ```mlir + %0 = toy.reshape (%arg1 : tensor<10xf32>) to tensor<5x2xf32> + ``` + }]; + + let arguments = (ins F32Tensor:$input); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; + + // We expect that the reshape operation returns a statically shaped tensor. + let results = (outs StaticShapeTensorOf<[F32]>); +} + +//===----------------------------------------------------------------------===// +// ReturnOp +//===----------------------------------------------------------------------===// + +def ReturnOp : Toy_Op<"return", [Pure, HasParent<"FuncOp, GPUFuncOp">, + Terminator]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional tensor operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + toy.func @foo() -> tensor<2xf32> { + ... + toy.return %0 : tensor<2xf32> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + + // Allow building a ReturnOp with no return operand. + let builders = [ + OpBuilder<(ins), [{ build($_builder, $_state, {}); }]> + ]; + + // Provide extra utility definitions on the c++ operation class definition. + let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Indicate that additional verification for this operation is necessary. 
+ let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +def TransposeOp : Toy_Op<"transpose", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "transpose operation"; + + let arguments = (ins F32Tensor:$input); + let results = (outs F32Tensor); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<(ins "Value":$input)> + ]; + + // Indicate that additional verification for this operation is necessary. + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// MatMul Op +//===----------------------------------------------------------------------===// + +def MatMulOp : Toy_Op<"matmul", + [Pure, DeclareOpInterfaceMethods, MemoryEffectsOpInterface]> { + let summary = "matrix multiplication operation"; + let description = [{ + The "matmul" operation performs Matrix multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F32Tensor:$lhs, F32Tensor:$rhs); + let results = (outs Res, + MemAlloc]>:$output); + + let assemblyFormat = [{ + `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results) + }]; + + // Allow building a MatMulOp with from the two input operands. 
+ let builders = [ + OpBuilder<(ins "Value":$lhs, "Value":$rhs)> + ]; + + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// lauch GPU Op +//===----------------------------------------------------------------------===// + +def LaunchGpuOp : Toy_Op<"launch_gpu", + [DeclareOpInterfaceMethods]> { + let summary = "launch gpu kernel operation"; + let description = [{ + The "launch_gpu" operation launches a GPU kernel with given grid + dimensions. + + ```mlir + %4 = toy.launch_gpu @my_func(%1, %3) {grid = [16, 16, 1]} + : (tensor<2x3xf32>, tensor<2x3xf32>) + ``` + + }]; + + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$inputs, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + + let results = (outs Variadic:$results); + + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + + let builders = [ + OpBuilder<(ins "StringRef":$callee, "ArrayRef":$arguments)> + ]; +} + +//===----------------------------------------------------------------------===// +// GPUFuncOp +//===----------------------------------------------------------------------===// + +def GPUFuncOp : Toy_Op<"gpu_func", [ + FunctionOpInterface, IsolatedFromAbove + ]> { + let summary = "GPU kernel function operation"; + let description = [{ + The "toy.gpu_func" operation represents a GPU kernel function. These are + callable SSA-region operations that contain toy computations to be run on + the GPU. + + Example: + + ```mlir + toy.gpu_func @my_kernel(tensor<*xf32> %arg0, tensor<*xf32> %arg1) { + ... 
+ toy.return + } + ``` + }]; + + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttrOf:$function_type, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + + let regions = (region AnyRegion:$body); + + let builders = [OpBuilder<(ins + "StringRef":$name, "FunctionType":$type, + CArg<"ArrayRef", "{}">:$attrs) + >]; + let extraClassDeclaration = [{ + //===------------------------------------------------------------------===// + // FunctionOpInterface Methods + //===------------------------------------------------------------------===// + + /// Returns the argument types of this function. + ArrayRef getArgumentTypes() { return getFunctionType().getInputs(); } + + /// Returns the result types of this function. + ArrayRef getResultTypes() { return getFunctionType().getResults(); } + + /// Returns the region on the function operation that is callable. + Region *getCallableRegion() { return &getBody(); } + }]; + let hasCustomAssemblyFormat = 1; + let skipDefaultBuilders = 1; +} + +#endif // TOY_OPS diff --git a/mlir/cuda-tile/Toy/include/toy/Parser.h b/mlir/cuda-tile/Toy/include/toy/Parser.h new file mode 100644 index 0000000..1f20616 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Parser.h @@ -0,0 +1,489 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TOY_PARSER_H +#define TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. + Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions one at a time and accumulate in this vector. + std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + std::optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. 
+ /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. 
+ if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name(lexer.getId()); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. + return std::make_unique(std::move(loc), name); + + // This is a function call. 
+ lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. + int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. 
+ auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// initializer. 
+ /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id(lexer.getId()); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + + if (lexer.getCurToken() != tok_def) + return parseError("def", "in prototype"); + lexer.consume(tok_def); + + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName(lexer.getId()); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name(lexer.getId()); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError(")", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // TOY_PARSER_H diff --git a/mlir/cuda-tile/Toy/include/toy/Passes.h b/mlir/cuda-tile/Toy/include/toy/Passes.h new file mode 100644 index 0000000..0b057c1 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Passes.h @@ -0,0 +1,44 @@ +//===- Passes.h - Toy Passes Definition -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the entry points to create compiler passes for Toy. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TOY_PASSES_H +#define TOY_PASSES_H + +#include +#include + +namespace mlir { +class Pass; + +namespace toy { +std::unique_ptr createShapeInferencePass(); + +/// Create a pass for lowering to operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr createLowerToAffinePass(); + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. +std::unique_ptr createLowerToLLVMPass(); + +std::unique_ptr createGpuOutlinePass(std::string grid = "1,1,1"); + +std::unique_ptr createCudaTileLoweringPass(); + +std::unique_ptr +createEmbedCudaTileBinaryPass(std::string tileirasExe = "tileiras", + std::string gpuName = "sm_120"); + +} // namespace toy +} // namespace mlir + +#endif // TOY_PASSES_H diff --git a/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.h b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.h new file mode 100644 index 0000000..cfe5a87 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.h @@ -0,0 +1,28 @@ +//===- ShapeInferenceInterface.h - Interface definitions for ShapeInference -=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the shape inference interfaces defined +// in ShapeInferenceInterface.td. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ +#define MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace toy { + +/// Include the auto-generated declarations. +#include "toy/ShapeInferenceOpInterfaces.h.inc" + +} // namespace toy +} // namespace mlir + +#endif // MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ diff --git a/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.td b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.td new file mode 100644 index 0000000..2279015 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.td @@ -0,0 +1,30 @@ +//===- ShapeInferenceInterface.td - Shape Inference Interface -*- tablegen -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Shape Inference Op Interface. +// +//===----------------------------------------------------------------------===// + +#ifndef SHAPE_INFERENCE_INTERFACE +#define SHAPE_INFERENCE_INTERFACE + +include "mlir/IR/OpBase.td" + +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. 
+ }]; + + let methods = [ + InterfaceMethod<"Infer and set the output shape for the current operation.", + "void", "inferShapes"> + ]; +} + +#endif // SHAPE_INFERENCE_INTERFACE diff --git a/mlir/cuda-tile/Toy/mlir/Dialect.cpp b/mlir/cuda-tile/Toy/mlir/Dialect.cpp new file mode 100644 index 0000000..a1dca39 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/Dialect.cpp @@ -0,0 +1,572 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include + +using namespace mlir; +using namespace mlir::toy; + +#include "toy/Dialect.cpp.inc" + +//===----------------------------------------------------------------------===// +// ToyInlinerInterface +//===----------------------------------------------------------------------===// + +/// This class defines the interface for handling inlining with Toy +/// operations. 
+struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + /// All call operations within toy can be inlined. + bool isLegalToInline(Operation *call, Operation *callable, + bool wouldBeCloned) const final { + return true; + } + + /// All operations within toy can be inlined. + bool isLegalToInline(Operation *, Region *, bool, IRMapping &) const final { + return true; + } + + // All functions within toy can be inlined. + bool isLegalToInline(Region *, Region *, bool, IRMapping &) const final { + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + /// Handle the given inlined terminator(toy.return) by replacing it with a new + /// operation as necessary. + void handleTerminator(Operation *op, ValueRange valuesToRepl) const final { + // Only "toy.return" needs to be handled here. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. + assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()].replaceAllUsesWith(it.value()); + } + + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. 
+ Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return CastOp::create(builder, conversionLoc, resultType, input); + } +}; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect initialization, the instance will be owned by the context. This is +/// the point of registration of types and operations for the dialect. +void ToyDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); + addInterfaces(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +/// A generalized parser for binary operations. This parses the different forms +/// of 'printBinaryOp' below. +static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + SmallVector operands; + SMLoc operandsLoc = parser.getCurrentLocation(); + Type type; + if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(type)) + return mlir::failure(); + + // If the type is a function type, it contains the input and result types of + // this operation. + if (FunctionType funcType = llvm::dyn_cast(type)) { + if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc, + result.operands)) + return mlir::failure(); + result.addTypes(funcType.getResults()); + return mlir::success(); + } + + // Otherwise, the parsed type is the type of both operands and results. + if (parser.resolveOperands(operands, type, result.operands)) + return mlir::failure(); + result.addTypes(type); + return mlir::success(); +} + +/// A generalized printer for binary operations. 
It prints in two different +/// forms depending on if all of the types match. +static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) { + printer << " " << op->getOperands(); + printer.printOptionalAttrDict(op->getAttrs()); + printer << " : "; + + // If all of the types are the same, print the type directly. + Type resultType = *op->result_type_begin(); + if (llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })) { + printer << resultType; + return; + } + + // Otherwise, print a functional type. + printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes()); +} + +//===----------------------------------------------------------------------===// +// ConstantOp +//===----------------------------------------------------------------------===// + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + float value) { + auto dataType = RankedTensorType::get({}, builder.getF32Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. 
+mlir::ParseResult ConstantOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::DenseElementsAttr value; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(value, "value", result.attributes)) + return failure(); + + result.addTypes(value.getType()); + return success(); +} + +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +void ConstantOp::print(mlir::OpAsmPrinter &printer) { + printer << " "; + printer.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{"value"}); + printer << getValue(); +} + +/// Verifier for the constant operation. This corresponds to the +/// `let hasVerifier = 1` in the op definition. +llvm::LogicalResult ConstantOp::verify() { + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = + llvm::dyn_cast(getResult().getType()); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the constant + // result type. + auto attrType = llvm::cast(getValue().getType()); + if (attrType.getRank() != resultType.getRank()) { + return emitOpError("return type must match the one of the attached value " + "attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. 
+ for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// AddOp +//===----------------------------------------------------------------------===// + +void AddOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands({lhs, rhs}); +} + +mlir::ParseResult AddOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + return parseBinaryOp(parser, result); +} + +void AddOp::print(mlir::OpAsmPrinter &p) { printBinaryOp(p, *this); } + +/// Infer the output shape of the AddOp, this is required by the shape inference +/// interface. +void AddOp::inferShapes() { getResult().setType(getLhs().getType()); } + +//===----------------------------------------------------------------------===// +// CastOp +//===----------------------------------------------------------------------===// + +/// Infer the output shape of the CastOp, this is required by the shape +/// inference interface. +void CastOp::inferShapes() { getResult().setType(getInput().getType()); } + +/// Returns true if the given set of input and result types are compatible with +/// this cast operation. This is required by the `CastOpInterface` to verify +/// this operation and provide other additional utilities. +bool CastOp::areCastCompatible(TypeRange inputs, TypeRange outputs) { + if (inputs.size() != 1 || outputs.size() != 1) + return false; + // The inputs must be Tensors with the same element type. 
+ TensorType input = llvm::dyn_cast(inputs.front()); + TensorType output = llvm::dyn_cast(outputs.front()); + if (!input || !output || input.getElementType() != output.getElementType()) + return false; + // The shape is required to match if both types are ranked. + return !input.hasRank() || !output.hasRank() || input == output; +} + +//===----------------------------------------------------------------------===// +// FuncOp +//===----------------------------------------------------------------------===// + +void FuncOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + llvm::StringRef name, mlir::FunctionType type, + llvm::ArrayRef attrs) { + // FunctionOpInterface provides a convenient `build` method that will populate + // the state of our FuncOp, and create an entry block. + buildWithEntryBlock(builder, state, name, type, attrs, type.getInputs()); +} + +mlir::ParseResult FuncOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + // Dispatch to the FunctionOpInterface provided utility method that parses the + // function operation. + auto buildFuncType = + [](mlir::Builder &builder, llvm::ArrayRef argTypes, + llvm::ArrayRef results, + mlir::function_interface_impl::VariadicFlag, + std::string &) { return builder.getFunctionType(argTypes, results); }; + + return mlir::function_interface_impl::parseFunctionOp( + parser, result, /*allowVariadic=*/false, + getFunctionTypeAttrName(result.name), buildFuncType, + getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); +} + +void FuncOp::print(mlir::OpAsmPrinter &p) { + // Dispatch to the FunctionOpInterface provided utility method that prints the + // function operation. 
+ mlir::function_interface_impl::printFunctionOp( + p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(), + getArgAttrsAttrName(), getResAttrsAttrName()); +} + +//===----------------------------------------------------------------------===// +// GenericCallOp +//===----------------------------------------------------------------------===// + +void GenericCallOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands(arguments); + state.addAttribute("callee", + mlir::SymbolRefAttr::get(builder.getContext(), callee)); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return (*this)->getAttrOfType("callee"); +} + +/// Set the callee for the generic call operation, this is required by the call +/// interface. +void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) { + (*this)->setAttr("callee", cast(callee)); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. +Operation::operand_range GenericCallOp::getArgOperands() { return getInputs(); } + +/// Get the argument operands to the called function as a mutable range, this is +/// required by the call interface. 
+MutableOperandRange GenericCallOp::getArgOperandsMutable() { + return getInputsMutable(); +} + +//===----------------------------------------------------------------------===// +// MulOp +//===----------------------------------------------------------------------===// + +void MulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands({lhs, rhs}); +} + +mlir::ParseResult MulOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + return parseBinaryOp(parser, result); +} + +void MulOp::print(mlir::OpAsmPrinter &p) { printBinaryOp(p, *this); } + +/// Infer the output shape of the MulOp, this is required by the shape inference +/// interface. +void MulOp::inferShapes() { getResult().setType(getLhs().getType()); } + +//===----------------------------------------------------------------------===// +// ReturnOp +//===----------------------------------------------------------------------===// + +llvm::LogicalResult ReturnOp::verify() { + // Parent can be FuncOp or GPUFuncOp; both implement FunctionOpInterface. + auto *parent = (*this)->getParentOp(); + auto function = dyn_cast(parent); + if (!function) + return emitOpError() << "must be enclosed in a function-like op"; + + /// ReturnOps can only have a single optional operand. + if (getNumOperands() > 1) + return emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + auto funcType = llvm::cast(function.getFunctionType()); + const auto &results = funcType.getResults(); + if (getNumOperands() != results.size()) + return emitOpError() << "does not return the same number of values (" + << getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. 
+ if (!hasOperand()) + return mlir::success(); + + auto inputType = *operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. + if (inputType == resultType || + llvm::isa(inputType) || + llvm::isa(resultType)) + return mlir::success(); + + return emitError() << "type of return operand (" << inputType + << ") doesn't match function result type (" << resultType + << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +void TransposeOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands(value); +} + +void TransposeOp::inferShapes() { + auto arrayTy = llvm::cast(getOperand().getType()); + SmallVector dims(llvm::reverse(arrayTy.getShape())); + getResult().setType(RankedTensorType::get(dims, arrayTy.getElementType())); +} + +llvm::LogicalResult TransposeOp::verify() { + auto inputType = llvm::dyn_cast(getOperand().getType()); + auto resultType = llvm::dyn_cast(getType()); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// MatMulOp +//===----------------------------------------------------------------------===// + +void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the MatMulOp, this is required by the shape 
+/// inference interface. +void MatMulOp::inferShapes() { + RankedTensorType lhsType = + llvm::dyn_cast(getLhs().getType()); + RankedTensorType rhsType = + llvm::dyn_cast(getRhs().getType()); + auto lhsShape = lhsType.getShape(); + auto rhsShape = rhsType.getShape(); + RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]}, + lhsType.getElementType()); + getResult().setType(res_type); +} + +llvm::LogicalResult MatMulOp::verify() { + auto lhsType = llvm::dyn_cast(getLhs().getType()); + auto rhsType = llvm::dyn_cast(getRhs().getType()); + auto resultType = llvm::dyn_cast(getType()); + + if (!lhsType || !rhsType || !resultType) + return mlir::success(); + + auto lhsShape = lhsType.getShape(); + auto rhsShape = rhsType.getShape(); + + if (lhsShape.size() != 2 || rhsShape.size() != 2) { + return emitOpError() << "expected 2D matrix"; + } + + if (lhsShape[1] != rhsShape[0]) { + return emitOpError() << "expected dimension to match" + << "the shape of lhs is [" << lhsShape[0] << ", " + << lhsShape[1] << "] " + << "the shape of rhs is [" << rhsShape[0] << ", " + << rhsShape[1] << "] " + << "but the dimension " << lhsShape[1] + << "!=" << rhsShape[0] << '\n'; + } + + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// LaunchGpuOp +//===----------------------------------------------------------------------===// + +void LaunchGpuOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands(arguments); + state.addAttribute("callee", + mlir::SymbolRefAttr::get(builder.getContext(), callee)); + state.addAttribute("grid", builder.getI64ArrayAttr({1, 1, 1})); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. 
+CallInterfaceCallable LaunchGpuOp::getCallableForCallee() { + return (*this)->getAttrOfType("callee"); +} + +/// Set the callee for the generic call operation, this is required by the call +/// interface. +void LaunchGpuOp::setCalleeFromCallable(CallInterfaceCallable callee) { + (*this)->setAttr("callee", cast(callee)); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. +Operation::operand_range LaunchGpuOp::getArgOperands() { return getInputs(); } + +/// Get the argument operands to the called function as a mutable range, this is +/// required by the call interface. +MutableOperandRange LaunchGpuOp::getArgOperandsMutable() { + return getInputsMutable(); +} + +//===----------------------------------------------------------------------===// +// GPUFuncOp +//===----------------------------------------------------------------------===// + +void GPUFuncOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + llvm::StringRef name, mlir::FunctionType type, + llvm::ArrayRef attrs) { + // FunctionOpInterface provides a convenient `build` method that will populate + // the state of our GPUFuncOp, and create an entry block. + buildWithEntryBlock(builder, state, name, type, attrs, type.getInputs()); +} + +mlir::ParseResult GPUFuncOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + // Dispatch to the FunctionOpInterface provided utility method that parses the + // function operation. 
+ auto buildFuncType = + [](mlir::Builder &builder, llvm::ArrayRef argTypes, + llvm::ArrayRef results, + mlir::function_interface_impl::VariadicFlag, + std::string &) { return builder.getFunctionType(argTypes, results); }; + + return mlir::function_interface_impl::parseFunctionOp( + parser, result, /*allowVariadic=*/false, + getFunctionTypeAttrName(result.name), buildFuncType, + getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); +} + +void GPUFuncOp::print(mlir::OpAsmPrinter &p) { + // Dispatch to the FunctionOpInterface provided utility method that prints the + // function operation. + mlir::function_interface_impl::printFunctionOp( + p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(), + getArgAttrsAttrName(), getResAttrsAttrName()); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp b/mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp new file mode 100644 index 0000000..2baf7a0 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp @@ -0,0 +1,196 @@ +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +#include "cuda_tile/Bytecode/Writer/BytecodeWriter.h" +#include "cuda_tile/Dialect/CudaTile/IR/Ops.h" +#include "toy/Dialect.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; +using namespace mlir; + +namespace { + +/// Read file contents as raw bytes. 
+static FailureOr> readFileBytes(StringRef path) { + auto bufOrErr = MemoryBuffer::getFile(path, /*IsText=*/false); + if (!bufOrErr) + return failure(); + auto &buf = *bufOrErr.get(); + std::vector out(buf.getBufferSize()); + memcpy(out.data(), buf.getBufferStart(), buf.getBufferSize()); + return out; +} + +/// Write raw bytes to a file. +static LogicalResult writeFileBytes(StringRef path, ArrayRef bytes) { + std::error_code ec; + raw_fd_ostream os(path, ec, sys::fs::OF_None); + if (ec) + return failure(); + os.write(bytes.data(), bytes.size()); + os.flush(); + return success(); +} + +/// Execute external tileiras to assemble tilebc into a binary. +static LogicalResult runTileIRAS(Operation *anchor, StringRef tileirasExe, + StringRef gpuName, StringRef inTilebc, + StringRef outBin) { + SmallVector args; + args.push_back(tileirasExe); + args.push_back("--gpu-name"); + args.push_back(gpuName); + args.push_back(inTilebc); + args.push_back("-o"); + args.push_back(outBin); + + std::string errMsg; + int rc = sys::ExecuteAndWait(tileirasExe, args, + /*env=*/std::nullopt, + /*redirects=*/{}, + /*secondsToWait=*/0, + /*memoryLimit=*/0, &errMsg); + if (rc != 0) { + return anchor->emitError() << "tileiras failed, rc=" << rc << "\n" + << errMsg; + } + return success(); +} + +std::error_code createTemporaryFile(SmallVectorImpl &inPath, + StringRef prefix, StringRef suffix) { + int inFD = -1; + if (std::error_code ec = + sys::fs::createTemporaryFile(prefix, suffix, inFD, inPath)) { + return ec; + } + + if (std::error_code ec = sys::fs::closeFile(inFD)) { + return ec; + } + return std::error_code(); +} + +struct EmbedCudaTileBinaryPass + : public PassWrapper> { + + std::string tileirasExe; + std::string gpuName; + + EmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName) + : tileirasExe(std::move(tileirasExe)), gpuName(std::move(gpuName)) {} + + void runOnOperation() override { + ModuleOp top = getOperation(); + MLIRContext *ctx = top.getContext(); + + 
SmallString<256> cudaBinPath; + + top.walk([&](Operation *op) { + // we assume the MLIR only have one cuda tile module. + if (op->getName().getStringRef() != "cuda_tile.module") + return; + + auto cudaMod = dyn_cast(op); + if (!cudaMod) + return; + + // ---- Step B: generate tilebc bytes in-process ---- + SmallVector tilebcBytes; + raw_svector_ostream tilebcOS(tilebcBytes); + + // Using writeBytecode API: writeBytecode(output, moduleOp, + // BytecodeVersion::kCurrentVersion) + if (failed(writeBytecode(tilebcOS, cudaMod, + cuda_tile::BytecodeVersion::kCurrentVersion))) { + op->emitError() << "writeBytecode(tilebc) failed"; + signalPassFailure(); + return; + } + + // ---- Step C: create temp files and invoke tileiras ---- + SmallString<256> inPath; + + if (std::error_code ec = + createTemporaryFile(inPath, "cuda_tile", "tilebc")) { + op->emitError() << "failed to create temp in tilebc: " << ec.message(); + signalPassFailure(); + return; + } + + if (std::error_code ec = + createTemporaryFile(cudaBinPath, "cuda_tile", "bin")) { + op->emitError() << "failed to create temp out bin: " << ec.message(); + signalPassFailure(); + return; + } + + if (failed(writeFileBytes(inPath, tilebcBytes))) { + op->emitError() << "failed to write temp tilebc"; + signalPassFailure(); + return; + } + + if (failed(runTileIRAS(op, tileirasExe, gpuName, inPath, cudaBinPath))) { + signalPassFailure(); + return; + } + }); + + top->walk([&](toy::LaunchGpuOp launchOp) { + // ---- Step D: read cuda binary bytes ---- + auto binBytesOrErr = readFileBytes(cudaBinPath); + if (failed(binBytesOrErr)) { + launchOp.emitError() << "failed to read cuda binary file"; + signalPassFailure(); + return; + } + auto binBytes = *binBytesOrErr; + + // ---- Step E: embed binary as LaunchGpuOp attributes ---- + llvm::SmallVector binU8Bytes; + binU8Bytes.reserve(binBytes.size()); + for (auto b : binBytes) + binU8Bytes.push_back(static_cast(b)); + + auto byteAttr = mlir::DenseIntElementsAttr::get( + 
mlir::RankedTensorType::get({static_cast(binU8Bytes.size())}, + mlir::IntegerType::get(ctx, 8)), + binU8Bytes); + + // launchOp->setAttr("cuda_binary", byteAttr); + launchOp->setAttr("cuda_binary_size", + mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 64), + binU8Bytes.size())); + launchOp->setAttr("cuda_binary_path", + mlir::StringAttr::get(ctx, cudaBinPath.str())); + launchOp->setAttr("cuda_arch", mlir::StringAttr::get(ctx, gpuName)); + }); + + // ---- Step F: Delete the cuda_tile.module ops ---- + llvm::SmallVector toErase; + top->walk([&](cuda_tile::ModuleOp op) { toErase.push_back(op); }); + + for (auto op : toErase) { + op->erase(); + } + }; +}; +} // namespace + +namespace mlir::toy { + +std::unique_ptr +createEmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName) { + return std::make_unique(tileirasExe, gpuName); +}; + +}; // namespace mlir::toy diff --git a/mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp b/mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp new file mode 100644 index 0000000..3fc59c0 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp @@ -0,0 +1,459 @@ +//====- LowerToAffineLoops.cpp - Partial lowering from Toy to Affine+Std --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops, memref operations and standard operations. This lowering +// expects that all calls have been inlined, and all shapes have been resolved. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns +//===----------------------------------------------------------------------===// + +/// Convert the given RankedTensorType into the corresponding MemRefType. +static MemRefType convertTensorToMemRef(RankedTensorType type) { + return MemRefType::get(type.getShape(), type.getElementType()); +} + +/// Insert an allocation and deallocation for the given MemRefType. +static Value insertAllocAndDealloc(MemRefType type, Location loc, + PatternRewriter &rewriter) { + auto alloc = memref::AllocOp::create(rewriter, loc, type); + + // Make sure to allocate at the beginning of the block. + auto *parentBlock = alloc->getBlock(); + alloc->moveBefore(&parentBlock->front()); + + // Make sure to deallocate this alloc at the end of the block. This is fine + // as toy functions have no control flow. 
+ auto dealloc = memref::DeallocOp::create(rewriter, loc, alloc); + dealloc->moveBefore(&parentBlock->back()); + return alloc; +} + +/// This defines the function type used to process an iteration of a lowered +/// loop. It takes as input an OpBuilder and the range of loop induction +/// variables for the iteration. It returns a value to store at the current +/// index of the iteration. +using LoopIterationFn = + function_ref; + +static void lowerOpToLoops(Operation *op, PatternRewriter &rewriter, + LoopIterationFn processIteration) { + auto tensorType = llvm::cast((*op->result_type_begin())); + auto loc = op->getLoc(); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // Create a nest of affine loops, with one loop per dimension of the shape. + // The buildAffineLoopNest function takes a callback that is used to construct + // the body of the innermost loop given a builder, a location and a range of + // loop induction variables. + SmallVector lowerBounds(tensorType.getRank(), /*Value=*/0); + SmallVector steps(tensorType.getRank(), /*Value=*/1); + affine::buildAffineLoopNest( + rewriter, loc, lowerBounds, tensorType.getShape(), steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + // Call the processing function with the rewriter and the loop + // induction variables. This function will return the value to store at + // the current index. + Value valueToStore = processIteration(nestedBuilder, ivs); + affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc, + ivs); + }); + + // Replace this operation with the generated alloc. 
+ rewriter.replaceOp(op, alloc); +} + +namespace { +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + using OpAdaptor = typename OpConversionPattern::OpAdaptor; + + LogicalResult + matchAndRewrite(BinaryOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops(op, rewriter, [&](OpBuilder &builder, ValueRange loopIvs) { + // Generate loads for the element of 'lhs' and 'rhs' at the + // inner loop. + auto loadedLhs = + affine::AffineLoadOp::create(builder, loc, adaptor.getLhs(), loopIvs); + auto loadedRhs = + affine::AffineLoadOp::create(builder, loc, adaptor.getRhs(), loopIvs); + + // Create the binary operation performed on the loaded + // values. + return LoweredBinaryOp::create(builder, loc, loadedLhs, loadedRhs); + }); + return success(); + } +}; +using AddOpLowering = BinaryOpLowering; +using MulOpLowering = BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Constant operations +//===----------------------------------------------------------------------===// + +struct ConstantOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::ConstantOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + DenseElementsAttr constantValue = op.getValue(); + Location loc = op.getLoc(); + + // When lowering the constant operation, we allocate and assign the constant + // values to a corresponding memref allocation. 
+ auto tensorType = llvm::cast(op.getType()); + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // We will be generating constant indices up-to the largest dimension. + // Create these constants up-front to avoid large amounts of redundant + // operations. + auto valueShape = memRefType.getShape(); + SmallVector constantIndices; + + if (!valueShape.empty()) { + for (auto i : llvm::seq(0, *llvm::max_element(valueShape))) + constantIndices.push_back( + arith::ConstantIndexOp::create(rewriter, loc, i)); + } else { + // This is the case of a tensor of rank 0. + constantIndices.push_back( + arith::ConstantIndexOp::create(rewriter, loc, 0)); + } + + // The constant operation represents a multi-dimensional constant, so we + // will need to generate a store for each of the elements. The following + // functor recursively walks the dimensions of the constant shape, + // generating a store when the recursion hits the base case. + SmallVector indices; + auto valueIt = constantValue.value_begin(); + std::function storeElements = [&](uint64_t dimension) { + // The last dimension is the base case of the recursion, at this point + // we store the element at the given index. + if (dimension == valueShape.size()) { + affine::AffineStoreOp::create( + rewriter, loc, arith::ConstantOp::create(rewriter, loc, *valueIt++), + alloc, llvm::ArrayRef(indices)); + return; + } + + // Otherwise, iterate over the current dimension and add the indices to + // the list. + for (uint64_t i = 0, e = valueShape[dimension]; i != e; ++i) { + indices.push_back(constantIndices[i]); + storeElements(dimension + 1); + indices.pop_back(); + } + }; + + // Start the element storing recursion from the first dimension. + storeElements(/*dimension=*/0); + + // Replace this operation with the generated alloc. 
+ rewriter.replaceOp(op, alloc); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Func operations +//===----------------------------------------------------------------------===// + +struct FuncOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::FuncOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // We only lower the main function as we expect that all other functions + // have been inlined. + if (op.getName() != "main") + return failure(); + + // Verify that the given main has no inputs and results. + if (op.getNumArguments() || op.getFunctionType().getNumResults()) { + return rewriter.notifyMatchFailure(op, [](Diagnostic &diag) { + diag << "expected 'main' to have 0 inputs and 0 results"; + }); + } + + // Create a new non-toy function, with the same region. + auto func = mlir::func::FuncOp::create(rewriter, op.getLoc(), op.getName(), + op.getFunctionType()); + rewriter.inlineRegionBefore(op.getRegion(), func.getBody(), func.end()); + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Print operations +//===----------------------------------------------------------------------===// + +struct PrintOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::PrintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // We don't lower "toy.print" in this pass, but we need to update its + // operands. 
+ rewriter.modifyOpInPlace(op, + [&] { op->setOperands(adaptor.getOperands()); }); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::ReturnOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // During this lowering, we expect that all function calls have been + // inlined. + if (op.hasOperand()) + return failure(); + + // We lower "toy.return" directly to "func.return". + rewriter.replaceOpWithNewOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Transpose operations +//===----------------------------------------------------------------------===// + +struct TransposeOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::TransposeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops(op, rewriter, [&](OpBuilder &builder, ValueRange loopIvs) { + Value input = adaptor.getInput(); + + // Transpose the elements by generating a load from the + // reverse indices. 
+ SmallVector reverseIvs(llvm::reverse(loopIvs)); + return affine::AffineLoadOp::create(builder, loc, input, reverseIvs); + }); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: MatMul operations +//===----------------------------------------------------------------------===// + +struct MatMulOpLowering : public ConversionPattern { + MatMulOpLowering(MLIRContext *ctx) + : ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + + RankedTensorType lhsType = + llvm::dyn_cast(op->getOperand(0).getType()); + RankedTensorType rhsType = + llvm::dyn_cast(op->getOperand(1).getType()); + auto lhsShape = lhsType.getShape(); + auto rhsShape = rhsType.getShape(); + + auto tensorType = + llvm::dyn_cast((*op->result_type_begin())); + + auto elemType = llvm::dyn_cast(tensorType.getElementType()); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + SmallVector lowerBounds(tensorType.getRank() + 1, /*Value=*/0); + SmallVector steps(tensorType.getRank() + 1, /*Value=*/1); + SmallVector upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]}; + + // add initialization of result tensor. + // Create a nest of affine loops to initialize the result tensor to 0. + affine::buildAffineLoopNest( + rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1}, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + // Create a constant float value of 0.0. + auto valueToStore = arith::ConstantFloatOp::create( + nestedBuilder, loc, elemType, + llvm::APFloat::getZero(elemType.getFloatSemantics())); + + // Store the constant value into the allocated memory. 
+ affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc, + ivs); + }); + + // Create a nest of affine loops for matrix multiplication. + affine::buildAffineLoopNest( + rewriter, loc, lowerBounds, upperBounds, steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + // Extract loop induction variables. + Value m = ivs[0]; + Value k = ivs[1]; + Value n = ivs[2]; + + // Create an adaptor for the remapped operands of the MatMulOp. + toy::MatMulOpAdaptor matmulAdaptor(operands); + + // Load elements from the left-hand side and right-hand side matrices. + auto loadedLhs = affine::AffineLoadOp::create( + nestedBuilder, loc, matmulAdaptor.getLhs(), ValueRange{m, k}); + + auto loadedRhs = affine::AffineLoadOp::create( + nestedBuilder, loc, matmulAdaptor.getRhs(), ValueRange{k, n}); + // Load elements from the result tensor from initial process above. + auto loadedRes = affine::AffineLoadOp::create( + nestedBuilder, loc, alloc, ValueRange{m, n}); + + // Perform the multiplication and addition operations. + auto mulop = + arith::MulFOp::create(nestedBuilder, loc, loadedLhs, loadedRhs); + auto valueToStore = + arith::AddFOp::create(nestedBuilder, loc, loadedRes, mulop); + + // Store the result back into the allocated memory. + affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc, + ValueRange{m, n}); + }); + + // Replace this operation with the generated alloc. + rewriter.replaceOp(op, alloc); + + return success(); + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// ToyToAffineLoweringPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering to affine loops of the toy operations that are +/// computationally intensive (like matmul for example...) while keeping the +/// rest of the code in the Toy dialect. 
+namespace { +struct ToyToAffineLoweringPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToAffineLoweringPass) + StringRef getArgument() const override { return "toy-to-affine"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; +} // namespace + +void ToyToAffineLoweringPass::runOnOperation() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to a combination of the + // `Affine`, `Arith`, `Func`, and `MemRef` dialects. + target.addLegalDialect(); + + // We also define the Toy dialect as Illegal so that the conversion will fail + // if any of these operations are *not* converted. Given that we actually want + // a partial lowering, we explicitly mark the Toy operations that don't want + // to lower, `toy.print`, as `legal`. `toy.print` will still need its operands + // to be updated though (as we convert from TensorType to MemRefType), so we + // only treat it as `legal` if its operands are legal. + target.addIllegalDialect(); + target.addDynamicallyLegalOp([](toy::PrintOp op) { + return llvm::none_of(op->getOperandTypes(), + [](Type type) { return llvm::isa(type); }); + }); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. 
+ if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); +} + +/// Create a pass for lowering operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr mlir::toy::createLowerToAffinePass() { + return std::make_unique(); +} diff --git a/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp b/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp new file mode 100644 index 0000000..58e59d9 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp @@ -0,0 +1,537 @@ +#include "cuda_tile/Dialect/CudaTile/IR/Types.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "mlir/Transforms/DialectConversion.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/LogicalResult.h" + +#include "cuda_tile/Dialect/CudaTile/IR/Dialect.h" +#include "cuda_tile/Dialect/CudaTile/IR/Ops.h" + +#include +#include +#include +#include + +#define DEBUG_TYPE "toy-to-cuda-tile" + +void debugPrintShape(mlir::ArrayRef shape, + llvm::StringRef prefix = "") { + std::string shapeStr; + llvm::raw_string_ostream shapeOS(shapeStr); + shapeOS << "["; + llvm::interleaveComma(shape, shapeOS); + shapeOS << "]"; + shapeOS.flush(); + LDBG() << prefix << shapeStr; +} + +mlir::cuda_tile::MakeTensorViewOp +makeTensorViewForArg(mlir::OpBuilder &rewriter, mlir::Location loc, + mlir::Value 
arg, mlir::ArrayRef shape) { + auto resultType = rewriter.getI64ArrayAttr(shape); + LDBG() << "shape: " << resultType; + auto ptrElem = + llvm::dyn_cast(arg.getType()).getElementType(); + auto eleType = + llvm::dyn_cast(ptrElem).getPointeeType(); + mlir::cuda_tile::TensorViewType tensorViewType = + mlir::cuda_tile::TensorViewType::get(rewriter.getContext(), eleType, + shape, + /*strides=*/{shape.back(), 1}); + // LDBG() << "Creating TensorViewType: " << tensorViewType; + auto make_tensor_view = mlir::cuda_tile::MakeTensorViewOp::create( + rewriter, loc, tensorViewType, arg, + /*dynamicShape=*/mlir::ValueRange{}, + /*dynamicStrides=*/mlir::ValueRange{}); + return make_tensor_view; +} + +int64_t alignPower2(int x) { + int64_t power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +static bool isFromFuncArg(mlir::Value v) { + if (auto barg = llvm::dyn_cast(v)) { + return true; + } + return false; +} + +static std::optional insertUnrealizedConversionCastOp( + mlir::Value opv, mlir::Value v, mlir::RankedTensorType logical, + mlir::PatternRewriter &rewriter, mlir::Location loc) { + auto tileTy = llvm::dyn_cast(v.getType()); + if (!tileTy) + return v; + + auto elemTy = tileTy.getElementType(); + auto ptrTy = llvm::dyn_cast(elemTy); + if (!ptrTy) + return v; + + if (!isFromFuncArg(opv)) { + auto logicalTy = llvm::cast(opv.getType()); + auto alignedI32 = llvm::to_vector<4>( + llvm::map_range(logicalTy.getShape(), [](int64_t dim) { + return static_cast(alignPower2(dim)); + })); + llvm::SmallVector alignedShape; + alignedShape.reserve(alignedI32.size()); + for (int32_t dim : alignedI32) + alignedShape.push_back(static_cast(dim)); + mlir::cuda_tile::TileType resultTileTy = + mlir::cuda_tile::TileType::get(alignedShape, ptrTy.getPointeeType()); + + // Here the TypeConverter will change the toy.add result type to + // tile>, but we actually need tile<...xf32> to do computation. so + // we need to insert a cast here. 
if we don't do this, the + // `UnrealizedConversionCastOp` will be automatically inserted later during + // conversion. like: %10 = "builtin.unrealized_conversion_cast"(%9) + // {__pure_type_conversion__} + // : (!cuda_tile.tile<2x4xf32>) -> + // !cuda_tile.tile> + // since the TypeConverter can not know which input is from function arg or + // not. so, here we do the cast manually to delete those cast Op since the + // `cuda_tile.add` can accept tile<...xf32> directly if args is not from the + // block arguments. + mlir::UnrealizedConversionCastOp castOp = + mlir::UnrealizedConversionCastOp::create(rewriter, loc, {resultTileTy}, + v); + return castOp.getResult(0); + } + + return std::nullopt; // no need to insert cast +} + +static mlir::cuda_tile::MakePartitionViewOp +makePartitionViewForArg(mlir::PatternRewriter &rewriter, mlir::Location loc, + mlir::Value v, mlir::RankedTensorType logical) { + // 1) make_tensor_view from tile> + auto tensorView = makeTensorViewForArg(rewriter, loc, v, logical.getShape()); + + // 2) 创建 partition_view,tile 形状使用 2 的幂对齐 + auto alignedI32 = + llvm::to_vector<4>(llvm::map_range(logical.getShape(), [](int64_t dim) { + return static_cast(alignPower2(dim)); + })); + auto partViewTy = mlir::cuda_tile::PartitionViewType::get( + rewriter.getContext(), rewriter.getDenseI32ArrayAttr(alignedI32), + llvm::dyn_cast( + tensorView->getResult(0).getType()), + /*partitions=*/{0, 1}, {}); + auto partView = mlir::cuda_tile::MakePartitionViewOp::create( + rewriter, loc, partViewTy, tensorView); + return partView; +} + +static mlir::Value ensureTileValue(mlir::Value opv, mlir::Value v, + mlir::RankedTensorType logical, + mlir::PatternRewriter &rewriter) { + auto loc = v.getLoc(); + auto maybeCastV = + insertUnrealizedConversionCastOp(opv, v, logical, rewriter, loc); + if (maybeCastV.has_value()) { + return maybeCastV.value(); + } + + auto alignedI32 = + llvm::to_vector<4>(llvm::map_range(logical.getShape(), [](int64_t dim) { + return 
static_cast(alignPower2(dim)); + })); + + auto partView = makePartitionViewForArg(rewriter, loc, v, logical); + + // 3) 准备索引常量和 load + auto i32TileTy = mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()); + auto zeroAttr = + mlir::DenseIntElementsAttr::get(i32TileTy, llvm::ArrayRef{0}); + auto zeroIdx = + mlir::cuda_tile::ConstantOp::create(rewriter, loc, i32TileTy, zeroAttr); + + auto memOrd = mlir::cuda_tile::MemoryOrderingSemanticsAttr::get( + rewriter.getContext(), mlir::cuda_tile::MemoryOrderingSemantics::WEAK); + auto tokenTy = mlir::cuda_tile::TokenType::get(rewriter.getContext()); + + // auto memory_ordering_attr = + // mlir::cuda_tile::MemoryOrderingSemanticsAttr::get( + // rewriter.getContext(), + // mlir::cuda_tile::MemoryOrderingSemantics::WEAK); + + auto tensorViewTy = llvm::cast( + partView.getTensorView().getType()); + LDBG() << "TensorViewType for LoadViewTkoOp: " << tensorViewTy; + llvm::SmallVector alignedLoadShape(alignedI32.begin(), + alignedI32.end()); + debugPrintShape(alignedLoadShape); + + auto resTileTy = mlir::cuda_tile::TileType::get( + {alignedLoadShape.begin(), alignedLoadShape.end()}, + tensorViewTy.getElementType()); + auto load = mlir::cuda_tile::LoadViewTkoOp::create( + rewriter, loc, {resTileTy, tokenTy}, memOrd, {}, partView, + mlir::ValueRange{zeroIdx, zeroIdx}, {}, {}); + + return load.getResult(0); +} + +static mlir::Value ensureStoreValue(mlir::Value opv, mlir::Value v, + mlir::RankedTensorType logical, + mlir::PatternRewriter &rewriter) { + auto loc = v.getLoc(); + auto castOp = + insertUnrealizedConversionCastOp(opv, v, logical, rewriter, loc); + if (castOp.has_value()) { + return castOp.value(); + } + return v; +} + +//===----------------------------------------------------------------------===// +// 1) TypeConverter: tensor<...xf32> -> tile> (plus we will create +// views) +//===----------------------------------------------------------------------===// +struct ToyToCudaTileTypeConverter : public mlir::TypeConverter 
{ + ToyToCudaTileTypeConverter(mlir::MLIRContext *ctx) { + addConversion([](mlir::Type t) { return t; }); // identity for others + + addConversion([&](mlir::RankedTensorType t) -> mlir::Type { + // Example: only handle f32 ranked tensor for now. + auto elemTy = llvm::dyn_cast(t.getElementType()); + if (!elemTy || elemTy.getWidth() != 32) + return {}; + + auto ptrElem = mlir::cuda_tile::PointerType::get(elemTy); + auto newType = mlir::cuda_tile::TileType::get({}, ptrElem); + + // tile> : the exact spelling depends on your cuda_tile dialect + // types. + return newType; + }); + + // Important: if you have tensor results too, you need a materialization + // strategy. e.g. create temporary buffers and store into them, or return + // ptr to output. + } +}; + +//===----------------------------------------------------------------------===// +// 2) Pattern: toy.gpu_func -> create cuda_tile.module entry +//===----------------------------------------------------------------------===// +struct LowerToyGPUFuncToCudaTileEntry + : public mlir::OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(mlir::toy::GPUFuncOp op, + mlir::toy::GPUFuncOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + // Find / create cuda_tile.module container (you can also create once in + // pass) + auto moduleOp = op->getParentOfType(); + mlir::cuda_tile::ModuleOp cudaMod; + for (auto m : moduleOp.getOps()) { + cudaMod = m; + break; + } + + if (!cudaMod) { + rewriter.setInsertionPointToEnd(moduleOp.getBody()); + cudaMod = mlir::cuda_tile::ModuleOp::create(rewriter, op.getLoc(), + "cuda_tile_module"); + } + LDBG() << "Found / Created CudaTile Module: \n" << cudaMod; + + llvm::SmallVector entryArgTys; + llvm::SmallVector, 4> entryArgShapes; + for (auto t : op.getFunctionType().getInputs()) { + auto ct = getTypeConverter()->convertType(t); + if (!ct) + return rewriter.notifyMatchFailure(op, "cannot convert arg 
type"); + LDBG() << "Converted arg type: " << ct; + entryArgTys.push_back(ct); + auto rt = llvm::dyn_cast(t); + entryArgShapes.push_back(rt.getShape()); + } + + for (auto t : op.getFunctionType().getResults()) { + auto ct = getTypeConverter()->convertType(t); + if (!ct) + return rewriter.notifyMatchFailure(op, "cannot convert result type"); + LDBG() << "Converted result type: " << ct; + // Optionally, add as extra arg instead of return. + entryArgTys.push_back(ct); + auto rt = llvm::dyn_cast(t); + entryArgShapes.push_back(rt.getShape()); + } + auto newFnType = rewriter.getFunctionType(entryArgTys, {}); + + mlir::Block &bodyBlock = cudaMod.getBodyRegion().front(); + mlir::OpBuilder::InsertionGuard guard(rewriter); + + rewriter.setInsertionPointToStart(&bodyBlock); + + auto entry = mlir::cuda_tile::EntryOp::create( + rewriter, op.getLoc(), op.getSymName(), newFnType, + /*arg_attrs=*/{}, /*res_attrs=*/{}, {}); + + LDBG() << "CudaTile Module: \n" << cudaMod; + + auto *bb = entry.addEntryBlock(); + + rewriter.setInsertionPointToStart(bb); + // 1. create a get_tile_block_id op + auto tileBlockId = mlir::cuda_tile::GetTileBlockIdOp::create( + rewriter, op->getLoc(), + {mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()), + mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()), + mlir::cuda_tile::TileType::get({}, rewriter.getI32Type())}); + + llvm::SmallVector tensorViews; + + // for (auto [idx, arg] : llvm::enumerate(bb->getArguments())) { + // // 2. 
create a make_tensor_view op + // auto resultType = rewriter.getI64ArrayAttr(entryArgShapes[idx]); + // LDBG() << "Argument " << idx << " : " << arg << ", shape: " << + // resultType; auto ptrElem = + // llvm::dyn_cast(arg.getType()) + // .getElementType(); + // auto eleType = llvm::dyn_cast(ptrElem) + // .getPointeeType(); + // mlir::cuda_tile::TensorViewType tensorViewType = + // mlir::cuda_tile::TensorViewType::get( + // rewriter.getContext(), eleType, entryArgShapes[idx], + // /*strides=*/{entryArgShapes[idx].back(), 1}); + // // LDBG() << "Creating TensorViewType: " << tensorViewType; + // auto make_tensor_view = mlir::cuda_tile::MakeTensorViewOp::create( + // rewriter, op->getLoc(), tensorViewType, arg, + // /*dynamicShape=*/mlir::ValueRange{}, + // /*dynamicStrides=*/mlir::ValueRange{}); + // // LDBG() << "Created MakeTensorViewOp: \n" << make_tensor_view ; + // tensorViews.push_back(make_tensor_view.getResult()); + // } + for (auto [idx, arg] : llvm::enumerate(bb->getArguments())) { + tensorViews.push_back(arg); + } + + auto *srcBlock = &op.getBody().front(); + llvm::SmallVector argValues; + argValues.reserve(srcBlock->getNumArguments()); + for (unsigned i = 0; i < srcBlock->getNumArguments(); ++i) { + argValues.push_back(tensorViews[i]); + } + + auto *srcTerminator = srcBlock->getTerminator(); + rewriter.mergeBlocks(srcBlock, bb, argValues); + + auto retOp = mlir::cuda_tile::ReturnOp::create(rewriter, op.getLoc()); + + LDBG() << "Created CudaTile Entry Op: \n" << entry; + + // Erase old op. 
+ rewriter.eraseOp(op); + return mlir::success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToCudaTile Conversion Patterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public mlir::OpConversionPattern { + using mlir::OpConversionPattern::OpConversionPattern; + using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; + + llvm::LogicalResult + matchAndRewrite(BinaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + auto logical = + llvm::dyn_cast(op.getResult().getType()); + if (!logical) { + return rewriter.notifyMatchFailure(op, "result is not RankedTensorType"); + } + auto lhsLoaded = + ensureTileValue(op.getLhs(), adaptor.getLhs(), logical, rewriter); + auto rhsLoaded = + ensureTileValue(op.getRhs(), adaptor.getRhs(), logical, rewriter); + + LDBG() << "After ensureTileValue LHS: " << lhsLoaded; + LDBG() << "After ensureTileValue RHS: " << rhsLoaded; + + auto tileTy = lhsLoaded.getType(); + auto binOp = LoweredBinaryOp::create(rewriter, loc, tileTy, lhsLoaded, + rhsLoaded, {}); + rewriter.replaceOp(op, binOp.getResult()); + return llvm::success(); + } +}; +using AddOpLowering = + BinaryOpLowering; +using MulOpLowering = + BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToCudaTile Conversion Patterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnLowering : public mlir::OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(mlir::toy::ReturnOp op, mlir::toy::ReturnOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto outputPtr = op->getBlock()->getArguments().back(); + auto logical = + 
llvm::dyn_cast(op->getOperand(0).getType()); + if (!logical) { + return rewriter.notifyMatchFailure(op, "result is not RankedTensorType"); + } + auto retValLoaded = ensureStoreValue( + op.getOperand(0), adaptor.getOperands().front(), logical, rewriter); + LDBG() << "After ensureStoreValue RET: " << retValLoaded; + + auto partView = makePartitionViewForArg(rewriter, loc, outputPtr, logical); + + auto tkTy = mlir::cuda_tile::TokenType::get(rewriter.getContext()); + auto memoryOrd = mlir::cuda_tile::MemoryOrderingSemanticsAttr::get( + rewriter.getContext(), mlir::cuda_tile::MemoryOrderingSemantics::WEAK); + + auto i32TileTy = mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()); + auto zeroAttr = + mlir::DenseIntElementsAttr::get(i32TileTy, llvm::ArrayRef{0}); + auto zeroIdx = + mlir::cuda_tile::ConstantOp::create(rewriter, loc, i32TileTy, zeroAttr); + + auto storeOp = mlir::cuda_tile::StoreViewTkoOp::create( + rewriter, loc, {tkTy}, memoryOrd, {}, retValLoaded, partView, + mlir::ValueRange{zeroIdx, zeroIdx}, {}, {}); + + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToCudaTileLoweringPass +//===----------------------------------------------------------------------===// + +namespace { +struct ToyToCudaTileLoweringPass + : public mlir::PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToCudaTileLoweringPass) + + llvm::StringRef getArgument() const override { return "toy-to-cuda-tile"; } + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() final; +}; +}; // namespace + +mlir::cuda_tile::ModuleOp createCudaModuleOp(mlir::OpBuilder &builder, + mlir::ModuleOp &moduleOp) { + mlir::OpBuilder::InsertionGuard guard(builder); + + builder.setInsertionPoint(moduleOp.getBody(), moduleOp.getBody()->end()); + auto cudaTileModuleOp = mlir::cuda_tile::ModuleOp::create( + builder, 
moduleOp.getLoc(), "cuda_tile_module"); + + LDBG() << "Created CudaTile Module: \n" << cudaTileModuleOp; + return cudaTileModuleOp; +} + +void ToyToCudaTileLoweringPass::runOnOperation() { + auto moduleOp = getOperation(); + auto *ctx = moduleOp.getContext(); + mlir::ConversionTarget target(*ctx); + target.addLegalDialect(); + target.addLegalOp(); + + // Keep host-side toy.func/main legal (or lower it later). + target.addLegalOp(); + + target.addIllegalOp(); + + moduleOp.walk([&](mlir::toy::GPUFuncOp gfun) { + ToyToCudaTileTypeConverter typeConverter(ctx); + mlir::RewritePatternSet patterns(ctx); + + patterns.add(typeConverter, ctx); + + if (mlir::failed( + mlir::applyFullConversion(gfun, target, std::move(patterns)))) + signalPassFailure(); + }); + + // ------------------------------- + // auto moduleOp = getOperation(); + // auto *ctx = moduleOp.getContext(); + // // The first thing to define is the conversion target. This will define the + // // final target for this lowering. + // mlir::ConversionTarget target(*ctx); + + // target.addLegalDialect(); + // target.addLegalOp(); + + // // Keep host-side toy.func/main legal (or lower it later). 
+ // target.addLegalOp(); + + // target + // .addIllegalOp(); + + // ToyToCudaTileTypeConverter typeConv(&*ctx); + + // mlir::RewritePatternSet patterns(&*ctx); + // patterns.add(typeConv, &*ctx); + + // // TODO: add patterns for toy.transpose/toy.matmul/toy.add/toy.mul + // patterns.add(typeConv, ctx); + + // if (mlir::failed(mlir::applyPartialConversion(moduleOp, target, + // std::move(patterns)))) { + // signalPassFailure(); + // } + // ------------------------------- +} + +namespace mlir::toy { + +std::unique_ptr createCudaTileLoweringPass() { + return std::make_unique(); +}; + +}; // namespace mlir::toy diff --git a/mlir/cuda-tile/Toy/mlir/LowerToGpu.cpp b/mlir/cuda-tile/Toy/mlir/LowerToGpu.cpp new file mode 100644 index 0000000..cf2bb4b --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToGpu.cpp @@ -0,0 +1,287 @@ +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" + +#include +#include + +#define DEBUG_TYPE "toy-gpu-outline" + +namespace { + +static bool isGpuOperation(mlir::Operation *op, + const llvm::SmallSet &gpuOps) { + llvm::StringRef opName = op->getName().getStringRef().split('.').second; + return gpuOps.contains(opName); +} + +static llvm::SmallVector parseGrid(llvm::StringRef gridStr) { + llvm::SmallVector dims; + llvm::SmallVector pieces; + gridStr.split(pieces, ','); + for 
(llvm::StringRef piece : pieces) { + int64_t value = 0; + if (!piece.empty() && llvm::to_integer(piece.trim(), value)) + dims.push_back(value); + } + if (dims.size() != 3) + dims = {1, 1, 1}; + return dims; +} + +struct GpuOutlinePass + : public mlir::PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GpuOutlinePass) + + std::string grid{"1,1,1"}; + + llvm::StringRef getArgument() const override { return "toy-gpu-outline"; } + + void initializeOptions(std::string grid) { this->grid = grid; } + + void runOnOperation() override { + auto func = getOperation(); + if (func.getName() != "main") + return; + + llvm::SmallSet gpuOperations = {"matmul", "add", "mul", + "transpose"}; + + // // Collect GPU-eligible ops in block order for deterministic cloning. + // llvm::SmallDenseSet gpuOpSet; + // llvm::SmallVector gpuOps; + + // for (mlir::Operation &op : func.front()) { + // if (isGpuOperation(&op, gpuOperations)) { + // gpuOpSet.insert(&op); + // gpuOps.push_back(&op); + // } + // } + + // if (gpuOps.empty()) + // return; + + llvm::SmallVector gridDims = parseGrid(grid); + + llvm::SmallVector> gpuSubgraphs; + + // Find a gpu subgraph like + // [[gpuOps, ...], [gpuOps, ...], ...] 
+ // original sequence: + // [..., non-gpu-op, [gpu-op, gpu-op], non-gpu-op, [gpu-op, ...]] + func.walk([&](mlir::Operation *op) { + if (isGpuOperation(op, gpuOperations)) { + if (gpuSubgraphs.empty()) { + gpuSubgraphs.push_back({op}); + } else { + gpuSubgraphs.back().push_back(op); + } + } else { + if (gpuSubgraphs.empty()) { + gpuSubgraphs.push_back({}); + } else if (!gpuSubgraphs.back().empty()) { + gpuSubgraphs.push_back({}); + } + } + }); + + if (gpuSubgraphs.empty()) + return; + + bool allEmpty = llvm::all_of( + gpuSubgraphs, [](const llvm::SmallVector &sg) { + return sg.empty(); + }); + + if (allEmpty) + return; + + if (gpuSubgraphs.back().empty()) { + gpuSubgraphs.pop_back(); + } + + for (const auto &gpuSubgraph : gpuSubgraphs) { + LDBG() << "----GPU subgraph----"; + for (const auto &op : gpuSubgraph) { + LDBG() << *op; + } + LDBG() << "--------------------"; + } + + llvm::SmallVector outlinedFuncNames; + llvm::SmallVector insertPoints; + + // the logic to outline each gpu subgraph + // 1. find operands or input for the subgraph (exclude the input inside + // subgraph). + // 2. find results or output for the subgraph (exclude the output inside + // subgraph). + // 3. create a new function with operands as input and results as output. + // 4. insert a LaunchGpuOp to call the outlined function at the insert point + + for (const auto &[index, gpuSubgraph] : llvm::enumerate(gpuSubgraphs)) { + if (!gpuSubgraph.empty()) { + LDBG() << "----GPU subgraph----"; + for (const auto &op : gpuSubgraph) { + LDBG() << *op; + } + + // Identify its operands. 
+ llvm::SmallVector Operands; + llvm::SmallPtrSet OperandSet; + for (mlir::Operation *op : gpuSubgraph) { + for (mlir::Value operand : op->getOperands()) { + auto *def = operand.getDefiningOp(); + if (!def || !isGpuOperation(def, gpuOperations)) { + if (OperandSet.insert(operand).second) + Operands.push_back(operand); + } + } + } + + LDBG() << "Operands:"; + for (mlir::Value &operand : Operands) { + LDBG() << " " << operand; + } + + llvm::SmallVector Results; + llvm::SmallPtrSet ResultSet; + + for (mlir::Operation *op : gpuSubgraph) { + for (mlir::Value result : op->getResults()) { + bool escapes = + llvm::any_of(result.getUsers(), [&](mlir::Operation *user) { + return !isGpuOperation(user, gpuOperations); + }); + if (escapes && ResultSet.insert(result).second) + Results.push_back(result); + } + } + + LDBG() << "Results:"; + for (mlir::Value &result : Results) { + LDBG() << " " << result; + } + + if (Results.size() != 1) { + llvm::errs() + << "Currently only support single result GPU kernel " + << "Since the toy return op only supports single return value " + << "Found " << Results.size() << " results"; + return signalPassFailure(); + } + + // buid the kernel for each subgraph + llvm::SmallVector argTypes; + argTypes.reserve(Operands.size()); + for (mlir::Value v : Operands) + argTypes.push_back(v.getType()); + + llvm::SmallVector resultTypes; + resultTypes.reserve(Results.size()); + for (mlir::Value v : Results) + resultTypes.push_back(v.getType()); + + mlir::ModuleOp module = func->getParentOfType(); + mlir::SymbolTable symbolTable(module); + std::string outline_func_name = + "outlined_gpu_kernel_" + std::to_string(index); + + unsigned suffix = 0; + while (symbolTable.lookup(outline_func_name)) + outline_func_name = + outline_func_name + "_" + std::to_string(++suffix); + + insertPoints.push_back(gpuSubgraph.front()); + + { + mlir::OpBuilder moduleBuilder(module.getContext()); + mlir::OpBuilder::InsertionGuard guard(moduleBuilder); + 
moduleBuilder.setInsertionPointToEnd(module.getBody()); + auto funcType = moduleBuilder.getFunctionType(argTypes, resultTypes); + auto gpuFunc = mlir::toy::GPUFuncOp::create( + moduleBuilder, func.getLoc(), outline_func_name, funcType); + + mlir::Block &kernelEntry = gpuFunc.getBody().front(); + mlir::OpBuilder kernelBuilder = + mlir::OpBuilder::atBlockEnd(&kernelEntry); + + mlir::IRMapping mapping; + for (auto [blockArg, captured] : + llvm::zip(kernelEntry.getArguments(), Operands)) + mapping.map(captured, blockArg); + + for (mlir::Operation *op : gpuSubgraph) { + kernelBuilder.clone(*op, mapping); + } + llvm::SmallVector mappedResults; + mappedResults.reserve(Results.size()); + for (mlir::Value res : Results) + mappedResults.push_back(mapping.lookup(res)); + mlir::toy::ReturnOp::create(kernelBuilder, func.getLoc(), + mappedResults); + + LDBG() << "Created GPU kernel: " << gpuFunc; + } + + outlinedFuncNames.push_back(outline_func_name); + + { + mlir::OpBuilder hostBuilder(func.getContext()); + mlir::OpBuilder::InsertionGuard guard(hostBuilder); + // Insert the host launch in place of the first outlined op. 
+ hostBuilder.setInsertionPoint(gpuSubgraph.back()->getNextNode()); + + auto calleeAttr = mlir::SymbolRefAttr::get( + func.getContext(), llvm::StringRef(outline_func_name)); + + auto gridAttr = hostBuilder.getDenseI64ArrayAttr(gridDims); + + auto launch = mlir::toy::LaunchGpuOp::create( + hostBuilder, func.getLoc(), resultTypes, Operands, + {{"callee", calleeAttr}, {"grid", gridAttr}}); + + for (auto [idx, res] : llvm::enumerate(Results)) + res.replaceAllUsesWith(launch.getResult(idx)); + + for (mlir::Operation *op : llvm::reverse(gpuSubgraph)) + op->erase(); + LDBG() << "Inserted LaunchGpuOp: " << launch; + } + LDBG() << "--------------------"; + } + } + }; +}; +}; // namespace + +namespace mlir::toy { + +std::unique_ptr createGpuOutlinePass(std::string grid) { + auto pass = std::make_unique(); + pass->initializeOptions(grid); // You can change the grid dimensions here + return pass; +}; + +}; // namespace mlir::toy diff --git a/mlir/cuda-tile/Toy/mlir/LowerToLLVM.cpp b/mlir/cuda-tile/Toy/mlir/LowerToLLVM.cpp new file mode 100644 index 0000000..ad6c5bb --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToLLVM.cpp @@ -0,0 +1,248 @@ +//====- LowerToLLVM.cpp - Lowering from Toy+Affine+Std to LLVM ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements full lowering of Toy operations to LLVM MLIR dialect. +// 'toy.print' is lowered to a loop nest that calls `printf` on each element of +// the input array. The file also sets up the ToyToLLVMLoweringPass. 
This pass +// lowers the combination of Arithmetic + Affine + SCF + Func dialects to the +// LLVM one: +// +// Affine -- +// | +// v +// Arithmetic + Func --> LLVM (Dialect) +// ^ +// | +// 'toy.print' --> Loop (SCF) -- +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/Casting.h" +#include +#include + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToLLVM Conversion Patterns +//===----------------------------------------------------------------------===// + +namespace { +/// Lowers `toy.print` to a loop nest calling `printf` on each of the individual +/// elements of the array. 
+class PrintOpLowering : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::PrintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto *context = rewriter.getContext(); + auto memRefType = llvm::cast((*op->operand_type_begin())); + auto memRefShape = memRefType.getShape(); + auto loc = op->getLoc(); + + ModuleOp parentModule = op->getParentOfType(); + + // Get a symbol reference to the printf function, inserting it if necessary. + auto printfRef = getOrInsertPrintf(rewriter, parentModule); + Value formatSpecifierCst = getOrCreateGlobalString( + loc, rewriter, "frmt_spec", StringRef("%f \0", 4), parentModule); + Value newLineCst = getOrCreateGlobalString( + loc, rewriter, "nl", StringRef("\n\0", 2), parentModule); + + // Create a loop for each of the dimensions within the shape. + SmallVector loopIvs; + for (unsigned i = 0, e = memRefShape.size(); i != e; ++i) { + auto lowerBound = arith::ConstantIndexOp::create(rewriter, loc, 0); + auto upperBound = + arith::ConstantIndexOp::create(rewriter, loc, memRefShape[i]); + auto step = arith::ConstantIndexOp::create(rewriter, loc, 1); + auto loop = + scf::ForOp::create(rewriter, loc, lowerBound, upperBound, step); + for (Operation &nested : make_early_inc_range(*loop.getBody())) + rewriter.eraseOp(&nested); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body. + rewriter.setInsertionPointToEnd(loop.getBody()); + + // Insert a newline after each of the inner dimensions of the shape. + if (i != e - 1) + LLVM::CallOp::create(rewriter, loc, getPrintfType(context), printfRef, + newLineCst); + scf::YieldOp::create(rewriter, loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to printf for the current element of the loop. 
+ auto elementLoad = + memref::LoadOp::create(rewriter, loc, op.getInput(), loopIvs); + + // Varargs promotion: float -> double + Value arg = elementLoad; + Type t = elementLoad.getType(); + if (t.isF32()) { + arg = arith::ExtFOp::create(rewriter, loc, rewriter.getF64Type(), arg); + } else if (!t.isF64()) { + return rewriter.notifyMatchFailure(op, "toy.print only supports f32/f64"); + } + + LLVM::CallOp::create(rewriter, loc, getPrintfType(context), printfRef, + ArrayRef({formatSpecifierCst, arg})); + + // Notify the rewriter that this operation has been removed. + rewriter.eraseOp(op); + return success(); + } + +private: + /// Create a function declaration for printf, the signature is: + /// * `i32 (i8*, ...)` + static LLVM::LLVMFunctionType getPrintfType(MLIRContext *context) { + auto llvmI32Ty = IntegerType::get(context, 32); + auto llvmPtrTy = LLVM::LLVMPointerType::get(context); + auto llvmFnType = LLVM::LLVMFunctionType::get(llvmI32Ty, llvmPtrTy, + /*isVarArg=*/true); + return llvmFnType; + } + + /// Return a symbol reference to the printf function, inserting it into the + /// module if necessary. + static FlatSymbolRefAttr getOrInsertPrintf(PatternRewriter &rewriter, + ModuleOp module) { + auto *context = module.getContext(); + if (module.lookupSymbol("printf")) + return SymbolRefAttr::get(context, "printf"); + + // Insert the printf function into the body of the parent module. + PatternRewriter::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(module.getBody()); + LLVM::LLVMFuncOp::create(rewriter, module.getLoc(), "printf", + getPrintfType(context)); + return SymbolRefAttr::get(context, "printf"); + } + + /// Return a value representing an access into a global string with the given + /// name, creating the string if necessary. + static Value getOrCreateGlobalString(Location loc, OpBuilder &builder, + StringRef name, StringRef value, + ModuleOp module) { + // Create the global at the entry of the module. 
+ LLVM::GlobalOp global; + if (!(global = module.lookupSymbol(name))) { + OpBuilder::InsertionGuard insertGuard(builder); + builder.setInsertionPointToStart(module.getBody()); + auto type = LLVM::LLVMArrayType::get( + IntegerType::get(builder.getContext(), 8), value.size()); + global = LLVM::GlobalOp::create(builder, loc, type, /*isConstant=*/true, + LLVM::Linkage::Internal, name, + builder.getStringAttr(value), + /*alignment=*/0); + } + + // Get the pointer to the first character in the global string. + Value globalPtr = LLVM::AddressOfOp::create(builder, loc, global); + Value cst0 = LLVM::ConstantOp::create(builder, loc, builder.getI64Type(), + builder.getIndexAttr(0)); + return LLVM::GEPOp::create( + builder, loc, LLVM::LLVMPointerType::get(builder.getContext()), + global.getType(), globalPtr, ArrayRef({cst0, cst0})); + } +}; +} // namespace + +//===----------------------------------------------------------------------===// +// ToyToLLVMLoweringPass +//===----------------------------------------------------------------------===// + +namespace { +struct ToyToLLVMLoweringPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToLLVMLoweringPass) + StringRef getArgument() const override { return "toy-to-llvm"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; +} // namespace + +void ToyToLLVMLoweringPass::runOnOperation() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. For this lowering, we are only targeting + // the LLVM dialect. + LLVMConversionTarget target(getContext()); + target.addLegalOp(); + + // During this lowering, we will also be lowering the MemRef types, that are + // currently being operated on, to a representation in LLVM. To perform this + // conversion we use a TypeConverter as part of the lowering. This converter + // details how one type maps to another. 
This is necessary now that we will be + // doing more complicated lowerings, involving loop region arguments. + LLVMTypeConverter typeConverter(&getContext()); + + // Now that the conversion target has been defined, we need to provide the + // patterns used for lowering. At this point of the compilation process, we + // have a combination of `toy`, `affine`, and `std` operations. Luckily, there + // are already exists a set of patterns to transform `affine` and `std` + // dialects. These patterns lowering in multiple stages, relying on transitive + // lowerings. Transitive lowering, or A->B->C lowering, is when multiple + // patterns must be applied to fully transform an illegal operation into a + // set of legal ones. + RewritePatternSet patterns(&getContext()); + populateAffineToStdConversionPatterns(patterns); + populateSCFToControlFlowConversionPatterns(patterns); + mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, patterns); + populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns); + cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns); + populateFuncToLLVMConversionPatterns(typeConverter, patterns); + + // The only remaining operation to lower from the `toy` dialect, is the + // PrintOp. + patterns.add(&getContext()); + + // We want to completely lower to LLVM, so we use a `FullConversion`. This + // ensures that only legal operations will remain after the conversion. + auto module = getOperation(); + if (failed(applyFullConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. 
+std::unique_ptr mlir::toy::createLowerToLLVMPass() { + return std::make_unique(); +} diff --git a/mlir/cuda-tile/Toy/mlir/MLIRGen.cpp b/mlir/cuda-tile/Toy/mlir/MLIRGen.cpp new file mode 100644 index 0000000..bc1a972 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/MLIRGen.cpp @@ -0,0 +1,468 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include +#include +#include +#include +#include +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. 
+/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. + theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &f : moduleAST) + mlirGen(f); + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. 
+ mlir::Location loc(const Location &loc) { + return mlir::FileLineColLoc::get(builder.getStringAttr(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + llvm::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::toy::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. + // Arguments type are uniformly unranked tensors. + llvm::SmallVector argTypes(proto.getArgs().size(), + getType(VarType{})); + auto funcType = builder.getFunctionType(argTypes, /*results=*/{}); + return mlir::toy::FuncOp::create(builder, location, proto.getName(), + funcType); + } + + /// Emit a new function and add it to the MLIR module. + mlir::toy::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope varScope(symbolTable); + + // Create an MLIR function for the given prototype. + builder.setInsertionPointToEnd(theModule.getBody()); + mlir::toy::FuncOp function = mlirGen(*funcAST.getProto()); + if (!function) + return nullptr; + + // Let's start the body of the function now! + mlir::Block &entryBlock = function.front(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. 
+ for (const auto nameValue : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(nameValue)->getName(), + std::get<1>(nameValue)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. + // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + ReturnOp::create(builder, loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType( + function.getFunctionType().getInputs(), getType(VarType{}))); + } + + // If this function isn't main, then set the visibility to private. + if (funcAST.getProto()->getName() != "main") + function.setPrivate(); + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. 
If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. + switch (binop.getOp()) { + case '+': + return AddOp::create(builder, location, lhs, rhs); + case '*': + return MulOp::create(builder, location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + llvm::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().has_value()) { + if (!(expr = mlirGen(**ret.getExpr()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + ReturnOp::create(builder, location, + expr ? ArrayRef(expr) : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. 
They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. + /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf32> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(llvm::product_of(lit.getDims())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF32Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::ArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return ConstantOp::create(builder, loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as floats. + /// Attributes are the way MLIR attaches constant to operations. 
+ void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builtin calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return TransposeOp::create(builder, location, operands[0]); + } + + if (callee == "matmul") { + if (call.getArgs().size() != 2) { + emitError(location, "MLIR codegen encountered an error: toy.matmul " + "expected 2 arguments"); + return nullptr; + } + return MatMulOp::create(builder, location, operands[0], operands[1]); + } + + // Otherwise this is a call to a user-defined function. Calls to + // user-defined functions are mapped to a custom call that takes the callee + // name as an attribute. + return GenericCallOp::create(builder, location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). 
+ llvm::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + PrintOp::create(builder, loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return ConstantOp::create(builder, loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. + mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto *init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = ReshapeOp::create(builder, loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. 
+ if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + llvm::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope varScope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. + if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF32Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF32Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. 
+mlir::OwningOpRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/cuda-tile/Toy/mlir/ShapeInferencePass.cpp b/mlir/cuda-tile/Toy/mlir/ShapeInferencePass.cpp new file mode 100644 index 0000000..a552e1f --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/ShapeInferencePass.cpp @@ -0,0 +1,123 @@ +//===- ShapeInferencePass.cpp - Shape Inference ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Function level pass performing interprocedural +// propagation of array shapes through function specialization. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/raw_ostream.h" +#include + +#define DEBUG_TYPE "shape-inference" + +using namespace mlir; +using namespace toy; + +/// Include the auto-generated definitions for the shape inference interfaces. +#include "toy/ShapeInferenceOpInterfaces.cpp.inc" + +namespace { +/// The ShapeInferencePass is a pass that performs intra-procedural +/// shape inference. +/// +/// Algorithm: +/// +/// 1) Build a worklist containing all the operations that return a +/// dynamically shaped tensor: these are the operations that need shape +/// inference. 
+/// 2) Iterate on the worklist: +/// a) find an operation to process: the next ready operation in the +/// worklist has all of its arguments non-generic, +/// b) if no operation is found, break out of the loop, +/// c) remove the operation from the worklist, +/// d) infer the shape of its output from the argument types. +/// 3) If the worklist is empty, the algorithm succeeded. +/// +struct ShapeInferencePass + : public mlir::PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ShapeInferencePass) + StringRef getArgument() const override { return "toy-shape-inference"; } + + void runOnOperation() override { + auto f = getOperation(); + + // Populate the worklist with the operations that need shape inference: + // these are operations that return a dynamic shape. + llvm::SmallPtrSet opWorklist; + f.walk([&](mlir::Operation *op) { + if (returnsDynamicShape(op)) + opWorklist.insert(op); + }); + + // Iterate on the operations in the worklist until all operations have been + // inferred or no change happened (fix point). + while (!opWorklist.empty()) { + // Find the next operation ready for inference, that is an operation + // with all operands already resolved (non-generic). + auto nextop = llvm::find_if(opWorklist, allOperandsInferred); + if (nextop == opWorklist.end()) + break; + + Operation *op = *nextop; + opWorklist.erase(op); + + // Ask the operation to infer its output shapes. + LDBG() << "Inferring shape for: " << *op; + if (auto shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } + } + + // If the operation worklist isn't empty, this indicates a failure. 
+ if (!opWorklist.empty()) { + f.emitError("Shape inference failed, ") + << opWorklist.size() << " operations couldn't be inferred\n"; + signalPassFailure(); + } + } + + /// A utility method that returns if the given operation has all of its + /// operands inferred. + static bool allOperandsInferred(Operation *op) { + return llvm::all_of(op->getOperandTypes(), [](Type operandType) { + return llvm::isa(operandType); + }); + } + + /// A utility method that returns if the given operation has a dynamically + /// shaped result. + static bool returnsDynamicShape(Operation *op) { + return llvm::any_of(op->getResultTypes(), [](Type resultType) { + return !llvm::isa(resultType); + }); + } +}; +} // namespace + +/// Create a Shape Inference pass. +std::unique_ptr mlir::toy::createShapeInferencePass() { + return std::make_unique(); +} diff --git a/mlir/cuda-tile/Toy/mlir/ToyCombine.cpp b/mlir/cuda-tile/Toy/mlir/ToyCombine.cpp new file mode 100644 index 0000000..f8397c2 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/ToyCombine.cpp @@ -0,0 +1,68 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "toy/Dialect.h" +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // namespace + +/// This is an example of a c++ rewrite pattern for the TransposeOp. 
It +/// optimizes the following scenario: transpose(transpose(x)) -> x +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + llvm::LogicalResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. + mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = transposeInput.getDefiningOp(); + + // Input defined by another transpose? If not, no match. + if (!transposeInputOp) + return failure(); + + // Otherwise, we have a redundant transpose. Use the rewriter. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}); + return success(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. +void TransposeOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. 
void ReshapeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                            MLIRContext *context) {
  // NOTE(review): pattern list reconstructed to match the three DRR patterns
  // defined in ToyCombine.td below — confirm against ToyCombine.inc.
  results.add<ReshapeReshapeOptPattern, RedundantReshapeOptPattern,
              FoldConstantReshapeOptPattern>(context);
}

//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Defines language-specific pattern match optimizations for Toy using
// Declarative Rewrite Rules (DRR) specified using TableGen records.
//
//===----------------------------------------------------------------------===//

#ifndef TOY_COMBINE
#define TOY_COMBINE

include "mlir/IR/PatternBase.td"
include "toy/Ops.td"

/// Note: The DRR definition used for defining patterns is shown below:
///
/// class Pattern<
///    dag sourcePattern, list<dag> resultPatterns,
///    list<dag> additionalConstraints = [],
///    dag benefitsAdded = (addBenefit 0)
/// >;

//===----------------------------------------------------------------------===//
// Basic Pattern-Match and Rewrite
//===----------------------------------------------------------------------===//

// Reshape(Reshape(x)) = Reshape(x)
def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)),
                                   (ReshapeOp $arg)>;

//===----------------------------------------------------------------------===//
// Pattern-Match and Rewrite using Native Code Call
//===----------------------------------------------------------------------===//

// Native Code Calls may be used for more complex transformations using inline
// C++ and C++ helper functions.

// Reshape(Constant(x)) = x', where x' is the constant folded to the result
// shape of the reshape.
def ReshapeConstant :
  NativeCodeCall<"$0.reshape(::llvm::cast<mlir::ShapedType>($1.getType()))">;
def FoldConstantReshapeOptPattern : Pat<
  (ReshapeOp:$res (ConstantOp $arg)),
  (ConstantOp (ReshapeConstant $arg, $res))>;

//===----------------------------------------------------------------------===//
// Pattern-Match and Rewrite with Constraints
//===----------------------------------------------------------------------===//

// DRR allows for constraint checking when the transformation is conditional
// on operand properties.

// Reshape(x) = x, where input and output shapes are identical
def TypesAreIdentical : Constraint<CPred<"$0.getType() == $1.getType()">>;
def RedundantReshapeOptPattern : Pat<
  (ReshapeOp:$res $arg), (replaceWithValue $arg),
  [(TypesAreIdentical $res, $arg)]>;

#endif // TOY_COMBINE

//===- AST.cpp - Helper for printing out the Toy AST ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AST dump for the Toy language.
+// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. +class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template +static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + llvm::TypeSwitch(expr) + .Case( + [&](auto *node) { 
this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. +void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +static void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto *num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + llvm::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + llvm::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. 
+void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. +void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().has_value()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + llvm::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. 
+void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "\n"; + indent(); + llvm::errs() << "Params: ["; + llvm::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. +void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/cuda-tile/Toy/toyc.cpp b/mlir/cuda-tile/Toy/toyc.cpp new file mode 100644 index 0000000..be27585 --- /dev/null +++ b/mlir/cuda-tile/Toy/toyc.cpp @@ -0,0 +1,441 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" +#include "toy/AST.h" +#include "toy/Dialect.h" +#include "toy/Lexer.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/LLVMIR/Transforms/Passes.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Verifier.h" +#include "mlir/InitAllDialects.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} // namespace +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + cl::values(clEnumValN(MLIR, "mlir", + "load the input file as 
an MLIR file"))); + +namespace { +enum Action { + None, + DumpAST, + DumpMLIR, + DumpMLIRAffine, + DumpMLIRLLVM, + DumpLLVMIR, + RunJIT, + DumpGpuIR, + DumpCudaTileIR, + DumpGpuAffine, + DumpGPULLVMIR, + RunNVGPUJIT +}; +} // namespace +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump")), + cl::values(clEnumValN(DumpMLIRAffine, "mlir-affine", + "output the MLIR dump after affine lowering")), + cl::values(clEnumValN(DumpMLIRLLVM, "mlir-llvm", + "output the MLIR dump after llvm lowering")), + cl::values(clEnumValN(DumpLLVMIR, "llvm", "output the LLVM IR dump")), + cl::values( + clEnumValN(RunJIT, "jit", + "JIT the code and run it by invoking the main function")), + cl::values(clEnumValN(DumpGpuIR, "gpu-ir", + "output the GPU dialect MLIR dump")), + cl::values(clEnumValN(DumpCudaTileIR, "cuda-tile-ir", + "output the Cuda Tile dialect MLIR dump")), + cl::values(clEnumValN(DumpGpuAffine, "gpu-affine", + "output the GPU dialect MLIR dump after affine " + "lowering")), + cl::values(clEnumValN(DumpGPULLVMIR, "gpu-llvm", + "output the GPU LLVM dialect MLIR dump")), + cl::values(clEnumValN(RunNVGPUJIT, "nv-gpu-jit", + "JIT the code for NVGPU and run it by invoking the " + "main function"))); + +static cl::opt assignGrid("grid", cl::init("1,1,1"), + cl::desc("Assign the grid dimensions")); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. 
+static std::unique_ptr +parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), std::string(filename)); + Parser parser(lexer); + return parser.parseModule(); +} + +static int loadMLIR(mlir::MLIRContext &context, + mlir::OwningOpRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).ends_with(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return -1; + } + + // Parse the input mlir. + llvm::SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +static int loadAndProcessMLIR(mlir::MLIRContext &context, + mlir::OwningOpRef &module) { + if (int error = loadMLIR(context, module)) + return error; + + mlir::PassManager pm(module.get()->getName()); + // Apply any generic pass manager command line options and run the pipeline. + if (mlir::failed(mlir::applyPassManagerCLOptions(pm))) + return 4; + + // Check to see what granularity of MLIR we are compiling to. 
+ bool isLoweringToAffine = emitAction >= Action::DumpMLIRAffine; + bool isLoweringToLLVM = emitAction >= Action::DumpMLIRLLVM; + + if (enableOpt || isLoweringToAffine) { + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + } + + if (isLoweringToAffine) { + // Partially lower the toy dialect. + pm.addPass(mlir::toy::createLowerToAffinePass()); + + // Add a few cleanups post lowering. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Add optimizations if enabled. + if (enableOpt) { + optPM.addPass(mlir::affine::createLoopFusionPass()); + optPM.addPass(mlir::affine::createAffineScalarReplacementPass()); + } + } + + if (isLoweringToLLVM) { + // Finish lowering the toy IR to the LLVM dialect. + pm.addPass(mlir::toy::createLowerToLLVMPass()); + // This is necessary to have line tables emitted and basic + // debugger working. In the future we will add proper debug information + // emission directly from our frontend. + pm.addPass(mlir::LLVM::createDIScopeForLLVMFuncOpPass()); + } + + if (mlir::failed(pm.run(*module))) + return 4; + return 0; +} + +static int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +static int dumpLLVMIR(mlir::ModuleOp module) { + // Register the translation to LLVM IR with the MLIR context. 
+ mlir::registerBuiltinDialectTranslation(*module->getContext()); + mlir::registerLLVMDialectTranslation(*module->getContext()); + + // Convert the module to LLVM IR in a new LLVM IR context. + llvm::LLVMContext llvmContext; + auto llvmModule = mlir::translateModuleToLLVMIR(module, llvmContext); + if (!llvmModule) { + llvm::errs() << "Failed to emit LLVM IR\n"; + return -1; + } + + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // Configure the LLVM Module + auto tmBuilderOrError = llvm::orc::JITTargetMachineBuilder::detectHost(); + if (!tmBuilderOrError) { + llvm::errs() << "Could not create JITTargetMachineBuilder\n"; + return -1; + } + + auto tmOrError = tmBuilderOrError->createTargetMachine(); + if (!tmOrError) { + llvm::errs() << "Could not create TargetMachine\n"; + return -1; + } + mlir::ExecutionEngine::setupTargetTripleAndDataLayout(llvmModule.get(), + tmOrError.get().get()); + + /// Optionally run an optimization pipeline over the llvm module. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + if (auto err = optPipeline(llvmModule.get())) { + llvm::errs() << "Failed to optimize LLVM IR " << err << "\n"; + return -1; + } + llvm::errs() << *llvmModule << "\n"; + return 0; +} + +static int runJit(mlir::ModuleOp module) { + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // Register the translation from MLIR to LLVM IR, which must happen before we + // can JIT-compile. + mlir::registerBuiltinDialectTranslation(*module->getContext()); + mlir::registerLLVMDialectTranslation(*module->getContext()); + + // An optimization pipeline to use within the execution engine. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + + // Create an MLIR execution engine. 
The execution engine eagerly JIT-compiles + // the module. + mlir::ExecutionEngineOptions engineOptions; + engineOptions.transformer = optPipeline; + auto maybeEngine = mlir::ExecutionEngine::create(module, engineOptions); + assert(maybeEngine && "failed to construct an execution engine"); + auto &engine = maybeEngine.get(); + + // Invoke the JIT-compiled function. + auto invocationResult = engine->invokePacked("main"); + if (invocationResult) { + llvm::errs() << "JIT invocation failed\n"; + return -1; + } + + return 0; +} + +static int loadAndProcessMLIRGPU(mlir::MLIRContext &context, + mlir::OwningOpRef &module) { + llvm::SmallSet gpuOperations = {"matmul", "add", "mul", + "transpose"}; + if (int error = loadMLIR(context, module)) + return error; + + mlir::PassManager pm(module.get()->getName()); + // Apply any generic pass manager command line options and run the pipeline. + if (mlir::failed(mlir::applyPassManagerCLOptions(pm))) + return 4; + + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Now process the toy mlir with gpu outline pass. + optPM.addPass(mlir::toy::createGpuOutlinePass(assignGrid)); + // mlir::OpPassManager &gpuOptPM = pm.nest(); + pm.addPass(mlir::toy::createCudaTileLoweringPass()); + pm.addPass(mlir::createCSEPass()); + + // pm.addPass(mlir::toy::createLowerGpuHostToLLVMPass()); + bool isLoweringToAffine = emitAction >= Action::DumpGpuAffine; + if (isLoweringToAffine) { + pm.addPass(mlir::toy::createEmbedCudaTileBinaryPass( + "/usr/local/cuda/bin/tileiras", "sm_120")); + + // mlir::OpPassManager &gpuOptPM = pm.nest(); + // // Partially lower the toy dialect. 
+ // pm.addPass(mlir::toy::createLowerToAffinePass()); + + // // Add a few cleanups post lowering. + // mlir::OpPassManager &optPM = pm.nest(); + // optPM.addPass(mlir::createCanonicalizerPass()); + // optPM.addPass(mlir::createCSEPass()); + + // // Add optimizations if enabled. + // if (enableOpt) { + // optPM.addPass(mlir::affine::createLoopFusionPass()); + // optPM.addPass(mlir::affine::createAffineScalarReplacementPass()); + // } + } + + if (mlir::failed(pm.run(*module))) + return 4; + return 0; +} + +static int dumpGpuLLVMIR(mlir::ModuleOp module) { + // Simply dump the MLIR module at this stage. + module.dump(); + return 0; +} + +static int runGpuJit(mlir::ModuleOp module) { return 0; } + +int main(int argc, char **argv) { + // Register any command line options. + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + mlir::registerPassManagerCLOptions(); + + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + if (emitAction == Action::DumpAST) + return dumpAST(); + + // If we aren't dumping the AST, then we are compiling with/to MLIR. + mlir::DialectRegistry registry; + mlir::func::registerAllExtensions(registry); + mlir::LLVM::registerInlinerInterface(registry); + + mlir::MLIRContext context(registry); + // Load our Dialect in this MLIR Context. + context.getOrLoadDialect(); + + mlir::OwningOpRef module; + + if (emitAction > Action::RunJIT) { + llvm::outs() << "The GPU related actions will be used\n"; + llvm::outs() << "Grid dimensions: " << assignGrid << "\n"; + + if (int error = loadAndProcessMLIRGPU(context, module)) + return error; + + // If we aren't exporting to non-mlir, then we are done. 
+ bool isOutputingMLIR = emitAction <= Action::RunNVGPUJIT; + if (isOutputingMLIR) { + module->dump(); + return 0; + } + + if (emitAction == Action::DumpGPULLVMIR) + return dumpGpuLLVMIR(*module); + + if (emitAction == Action::RunNVGPUJIT) + return runGpuJit(*module); + + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + return -1; + } + + if (int error = loadAndProcessMLIR(context, module)) + return error; + + // If we aren't exporting to non-mlir, then we are done. + bool isOutputingMLIR = emitAction <= Action::DumpMLIRLLVM; + if (isOutputingMLIR) { + module->dump(); + return 0; + } + + // Check to see if we are compiling to LLVM IR. + if (emitAction == Action::DumpLLVMIR) + return dumpLLVMIR(*module); + + // Otherwise, we must be running the jit. + if (emitAction == Action::RunJIT) + return runJit(*module); + + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + return -1; +} diff --git a/mlir/cuda-tile/build.sh b/mlir/cuda-tile/build.sh new file mode 100644 index 0000000..0a84b3a --- /dev/null +++ b/mlir/cuda-tile/build.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +_target=${1:-'all'} + +rm -rf build +mkdir build + +_workspaceFolder=$(pwd) + +cd build + +# For non-conda users: +cmake .. 
-Wno-dev -G Ninja \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE \ + -DCMAKE_BUILD_TYPE:STRING=Debug \ + -DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc \ + -DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++ \ + -DMLIR_DIR=${_workspaceFolder}/third_party/llvm/lib/cmake/mlir \ + -DLLVM_DIR=${_workspaceFolder}/third_party/llvm/lib/cmake/llvm \ + -DCMAKE_MODULE_PATH="${_workspaceFolder}/third_party/llvm/lib/cmake/mlir;${_workspaceFolder}/third_party/llvm/lib/cmake/llvm" \ + -DMLIR_TABLEGEN_EXE=${_workspaceFolder}/third_party/llvm/bin/mlir-tblgen \ + -DCUDA_TILE_BINARY_DIR=${_workspaceFolder}/third_party/cuda-tile/build/ \ + -DCUDA_TILE_SOURCE_DIR=${_workspaceFolder}/third_party/cuda-tile + +# ninja +cmake \ + --build ${_workspaceFolder}/build \ + --config Debug --target ${_target} diff --git a/mlir/cuda-tile/build_with_conda.sh b/mlir/cuda-tile/build_with_conda.sh new file mode 100644 index 0000000..c81f22d --- /dev/null +++ b/mlir/cuda-tile/build_with_conda.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +_target=${1:-'all'} + +rm -rf build +mkdir build + +_workspaceFolder=$(pwd) + +cd build + + +cmake .. -G Ninja --no-warn-unused-cli \ + -Wno-dev \ + -DCMAKE_MODULE_PATH="/root/miniconda3/envs/mlir/lib/cmake/mlir;/root/miniconda3/envs/mlir/lib/cmake/llvm" \ + -DMLIR_TABLEGEN_EXE:FILEPATH=/root/miniconda3/envs/mlir/bin/mlir-tblgen \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE \ + -DCMAKE_BUILD_TYPE:STRING=Debug \ + -DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc \ + -DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++ + +# ninja +cmake \ + --build ${_workspaceFolder}/build \ + --config Debug --target ${_target} diff --git a/mlir/cuda-tile/cuda_shim/cuda_shim.cc b/mlir/cuda-tile/cuda_shim/cuda_shim.cc new file mode 100644 index 0000000..9a38a9b --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/cuda_shim.cc @@ -0,0 +1,528 @@ +//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements C wrappers around the CUDA library for easy linking in ORC jit. +// Also adds some debugging helpers that are helpful when writing MLIR code to +// run on GPUs. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "cuda.h" +#include "cuda_bf16.h" +#include "cuda_fp16.h" +#include + +// We assume the program runs on the linux platform if not on Windows. +// Copy from +// third_party/llvm-project/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp + +// #if CUDA_VERSION >= 13000 + +#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default"))) + +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) + +thread_local static int32_t defaultDevice = 0; + +/// Helper method that checks environment value for debugging. +static bool isDebugEnabled() { + const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG"; + static bool isEnabled = getenv(kDebugEnvironmentVariable) != nullptr; + return isEnabled; +} + +#define debug_print(fmt, ...) \ + do { \ + if (isDebugEnabled()) \ + fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \ + __func__, __VA_ARGS__); \ + } while (0) + +// Returns default CUdevice +static CUdevice getDefaultCuDevice() { + CUdevice device; + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + return device; +} + +// Make the primary context of the current default device current for the +// duration +// of the instance and restore the previous context on destruction. 
+class ScopedContext { +public: + ScopedContext() { + // Static reference to CUDA primary context for device ordinal + // defaultDevice. + static CUcontext context = [] { + CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); + CUcontext ctx; + // Note: this does not affect the current context. + CUDA_REPORT_IF_ERROR( + cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice())); + return ctx; + }(); + + CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); + } + + ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } +}; + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { + ScopedContext scopedContext; + CUmodule module = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data, + int optLevel) { + ScopedContext scopedContext; + CUmodule module = nullptr; + char jitErrorBuffer[4096] = {0}; + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_OPTIMIZATION_LEVEL}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast(sizeof(jitErrorBuffer)), + reinterpret_cast(optLevel)}; + + CUresult result = + cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals); + if (result) { + fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer); + CUDA_REPORT_IF_ERROR(result); + } + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) { + CUDA_REPORT_IF_ERROR(cuModuleUnload(module)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction +mgpuModuleGetFunction(CUmodule module, const char *name) { + CUfunction function = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); + return function; +} + +// The wrapper uses intptr_t instead of CUDA's unsigned int to match +// the type of MLIR's index type. This avoids the need for casts in the +// generated MLIR code. 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, + intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, CUstream stream, void **params, + void **extra, size_t /*paramsCount*/) { + ScopedContext scopedContext; + if (smem > 0) { + // Avoid checking driver as it's more expensive than if statement + int32_t maxShmem = 0; + CUdevice device = getDefaultCuDevice(); + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( + &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + if (maxShmem < smem) { + fprintf(stderr, + "Requested shared memory (%dkb) is larger than maximum allowed " + "shared memory (%dkb) for this device\n", + smem, maxShmem); + } + CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( + function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); + } + debug_print("Launching kernel, grid=%ld,%ld,%ld, " + "threads: %ld, %ld, %ld, " + "smem: %dkb\n", + gridX, gridY, gridZ, blockX, blockY, blockZ, smem); + CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, + blockY, blockZ, smem, stream, params, + extra)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { + ScopedContext scopedContext; + CUstream stream = nullptr; + CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + return stream; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuStreamSynchronize(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, + CUevent event) { + CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { + ScopedContext 
scopedContext; + CUevent event = nullptr; + CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); + return event; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, + CUstream stream) { + CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) { + ScopedContext scopedContext; + CUdeviceptr ptr = 0; + if (sizeBytes == 0) + return reinterpret_cast(ptr); + + if (isHostShared) { + CUDA_REPORT_IF_ERROR( + cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL)); + return reinterpret_cast(ptr); + } + CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); + return reinterpret_cast(ptr); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr, + CUstream /*stream*/) { + CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(ptr))); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast(dst), + reinterpret_cast(src), + sizeBytes, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { + defaultDevice = device; +} + +// 
===----------------------------------------------------------------------===// + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCtxSynchronize() { + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuCtxSynchronize()); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyHtoD(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyHtoD(reinterpret_cast(dst), src, sizeBytes)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyDtoH(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyDtoH(dst, reinterpret_cast(src), sizeBytes)); +} + +//===----------------------------------------------------------------------===// + +static inline CUdeviceptr asDevPtr(uint64_t h) { + return static_cast(h); +} +static inline uint64_t asHandle(CUdeviceptr p) { + return static_cast(p); +} + +static inline CUstream asStream(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asStreamHandle(CUstream s) { + return static_cast(reinterpret_cast(s)); +} + +static inline CUevent asEvent(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asEventHandle(CUevent e) { + return static_cast(reinterpret_cast(e)); +} + +static inline void *asHostPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline const void *asHostCPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} + +// Align up helper +static inline uint64_t alignUp(uint64_t x, uint64_t a) { + return (x + (a - 1)) & ~(a - 1); +} + +// Load module from PTX or CUBIN image in memory. +// Driver API supports cuModuleLoadDataEx for both PTX and cubin (it +// auto-detects). 
+extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr, + uint64_t image_nbytes) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoad(data, image_nbytes); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr, + uint64_t image_nbytes, + int opt_level) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoadJIT(data, opt_level); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t +cuda_shim_load_module_from_file(uint64_t file_path_ptr, + uint64_t /*file_path_nbytes*/) { + auto file_path_cstr = + reinterpret_cast(asHostCPtr(file_path_ptr)); + // fprintf(stdout, "%s", file_path_cstr); + CUmodule module = nullptr; + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuModuleLoad(&module, file_path_cstr)); + return static_cast(reinterpret_cast(module)); +} + +extern "C" void cuda_shim_unload_module(uint64_t module_handle) { + CUmodule module = + reinterpret_cast(static_cast(module_handle)); + mgpuModuleUnload(module); +} + +extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream, + bool is_host_shared) { + CUstream cu_stream = asStream(stream); + if (stream == 0) + cu_stream = nullptr; + void *ptr = mgpuMemAlloc(nbytes, /*stream=*/cu_stream, + /*isHostShared=*/is_host_shared); + return static_cast(reinterpret_cast(ptr)); +} + +extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream) { + CUstream cu_stream = asStream(stream); + void *ptr = reinterpret_cast(static_cast(dptr)); + if (stream == 0) { + cu_stream = nullptr; + } + mgpuMemFree(ptr, /*stream=*/cu_stream); +} + +extern "C" void cuda_shim_memset32(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); + CUstream cu_stream = asStream(stream); + mgpuMemset32(ptr, value, count_dwords, cu_stream); +} + +extern "C" 
void cuda_shim_memset16(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); + CUstream cu_stream = asStream(stream); + mgpuMemset16(ptr, value, count_dwords, cu_stream); +} + +extern "C" uint64_t cuda_shim_stream_create(void) { + CUstream stream = mgpuStreamCreate(); + return asStreamHandle(stream); +} + +extern "C" void cuda_shim_stream_destroy(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamDestroy(cu_stream); +} + +extern "C" void cuda_shim_stream_synchronize(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamSynchronize(cu_stream); +} + +extern "C" uint64_t cuda_shim_event_create(void) { + CUevent event = mgpuEventCreate(); + return asEventHandle(event); +} + +extern "C" void cuda_shim_event_destroy(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventDestroy(event); +} + +extern "C" void cuda_shim_event_record(uint64_t ev, uint64_t stream) { + CUevent event = asEvent(ev); + CUstream cu_stream = asStream(stream); + mgpuEventRecord(event, cu_stream); +} + +extern "C" void cuda_shim_event_synchronize(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventSynchronize(event); +} + +extern "C" void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev) { + CUstream cu_stream = asStream(stream); + CUevent event = asEvent(ev); + mgpuStreamWaitEvent(cu_stream, event); +} + +// ----------------------------- Memcpy (raw ABI) -------------------------- +// Host pointers are passed as uint64_t. This is the key of 2A. 
+ +extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_dptr); + auto src = asHostPtr(src_hptr); + mgpuMemcpyHtoD(dst, src, static_cast(nbytes)); +} + +extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_hptr); + auto src = asHostPtr(src_dptr); + mgpuMemcpyDtoH(dst, src, static_cast(nbytes)); +} + +// ----------------------------- Kernel launch ----------------------------- +// The hardest part is kernelParams (void**). +// We avoid building it in MLIR. Instead MLIR passes: +// - arg_data_ptr: host pointer to a packed buffer containing raw argument bytes +// - arg_sizes_ptr: host pointer to uint64_t[num_args], each is the byte-size of +// that argument The shim constructs kernelParams[i] = &arg_data[offset_i] with +// 8-byte alignment. This matches typical ABI expectations for scalar/pointer +// args. If you have special alignment requirements, extend this (e.g., per-arg +// alignment array). 
+ +extern "C" void cuda_shim_launch_packed( + uint64_t module_handle, uint64_t kernel_name_ptr, uint32_t gridX, + uint32_t gridY, uint32_t gridZ, uint32_t blockX, uint32_t blockY, + uint32_t blockZ, uint32_t sharedMemBytes, uint64_t stream, + uint64_t arg_data_ptr, uint64_t arg_sizes_ptr, uint32_t num_args) { + + auto mh = reinterpret_cast(static_cast(module_handle)); + if (!mh) { + fprintf(stderr, "[cuda_shim] launch_packed: invalid module handle\n"); + abort(); + } + + const char *kname = + reinterpret_cast(asHostCPtr(kernel_name_ptr)); + if (!kname) { + fprintf(stderr, "[cuda_shim] launch_packed: null kernel name\n"); + abort(); + } + + CUfunction fn = mgpuModuleGetFunction(mh, kname); + + auto *argData = reinterpret_cast(asHostPtr(arg_data_ptr)); + auto *argSizes = + reinterpret_cast(asHostCPtr(arg_sizes_ptr)); + + if (num_args > 0 && (!argData || !argSizes)) { + fprintf(stderr, "[cuda_shim] launch_packed: argData/argSizes null\n"); + abort(); + } + + // Build kernelParams array on heap (safe for large num_args). + std::vector params; + params.resize(num_args); + + uint64_t off = 0; + for (uint32_t i = 0; i < num_args; ++i) { + // 8-byte align each argument start (common safe default). 
+ off = alignUp(off, 8); + params[i] = argData + off; + off += argSizes[i]; + } + + auto cu_stream = asStream(stream); + + if (stream == 0) { + cu_stream = nullptr; + } + + mgpuLaunchKernel(fn, static_cast(gridX), + static_cast(gridY), static_cast(gridZ), + static_cast(blockX), static_cast(blockY), + static_cast(blockZ), + static_cast(sharedMemBytes), cu_stream, + params.data(), nullptr, static_cast(num_args)); +} + +// Convenience: 1D launch, shared=0, stream optional +extern "C" void +cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr, + uint32_t blockX, uint32_t blockY, uint32_t blockZ, + uint64_t stream, uint64_t arg_data_ptr, + uint64_t arg_sizes_ptr, uint32_t num_args) { + cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX, + blockY, blockZ, 0, stream, arg_data_ptr, + arg_sizes_ptr, num_args); +} + +// Optional: global sync (avoid in async pipeline; prefer event/stream sync) +extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); } + +// only for debugging +extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) { + auto *p = reinterpret_cast(static_cast(dptr)); + for (uint32_t i = 0; i < n; ++i) { + fprintf(stderr, "i=%u v=%f\n", i, p[i]); + } +} + +// #endif diff --git a/mlir/cuda-tile/cuda_shim/load_ptx_main.cpp b/mlir/cuda-tile/cuda_shim/load_ptx_main.cpp new file mode 100644 index 0000000..e641762 --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/load_ptx_main.cpp @@ -0,0 +1,187 @@ +// Minimal demo showing how to load a PTX file and launch a kernel via the cuda_shim API. 
+// 1) Build PTX for Ada (RTX 4090) for the sample kernel in vector_add.cu: +// nvcc -std=c++17 -arch=sm_89 -ptx vector_add.cu -o vector_add.ptx +// 2) Build this runner together with the shim (nvcc handles the CUDA driver link flags): +// nvcc -std=c++17 load_ptx_main.cpp cuda_shim.cc -o load_ptx_demo -lcuda -lcudart +// 3) Run: ./load_ptx_demo vector_add.ptx vector_add 1048576 + +// nvcc -std=c++17 --cudart static load_ptx_main.cpp cuda_shim.cc -o load_ptx_demo -lcuda -lcudadevrt -lcudart_static -ldl -lrt -pthread +// g++-11 -std=c++17 load_ptx_main.cpp cuda_shim.cc -I/usr/local/cuda/include -L/usr/lib/x86_64-linux-gnu -lcuda -ldl -pthread -o load_ptx_demo +#include + +#include +#include +#include +#include +#include +#include +#include + +// The shim has no public header, so we redeclare the extern "C" hooks we need. +extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr, + uint64_t image_nbytes); +extern "C" uint64_t cuda_shim_load_module_from_file(uint64_t file_path_ptr, + uint64_t file_path_nbytes); +extern "C" void cuda_shim_unload_module(uint64_t module_handle); +extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream, + bool is_host_shared); +extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream); +extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr, + uint64_t nbytes); +extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr, + uint64_t nbytes); +extern "C" uint64_t cuda_shim_stream_create(void); +extern "C" void cuda_shim_stream_destroy(uint64_t stream); +extern "C" void cuda_shim_stream_synchronize(uint64_t stream); +extern "C" void cuda_shim_launch_packed(uint64_t module_handle, + uint64_t kernel_name_ptr, + uint32_t gridX, uint32_t gridY, + uint32_t gridZ, uint32_t blockX, + uint32_t blockY, uint32_t blockZ, + uint32_t sharedMemBytes, + uint64_t stream, + uint64_t arg_data_ptr, + uint64_t arg_sizes_ptr, + uint32_t num_args); + +namespace { + +// Round up to next multiple 
of 8 to match cuda_shim_launch_packed's alignment. +size_t align8(size_t value) { return (value + 7) & ~static_cast(7); } + +// Load an entire file into a byte buffer. +bool loadFile(const std::string &path, std::vector &buffer) { + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open PTX file: " << path << "\n"; + return false; + } + file.seekg(0, std::ios::end); + const auto size = static_cast(file.tellg()); + file.seekg(0, std::ios::beg); + buffer.resize(size); + file.read(buffer.data(), buffer.size()); + return true; +} + +// Append a trivially copyable argument into the packed arg buffer. +template +void appendArg(std::vector &argData, std::vector &argSizes, + const T &value) { + const size_t aligned = align8(argData.size()); + if (aligned > argData.size()) { + argData.resize(aligned, 0); + } + const uint8_t *ptr = reinterpret_cast(&value); + argData.insert(argData.end(), ptr, ptr + sizeof(T)); + argSizes.push_back(static_cast(sizeof(T))); +} + +} // namespace + +int main(int argc, char **argv) { + if (argc < 2) { + std::cerr + << "Usage: " << argv[0] + << " [kernel_name=vector_add] [num_elements=1048576]\n"; + return 1; + } + + const std::string ptxPath = argv[1]; + const std::string kernelName = (argc >= 3) ? argv[2] : std::string("vector_add"); + const int numElems = (argc >= 4) ? std::atoi(argv[3]) : (1 << 20); + const size_t numBytes = static_cast(numElems) * sizeof(float); + + // std::vector ptx; + // if (!loadFile(ptxPath, ptx)) { + // return 1; + // } + + // Load module from the PTX blob. + const uint64_t module_handle_for_launch = + cuda_shim_load_module_from_file( + reinterpret_cast(ptxPath.data()), + static_cast(ptxPath.size())); + if (module_handle_for_launch == 0) { + std::cerr << "Failed to load module from PTX: " << ptxPath << "\n"; + return 1; + } + + // cuda_shim_launch_packed expects a pointer to a CUmodule stored in host + // memory. Keep a stack copy and pass its address to satisfy that ABI. 
+ // CUmodule module = reinterpret_cast(module_handle_raw); + // const uint64_t module_handle_for_launch = + // reinterpret_cast(&module); + + const uint64_t stream = cuda_shim_stream_create(); + + // Allocate device buffers. + const uint64_t dOut = cuda_shim_malloc(numBytes, stream, /*is_host_shared=*/false); + const uint64_t dA = cuda_shim_malloc(numBytes, stream, /*is_host_shared=*/false); + const uint64_t dB = cuda_shim_malloc(numBytes, stream, /*is_host_shared=*/false); + + std::vector hA(numElems); + std::vector hB(numElems); + std::vector hOut(numElems, 0.0f); + for (int i = 0; i < numElems; ++i) { + hA[i] = static_cast(i) * 0.5f; + hB[i] = static_cast(i) * 1.5f; + } + + cuda_shim_memcpy_h2d(dA, reinterpret_cast(hA.data()), numBytes); + cuda_shim_memcpy_h2d(dB, reinterpret_cast(hB.data()), numBytes); + + // Pack kernel arguments: (float* out, const float* a, const float* b, int n) + std::vector argData; + std::vector argSizes; + const uint64_t argOut = dOut; + const uint64_t argA = dA; + const uint64_t argB = dB; + const int argN = numElems; + + appendArg(argData, argSizes, argA); + appendArg(argData, argSizes, argB); + appendArg(argData, argSizes, argOut); + appendArg(argData, argSizes, argN); + + const uint32_t blockX = 256; + const uint32_t gridX = static_cast((numElems + blockX - 1) / blockX); + + cuda_shim_launch_packed( + module_handle_for_launch, + reinterpret_cast(kernelName.c_str()), + gridX, 1, 1, + blockX, 1, 1, + /*sharedMemBytes=*/0, + stream, + reinterpret_cast(argData.data()), + reinterpret_cast(argSizes.data()), + static_cast(argSizes.size())); + + cuda_shim_stream_synchronize(stream); + + cuda_shim_memcpy_d2h(reinterpret_cast(hOut.data()), dOut, numBytes); + + // Quick correctness check. 
+ bool ok = true; + for (int i = 0; i < numElems; ++i) { + const float expect = hA[i] + hB[i]; + if (std::abs(hOut[i] - expect) > 1e-5f) { + std::cerr << "Mismatch at index " << i << ": got " << hOut[i] + << ", expected " << expect << "\n"; + ok = false; + break; + } + } + + std::cout << (ok ? "Success" : "Failure") << " for " << numElems + << " elements" << std::endl; + + cuda_shim_free(dOut, stream); + cuda_shim_free(dA, stream); + cuda_shim_free(dB, stream); + cuda_shim_stream_destroy(stream); + cuda_shim_unload_module(module_handle_for_launch); + + return ok ? 0 : 1; +} diff --git a/mlir/cuda-tile/cuda_shim/outlined_gpu_kernel.cu b/mlir/cuda-tile/cuda_shim/outlined_gpu_kernel.cu new file mode 100644 index 0000000..08f9e0f --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/outlined_gpu_kernel.cu @@ -0,0 +1,21 @@ +// please run inside `nvidia/cuda:12.4.1-devel-ubuntu22.04` container if you +// want to use the 4090 RTX GPU with 12.4<= cuda <= 13.0. +// pelase compile with the command: +// nvcc -std=c++17 -arch=sm_89 -cubin outlined_gpu_kernel.cu -o cuda_tile.cubin +#include +#include + +extern "C" __global__ void outlined_gpu_kernel_0(const float* a0, const float* a1, + const float* a2, float* out) { + // 2x4 = 8 elements, row-major with stride (4,1) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= 8) return; + + // out[tid] = a0[tid]*a1[tid] + a2[tid]*a1[tid] + float x0 = a0[tid]; + float x1 = a1[tid]; + float x2 = a2[tid]; + out[tid] = x0 * x1 + x2 * x1; + // 等价:out[tid] = (x0 + x2) * x1; +} +// A = [[1,2,3,9],[4,5,6,10]] B = [[11,12,13,114],[15,16,17,18]] diff --git a/mlir/cuda-tile/cuda_shim/vector_add.cu b/mlir/cuda-tile/cuda_shim/vector_add.cu new file mode 100644 index 0000000..7973dd2 --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/vector_add.cu @@ -0,0 +1,9 @@ +// Simple vector add kernel for PTX generation targeting Ada (RTX 4090). 
+// nvcc -std=c++17 -arch=sm_89 -ptx vector_add.cu -o vector_add.ptx +extern "C" __global__ void vector_add(const float *a, const float *b, + float *out, int n) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx] = a[idx] + b[idx]; + } +} diff --git a/mlir/cuda-tile/explore/example-nvvm.mlir b/mlir/cuda-tile/explore/example-nvvm.mlir new file mode 100644 index 0000000..39b9883 --- /dev/null +++ b/mlir/cuda-tile/explore/example-nvvm.mlir @@ -0,0 +1,60 @@ +module attributes {gpu.container_module} { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + gpu.binary @kernels [#gpu.object<#nvvm.target, properties = {ISAToBinaryTimeInMs = 8 : i64, LLVMIRToISATimeInMs = 6 : i64}, "P\EDU\BA\01\00\10\00`\18\00\00\00\00\00\00\02\00\01\01h\00\00\00@\16\00\00\00\00\00\00\00\00\00\00@\00\00\00\08\00\01\00x\00\00\00\00\00\00\00\00\00\00\00\11\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00H\00\00\00\1C\00\00\00\00\02\09\00\00\02\02\01\00\03\07\01\01\02\03\00\00\04\0B\08\00P\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\01A\08\00\00\00\00\00\00\00\02\00\BE\00\01\00\00\00\00\00\00\00\00\00\00\00(\15\00\00\00\00\00\00\A8\0F\00\00\00\00\00\00\02x\00\06@\008\00\05\00@\00\16\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.note.nv.tkinfo\00.note.nv.cuinfo\00.nv.info\00.nv.compat\00.text.kernel\00.nv.info.kernel\00.nv.shared.kernel\00.nv.shared.reserved.0\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.constant0.kernel\00.nv.capmerc.text.kernel\00.nv.merc.debug_frame\00.nv.merc.nv.info\00.nv.merc.nv.info.kernel\00.nv.merc.rela.debug_frame\00.nv.merc.nv.shared.reserved.0\00.nv.merc.symtab\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.note.nv.tkinfo\00.note.nv.cuinfo\00.nv.info\00.nv.compat\00.text.kernel\00.nv.info.kernel\00.nv.shared.kernel\00.nv.reserved
Smem.offset0\00.nv.shared.reserved.0\00__nv_reservedSMEM_offset_0_alias\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00kernel\00.nv.constant0.kernel\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\03\00\05\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\009\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00]\00\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8C\00\00\00!\00\00\00@\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\BB\00\00\00 \A0\0D\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0C\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00(\01\00\00\12\10\0C\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00/\01\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\04|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF,\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\04\18\00\00\00\0C\81\80\80(\00\04 \00\00\00\00\00\00\00\0C\00\00\00\8C\00\00\00\D0\07\00\00NVIDIA Corp\00\02\00\00\00\00\00\00\00\01\00\00\00\07\00\00\006\00\00\00`\00\00\00\00ptxas\00Cuda compilation tools, release 13.1, V13.1.80\00Build cuda_13.1.r13.1/compiler.36836380_0\00-O 3 -arch sm_120 \00\00\0C\00\00\00\08\00\00\00\E8\03\00\00NVIDIA 
Corp\00\02\00x\00\83\00\00\00\04/\08\00\08\00\00\00\0A\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\02\09\00\00\02\02\01\00\03\07\01\01\02\03\00\00\04\0B\08\00P\00\00\00\00\00\00\00\047\04\00\83\00\00\00\04\17\0C\00\00\00\00\00\0A\00P\00\00\F0!\00\04\17\0C\00\00\00\00\00\09\00H\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F4!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F4!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F4!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F4!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03P\00\00\03\1B\FF\00\03_\01\01\02J\00\00\04\1C\08\00P\00\00\00\E0\00\00\00\03\19X\00\04\0A\08\00\09\00\00\00\80\03X\00\046\04\00\00\00\00\00\00\00\00\00\FF\FF\FF\FF\00\00\00\00\FE\FF\FF\FF\00\00\00\00\FD\FF\FF\FF\00\00\00\00\FC\FF\FF\FFD\00\00\00\00\00\00\00\02\00\00\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\82{\01\FF\00\DF\00\00\00\08\00\00\00\E2\0F\00\19y\02\00\00\00\00\00\00!\00\00\00\22\0E\00\ACw\04\FF\00p\00\00\00\0A\00\08\00\22\0E\001t\03\FF\00\00\00\00\FF\01\00\00\00\CA\0F\00\0C|\00\02\04\00\00\00pb\F1\0B\00\DC\1F\00M\09\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\82{\04\FF\00\E4\00\00\00\0A\00\00\00\22\0E\00\ACw\04\FF\00k\00\00\00\0A\00\08\00n\0E\00\82{\06\FF\00\EE\00\00\00\0A\00\00\00\A2\0E\00%x\04\02\04\00\00\00\04\00\8E\07\00\CC\1F\00\81y\04\04\04\00\00\00\00\19\1E\0C\00\E2.\00%x\02\02\04\00\00\00\06\00\8E\07\00\C8O\00!r\07\04\04\00\00\00\00\00\00\00\00\CA\8F\00\86y\00\02\07\00\00\00\04\19\10\0C\00\E2\0F\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\FC\00\FC\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\
00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0C\00\00\00\01\00\00\C0\10\00\00\00(^\00\00\01\0B\04\0A\F8\00\04\00\00\00A\00\00\04\00\00\01\0B\04\0A\F8\00\04\00\00\00\81\00\01\02\00\00\02\22\08\06\FA\00R\00\00\00\03\01@\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00A\10v\0A\02\22\0E\06\F8\00R\00\00\00\03\01@\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\10\00\00\00\01\0B\0E\0A\FA\00\05\00\00\00\03\019\04\00\00\02\22\0E\06\F8\00R\00\00\00\83\01@\00\02\00\00\00\00\00\00\00\00\00\00\00\00\008\00\00\00\028\0E2\F8\00@\11\00\00\00\00\82\00\0A\00\00\02\01\C0\01\00\00\00\00\00\00\00\00\00\00\00\D0\04\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\01|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00p\01\00\00\00\00\00\00\04\10\00\00\00\04p\00\00\00\0C\81\80\80(\00\04\E0\00\00\00\00\00\00\00\00\00\00\00\00\04/\08\00\08\00\00\00\0A\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\047\04\00\83\00\00\00\04Z \00\8A\9D\22\A4\B1\9D\14m\00\B4*\F3\F7X\03\A5',!0\C9\1E\C7\8F\0F\0CIl\0A/\00\00\04\17\0C\00\00\00\00\00\0A\00P\00\00\F0!\00\04\17\0C\00\00\00\00\00\09\00H\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F4!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F4!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F4!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F4!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03P\00\00\03\1B\FF\00\03_\01\01\02J\00\00\04\1C\08\00p\00\00\00`\01\00\00\00\00\00\00D\00\00\00\00\00\00\00=\00\01\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\03\00\05\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\009\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00]\00\00\00\03\00\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8C\00\00\00!\00\00\00\00\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\BB\00\00\00 
\A0\14\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0C\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00(\01\00\00\12\10\0F\00\00\00\00\00\00\00\00\00p\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\00\00\00\00\A1\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0B\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\05\02\00\00\00\00\00\00D\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\13\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00P\03\00\00\00\00\00\00\F0\00\00\00\00\00\00\00\02\00\00\00\0A\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\A2\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\04\00\00\00\00\00\00h\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\07\00\00\00\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\A8\04\00\00\00\00\00\00\A4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\009\00\00\00\07\00\00\00@\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00L\05\00\00\00\00\00\00 
\00\00\00\00\00\00\00\05\00\00\00\08\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00I\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00l\05\00\00\00\00\00\00$\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00R\00\00\00\86\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\90\05\00\00\00\00\00\00\1C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00j\00\00\00\00\00\00p@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AC\05\00\00\00\00\00\00\EC\00\00\00\00\00\00\00\03\00\00\00\0C\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D2\00\00\00\01\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\98\06\00\00\00\00\00\00 \00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\C0\00\00\00\04\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B8\06\00\00\00\00\00\00\18\00\00\00\00\00\00\00\03\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00]\00\00\00\01\00\00\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\00\00\00\00\00\80\01\00\00\00\00\00\00\03\00\00\00\08\00\00\00\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8C\00\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\08\00\00\00\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EE\00\00\00\01\00\00\00B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\08\00\00\00\00\00\00\D8\03\00\00\00\00\00\00\00\00\00\00\0C\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\01\00\00\16\00\00p\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00`\0C\00\00\00\00\00\00\C6\00\00\00\00\00\00\00\15\00\00\00\08\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\1B\01\00\00\01\00\00\00\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00&\0D\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\01\00\00\83\00\00p\00\0
0\00\10\00\00\00\00\00\00\00\00\00\00\00\00\98\0D\00\00\00\00\00\00$\00\00\00\00\00\00\00\15\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00A\01\00\00\83\00\00p@\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\BC\0D\00\00\00\00\00\00\F8\00\00\00\00\00\00\00\15\00\00\00\0F\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00Y\01\00\00\82\00\00p@\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\B8\0E\00\00\00\00\00\00\18\00\00\00\00\00\00\00\15\00\00\00\10\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00s\01\00\00\15\00\00p\03\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\D0\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\91\01\00\00\85\00\00p\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\D0\0E\00\00\00\00\00\00\D8\00\00\00\00\00\00\00\02\00\00\00\08\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\06\00\00\00\04\00\00\00(\15\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\18\01\00\00\00\00\00\00\18\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\04\00\00\00(\15\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\18\01\00\00\00\00\00\00\18\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\80\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\06\00\00\00\80\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\04\00\00\00\80\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\03\00\00\00\00\00\00\D8\03\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01P\00\00\00h\01\00\00\00\00\00\00d\01\00\00@\00\00\00\07\00\08\00x\00\00\00\00\00\00\00\00\00\00\00\11\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\91\03\00\00\00\00\00\00H\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00(\B5/\FD`\91\02\D5\0A\00\A6
\D18\22Pk\F3\D86\03\EC\F9E\03\AF-\B3\B4\09\B9I\0F\E0(\8C\15\AD\B6Fm\EC\A1\C4\A3j0RG-\00/\00/\00\BD\02\D2\A3\F7\05v\F8\1Ay\B3\CB'\D2\C8\1A\F6\87H\1D\A5\FE\10\BE^e)\E7\A0\\\1D\FAW{\9DNp\F4\88\F4=\D2\BB\BB\01ky\DAb)f\C2\D0\7F6C\FE\BD\10;\B0\8A\22H8\0D\C3\96\A5\14GP\94\AFY\BE\E7!\A1\E6\AF\B3\93\E2\1Ck \CD\8E\F7o\06\EF\1FV31\D86i\BB\ECY\9DnK\0C\D6H\C2\B6\86\86\B1&\9C\14\A4#\0F\C7\B6\86\82\C0i\81r\B7-\12\0A\F8\09\B3\86)>\E1\00\1B\B1\DB\AA\FF\16l[\1A\AA\D2mK\A1\A3G\E0\B4\A0dV\E3\C0\87\F0mk8\0E\E3\D48\8C\0C-!\8Bi\CB)\0C\86s\07\01\1C4 \A0Bt\E6\01wj\F8\81\B3A\C0\B3~\B6\08'`yIJ\824,\19\A5\0C\9E\CB\98\18.iw\BC\CC\B2\D1*\0F;\84\0D\B9S\131o\D7\03\C2\166Wf\8B]l\1ElzLqp^Z\8F\9AR\D0\9E\90Cy\02l\0CEoN\91\0DCy\867P\C8\AC\8B\C4\04\D1\09\19\A1|\80\F2Y\C4V\E4\FB\9C\AB^\913?\98\03\83\02\00\00\00\00">] + llvm.func @main() { + %0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %1 = llvm.mlir.zero : !llvm.ptr + %2 = llvm.mlir.constant(6 : index) : i64 + %3 = llvm.mlir.constant(0 : index) : i64 + %4 = llvm.mlir.constant(1.000000e+00 : f32) : f32 + %5 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + %6 = llvm.mlir.constant(3.000000e+00 : f32) : f32 + %7 = llvm.mlir.constant(4.000000e+00 : f32) : f32 + %8 = llvm.mlir.constant(5.000000e+00 : f32) : f32 + %9 = llvm.mlir.constant(6.000000e+00 : f32) : f32 + %10 = llvm.mlir.constant(1 : index) : i64 + %11 = llvm.mlir.constant(2 : index) : i64 + %12 = llvm.getelementptr %1[6] : (!llvm.ptr) -> !llvm.ptr, f32 + %13 = llvm.ptrtoint %12 : !llvm.ptr to i64 + %14 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr + %15 = llvm.insertvalue %14, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %16 = llvm.insertvalue %14, %15[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %17 = llvm.insertvalue %3, %16[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %18 = llvm.insertvalue %2, %17[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %19 = 
llvm.insertvalue %10, %18[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %20 = builtin.unrealized_conversion_cast %19 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<6xf32> + llvm.store %4, %14 : f32, !llvm.ptr + %21 = llvm.getelementptr inbounds|nuw %14[1] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %5, %21 : f32, !llvm.ptr + %22 = llvm.getelementptr inbounds|nuw %14[2] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %6, %22 : f32, !llvm.ptr + %23 = llvm.getelementptr inbounds|nuw %14[3] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %7, %23 : f32, !llvm.ptr + %24 = llvm.getelementptr inbounds|nuw %14[4] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %8, %24 : f32, !llvm.ptr + %25 = llvm.getelementptr inbounds|nuw %14[5] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %9, %25 : f32, !llvm.ptr + %memref = gpu.alloc () : memref<6xf32, 1> + %26 = builtin.unrealized_conversion_cast %memref : memref<6xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %memref_0 = gpu.alloc () : memref<6xf32, 1> + %27 = builtin.unrealized_conversion_cast %memref_0 : memref<6xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + gpu.memcpy %memref, %20 : memref<6xf32, 1>, memref<6xf32> + %28 = llvm.extractvalue %26[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %29 = llvm.extractvalue %26[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %30 = llvm.extractvalue %26[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %31 = llvm.extractvalue %26[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %32 = llvm.extractvalue %26[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %33 = llvm.extractvalue %27[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %34 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 
x i64>)> + %35 = llvm.extractvalue %27[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %36 = llvm.extractvalue %27[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %37 = llvm.extractvalue %27[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + gpu.launch_func @kernels::@kernel blocks in (%10, %10, %10) threads in (%11, %10, %10) : i64 args(%11 : i64, %28 : !llvm.ptr<1>, %29 : !llvm.ptr<1>, %30 : i64, %31 : i64, %32 : i64, %33 : !llvm.ptr<1>, %34 : !llvm.ptr<1>, %35 : i64, %36 : i64, %37 : i64) + gpu.dealloc %memref : memref<6xf32, 1> + gpu.dealloc %memref_0 : memref<6xf32, 1> + llvm.call @free(%14) : (!llvm.ptr) -> () + llvm.return + } +} + diff --git a/mlir/cuda-tile/explore/extern_fun.mlir b/mlir/cuda-tile/explore/extern_fun.mlir new file mode 100644 index 0000000..0871803 --- /dev/null +++ b/mlir/cuda-tile/explore/extern_fun.mlir @@ -0,0 +1,113 @@ +module { + // libc + func.func private @malloc(i64) -> memref<*xi8> + func.func private @free(memref<*xi8>) + + // 轻量包装:仅用整数/布尔/opaque memref,避免 llvm.ptr 类型 + func.func private @shimMemAlloc(i64) -> i64 + func.func private @shimMemFree(i64) + func.func private @shimMemcpyHtoD(i64, memref<6xf32>) + func.func private @shimMemcpyDtoH(memref<6xf32>, i64) + func.func private @shimCtxSynchronize() + + func.func @main() { + %size_bytes = arith.constant 24 : i64 // 6 * sizeof(f32) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %f1 = arith.constant 1.0 : f32 + %f2 = arith.constant 2.0 : f32 + %f3 = arith.constant 3.0 : f32 + %f4 = arith.constant 4.0 : f32 + %f5 = arith.constant 5.0 : f32 + %f6 = arith.constant 6.0 : f32 + + // host buffer as memref + %h = memref.alloc() : memref<6xf32> + memref.store %f1, %h[%c0] : memref<6xf32> + memref.store %f2, %h[%c1] : memref<6xf32> + memref.store %f3, 
%h[%c2] : memref<6xf32> + memref.store %f4, %h[%c3] : memref<6xf32> + memref.store %f5, %h[%c4] : memref<6xf32> + memref.store %f6, %h[%c5] : memref<6xf32> + + // device alloc handle (as i64 pointer-sized integer) + %d = func.call @shimMemAlloc(%size_bytes) : (i64) -> i64 + + func.call @shimMemcpyHtoD(%d, %h) : (i64, memref<6xf32>) -> () + func.call @shimCtxSynchronize() : () -> () + func.call @shimMemcpyDtoH(%h, %d) : (memref<6xf32>, i64) -> () + func.call @shimCtxSynchronize() : () -> () + + func.call @shimMemFree(%d) : (i64) -> () + memref.dealloc %h : memref<6xf32> + func.return + } +} + +// module { +// // libc +// llvm.func @malloc(i64) -> !llvm.ptr +// llvm.func @free(!llvm.ptr) + +// // cuda_shim C 接口(来自 cuda_shim.cpp) +// llvm.func @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr +// llvm.func @mgpuMemFree(!llvm.ptr, !llvm.ptr) +// llvm.func @mgpuMemcpyHtoD(!llvm.ptr, !llvm.ptr, i64) +// llvm.func @mgpuMemcpyDtoH(!llvm.ptr, !llvm.ptr, i64) +// llvm.func @mgpuCtxSynchronize() + +// llvm.func @main() { +// %size = llvm.mlir.constant(24 : i64) : i64 // 6 * sizeof(f32) +// %zero_ptr = llvm.mlir.zero : !llvm.ptr // 空 stream +// %false = llvm.mlir.constant(false) : i1 + +// // host buffer +// %h = llvm.call @malloc(%size) : (i64) -> !llvm.ptr + +// // 写入 1..6 到 host +// %c0 = llvm.mlir.constant(0 : index) : i64 +// %c1 = llvm.mlir.constant(1 : index) : i64 +// %c2 = llvm.mlir.constant(2 : index) : i64 +// %c3 = llvm.mlir.constant(3 : index) : i64 +// %c4 = llvm.mlir.constant(4 : index) : i64 +// %c5 = llvm.mlir.constant(5 : index) : i64 +// %f1 = llvm.mlir.constant(1.0 : f32) : f32 +// %f2 = llvm.mlir.constant(2.0 : f32) : f32 +// %f3 = llvm.mlir.constant(3.0 : f32) : f32 +// %f4 = llvm.mlir.constant(4.0 : f32) : f32 +// %f5 = llvm.mlir.constant(5.0 : f32) : f32 +// %f6 = llvm.mlir.constant(6.0 : f32) : f32 + +// %p0 = llvm.getelementptr %h[%c0] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f1, %p0 : f32, !llvm.ptr +// %p1 = llvm.getelementptr %h[%c1] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f2, %p1 : f32, !llvm.ptr +// %p2 = llvm.getelementptr %h[%c2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f3, %p2 : f32, !llvm.ptr +// %p3 = llvm.getelementptr %h[%c3] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f4, %p3 : f32, !llvm.ptr +// %p4 = llvm.getelementptr %h[%c4] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f5, %p4 : f32, !llvm.ptr +// %p5 = llvm.getelementptr %h[%c5] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f6, %p5 : f32, !llvm.ptr + +// // device alloc (isHostShared = false) +// %d = llvm.call @mgpuMemAlloc(%size, %zero_ptr, %false) +// : (i64, !llvm.ptr, i1) -> !llvm.ptr + +// // HtoD then DtoH (round-trip) +// llvm.call @mgpuMemcpyHtoD(%d, %h, %size) : (!llvm.ptr, !llvm.ptr, i64) -> () +// llvm.call @mgpuCtxSynchronize() : () -> () +// llvm.call @mgpuMemcpyDtoH(%h, %d, %size) : (!llvm.ptr, !llvm.ptr, i64) -> () +// llvm.call @mgpuCtxSynchronize() : () -> () + +// // free +// llvm.call @mgpuMemFree(%d, %zero_ptr) : (!llvm.ptr, !llvm.ptr) -> () +// llvm.call @free(%h) : (!llvm.ptr) -> () +// llvm.return +// } +// } \ No newline at end of file diff --git a/mlir/cuda-tile/explore/gpu.mlir b/mlir/cuda-tile/explore/gpu.mlir new file mode 100644 index 0000000..bcef914 --- /dev/null +++ b/mlir/cuda-tile/explore/gpu.mlir @@ -0,0 +1,99 @@ +// module attributes {gpu.container_module} { +// // ---- Device side (GPU) ---- +// gpu.module @kernels { +// gpu.func @kernel(%n : index, %A : memref, %B : memref) +// attributes { gpu.kernel } { +// %tid = gpu.thread_id x +// %pred = arith.cmpi slt, %tid, %n : index +// scf.if %pred { +// %a = memref.load %A[%tid] : memref +// memref.store %a, %B[%tid] : memref +// } +// gpu.return +// } +// } + +// // ---- Host side (CPU) ---- +// func.func @main(%n : index, %hA : memref, %hB : memref) { +// %dA = gpu.alloc(%n) : memref +// %dB = gpu.alloc(%n) : memref +// gpu.memcpy %dA, %hA : memref, memref +// // launch kernel(blocks/threads 这里先写死成 
1D) +// %c1 = arith.constant 1 : index +// gpu.launch_func @kernels::@kernel +// blocks in (%c1, %c1, %c1) threads in (%n, %c1, %c1) +// args(%n : index, %dA : memref, %dB : memref) + +// gpu.memcpy %hB, %dB : memref, memref +// gpu.dealloc %dA : memref +// gpu.dealloc %dB : memref +// return +// } +// } + +module attributes {gpu.container_module} { + + gpu.module @kernels { + gpu.func @kernel(%n : index, %A : memref<6xf32, 1>, %B : memref<6xf32, 1>) + attributes { gpu.kernel } { + %tid = gpu.thread_id x + %pred = arith.cmpi slt, %tid, %n : index + scf.if %pred { + %a = memref.load %A[%tid] : memref<6xf32, 1> + %b = arith.addf %a, %a : f32 + memref.store %b, %B[%tid] : memref<6xf32, 1> + } + gpu.return + } + } + + func.func @main() { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 6.000000e+00 : f32 + %cst_0 = arith.constant 5.000000e+00 : f32 + %cst_1 = arith.constant 4.000000e+00 : f32 + %cst_2 = arith.constant 3.000000e+00 : f32 + %cst_3 = arith.constant 2.000000e+00 : f32 + %cst_4 = arith.constant 1.000000e+00 : f32 + + %0 = memref.alloc() : memref<6xf32> + %1 = memref.alloc() : memref<6xf32> + + affine.store %cst_4, %1[0] : memref<6xf32> + affine.store %cst_3, %1[1] : memref<6xf32> + affine.store %cst_2, %1[2] : memref<6xf32> + affine.store %cst_1, %1[3] : memref<6xf32> + affine.store %cst_0, %1[4] : memref<6xf32> + affine.store %cst, %1[5] : memref<6xf32> + + %n = arith.constant 2 : index + + %dA = gpu.alloc() : memref<6xf32, 1> + %dB = gpu.alloc() : memref<6xf32, 1> + gpu.memcpy %dA, %1 : memref<6xf32, 1>, memref<6xf32> + + // launch kernel(blocks/threads 这里先写死成 1D) + gpu.launch_func @kernels::@kernel + blocks in (%c1, %c1, %c1) threads in (%n, %c1, %c1) + args(%n : index, %dA : memref<6xf32, 1>, %dB : memref<6xf32, 1>) + + gpu.memcpy %0, %dB : memref<6xf32>, memref<6xf32, 1> + gpu.dealloc %dA : memref<6xf32, 1> + gpu.dealloc %dB : memref<6xf32, 1> + memref.dealloc %1 : memref<6xf32> + memref.dealloc %0 : 
memref<6xf32> + return + } +} +// func.func @main() { +// %c2 = arith.constant 2 : index +// %c1 = arith.constant 1 : index +// gpu.launch +// blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1) +// threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { +// gpu.printf "Hello from %d\n", %6 : index +// gpu.terminator +// } +// return +// } \ No newline at end of file diff --git a/mlir/cuda-tile/explore/outlined.mlir b/mlir/cuda-tile/explore/outlined.mlir new file mode 100644 index 0000000..714eddf --- /dev/null +++ b/mlir/cuda-tile/explore/outlined.mlir @@ -0,0 +1,22 @@ +module { + toy.func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> + %1 = toy.constant dense<[[1.100000e+01, 1.200000e+01, 1.300000e+01], [1.400000e+01, 1.500000e+01, 1.600000e+01]]> : tensor<2x3xf32> + %2 = toy.launch_gpu @outlined_gpu_kernel_0(%1, %0) {grid = array} : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x2xf32> + toy.print %2 : tensor<2x2xf32> + %3 = toy.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00], [1.000000e+01, 1.100000e+01, 1.200000e+01]]> : tensor<2x3xf32> + %4 = toy.launch_gpu @outlined_gpu_kernel_1(%0, %3, %1) {grid = array} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + toy.print %4 : tensor<2x3xf32> + toy.return + } + toy.gpu_func @outlined_gpu_kernel_0(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<2x2xf32> { + %0 = toy.transpose(%arg0 : tensor<2x3xf32>) to tensor<3x2xf32> + %1 = toy.matmul(%arg1 : tensor<2x3xf32>, %0 : tensor<3x2xf32>) to tensor<2x2xf32> + toy.return %1 : tensor<2x2xf32> + } + toy.gpu_func @outlined_gpu_kernel_1(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xf32>) -> tensor<2x3xf32> { + %0 = toy.mul %arg0, %arg1 : tensor<2x3xf32> + %1 = toy.add %0, %arg2 : tensor<2x3xf32> + toy.return %1 : tensor<2x3xf32> + } +} diff --git a/mlir/cuda-tile/explore/run.sh b/mlir/cuda-tile/explore/run.sh new 
file mode 100644 index 0000000..4caf90b --- /dev/null +++ b/mlir/cuda-tile/explore/run.sh @@ -0,0 +1,121 @@ +export MLIR_RUNNER_UTILS=`pwd`/../third_party/llvm/lib/libmlir_runner_utils.so +export MLIR_CUDA_RUNTIME=`pwd`/../third_party/llvm/lib/libmlir_cuda_runtime.so + +# Set this to your GPU arch, e.g. sm_120 for RTX 50xx (if your toolchain supports it). +export CUDA_ARCH=${CUDA_ARCH:-sm_120} + +rm -rf example-nvvm.mlir example.ll + + +../third_party/llvm/bin/mlir-opt gpu.mlir -cse \ + -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_120 opt-level=3" \ + --reconcile-unrealized-casts -cse -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# nvvm-attach-target{chip=sm_80 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# gpu-module-to-binary, +# lower-host-to-llvm +# )" \ +# -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 opt-level=3" \ +# -reconcile-unrealized-casts \ +# -canonicalize -cse \ +# -o example-nvvm.mlir + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --convert-scf-to-cf \ +# --convert-index-to-llvm \ +# --convert-arith-to-llvm \ +# --finalize-memref-to-llvm \ +# --convert-cf-to-llvm \ +# --convert-func-to-llvm \ +# --convert-to-llvm \ +# --reconcile-unrealized-casts \ +# -canonicalize -cse \ +# -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_120 opt-level=3" \ +# -reconcile-unrealized-casts \ +# -o example-nvvm.mlir + +# --gpu-to-llvm="use-bare-pointers-for-kernels=1 intersperse-sizes-for-kernels=1" +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# nvvm-attach-target{chip=sm_89 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# convert-scf-to-cf, +# convert-index-to-llvm, +# convert-arith-to-llvm, +# convert-math-to-llvm, +# convert-func-to-llvm, +# gpu-to-llvm, +# convert-cf-to-llvm, +# finalize-memref-to-llvm, +# 
gpu-module-to-binary, +# reconcile-unrealized-casts +# )" -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --gpu-to-llvm + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# gpu-kernel-outlining, +# nvvm-attach-target{chip=sm_80 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# gpu-module-to-binary, +# convert-scf-to-cf, +# convert-cf-to-llvm, +# lower-host-to-llvm, +# reconcile-unrealized-casts +# )" \ +# -o example-nvvm.mlir + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# gpu-kernel-outlining, +# nvvm-attach-target{chip=sm_80 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# gpu-module-to-binary, + +# gpu-to-llvm, + +# convert-scf-to-cf, +# convert-index-to-llvm, +# convert-arith-to-llvm, +# convert-memref-to-llvm, +# finalize-memref-to-llvm, +# convert-cf-to-llvm, +# convert-func-to-llvm, + +# reconcile-unrealized-casts +# )" \ +# -o example-nvvm.mlir + + +../third_party/llvm/bin/mlir-translate example-nvvm.mlir \ + --mlir-to-llvmir \ + -o example.ll + + # -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=${CUDA_ARCH}" + + # | ../third_party/llvm/bin/mlir-runner \ + # --shared-libs=$MLIR_CUDA_RUNTIME \ + # --shared-libs=$MLIR_RUNNER_UTILS \ + # --entry-point=_mlir_ciface_main \ + # --entry-point-result=void + +# ../third_party/llvm/bin/mlir-runner example-nvvm.mlir \ +# --entry-point-result=void \ +# --shared-libs=${MLIR_RUNNER_UTILS} \ +# --shared-libs=${MLIR_CUDA_RUNTIME} + diff --git a/mlir/cuda-tile/sample/cuda-tile.mlir b/mlir/cuda-tile/sample/cuda-tile.mlir new file mode 100644 index 0000000..72d424d --- /dev/null +++ b/mlir/cuda-tile/sample/cuda-tile.mlir @@ -0,0 +1,32 @@ +module { + toy.func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00, 9.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00, 1.000000e+01]]> : tensor<2x4xf32> + %1 = toy.constant dense<[[1.100000e+01, 1.200000e+01, 1.300000e+01, 1.400000e+01], [1.500000e+01, 
1.600000e+01, 1.700000e+01, 1.800000e+01]]> : tensor<2x4xf32> + %2 = toy.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00, 1.300000e+01], [1.000000e+01, 1.100000e+01, 1.200000e+01, 1.400000e+01]]> : tensor<2x4xf32> + %3 = toy.launch_gpu @outlined_gpu_kernel_0(%0, %2, %1) {grid = array} : (tensor<2x4xf32>, tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32> + toy.print %3 : tensor<2x4xf32> + toy.return + } + cuda_tile.module @cuda_tile_module { + entry @outlined_gpu_kernel_0(%arg0: tile>, %arg1: tile>, %arg2: tile>, %arg3: tile>) { + %tview = make_tensor_view %arg0, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview = make_partition_view %tview : partition_view> + %cst_0_i32 = constant : tile + %tile, %result_token = load_view_tko weak %pview[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %tview_0 = make_tensor_view %arg1, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview_1 = make_partition_view %tview_0 : partition_view> + %tile_2, %result_token_3 = load_view_tko weak %pview_1[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %0 = mulf %tile, %tile_2 : tile<2x4xf32> + %tview_4 = make_tensor_view %arg2, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview_5 = make_partition_view %tview_4 : partition_view> + %tile_6, %result_token_7 = load_view_tko weak %pview_5[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %tile_8, %result_token_9 = load_view_tko weak %pview_1[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %1 = mulf %tile_6, %tile_8 : tile<2x4xf32> + %2 = addf %0, %1 : tile<2x4xf32> + %tview_10 = make_tensor_view %arg3, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview_11 = make_partition_view %tview_10 : partition_view> + %3 = store_view_tko weak %2, %pview_11[%cst_0_i32, %cst_0_i32] : tile<2x4xf32>, partition_view>, tile -> token + 
return + } + } +} diff --git a/mlir/cuda-tile/sample/example.toy b/mlir/cuda-tile/sample/example.toy new file mode 100644 index 0000000..724a23e --- /dev/null +++ b/mlir/cuda-tile/sample/example.toy @@ -0,0 +1,13 @@ +def main() { + # Define a variable `a` with shape <2, 3>, initialized with the literal value. + # The shape is inferred from the supplied literal. + var a = [[1, 2, 3], [4, 5, 6]]; + + # b is identical to a, the literal tensor is implicitly reshaped: defining new + # variables is the way to reshape tensors (element count must match). + var b<2, 3> = [1, 2, 3, 4, 5, 6]; + + # transpose() and print() are the only builtin, the following will transpose + # a and b and perform an element-wise multiplication before printing the result. + print(transpose(a) * transpose(b)); +} diff --git a/mlir/cuda-tile/sample/gpu-func.mlir b/mlir/cuda-tile/sample/gpu-func.mlir new file mode 100644 index 0000000..f5280bd --- /dev/null +++ b/mlir/cuda-tile/sample/gpu-func.mlir @@ -0,0 +1,199 @@ +module { + // --- CUDA shim externs (ABI: all pointers/handles are i64) --- + func.func private @cuda_shim_load_module_from_image(i64, i64) -> i64 + func.func private @cuda_shim_load_module_from_file(i64, i64) -> i64 + func.func private @cuda_shim_unload_module(i64) -> () + func.func private @cuda_shim_stream_create() -> i64 + func.func private @cuda_shim_stream_destroy(i64) -> () + func.func private @cuda_shim_stream_synchronize(i64) -> () + func.func private @cuda_shim_malloc(i64, i64, i1) -> i64 + func.func private @cuda_shim_free(i64, i64) -> () + func.func private @cuda_shim_memcpy_h2d(i64, i64, i64) -> () + func.func private @cuda_shim_memcpy_d2h(i64, i64, i64) -> () + func.func private @cuda_shim_launch_packed( + i64, i64, + i32, i32, i32, + i32, i32, i32, + i32, + i64, + i64, i64, + i32) -> () + func.func private @cuda_debug_dump_float(i64, i32) -> () + + // // --- GPU blob embedded (placeholder bytes for "cuda_tile.cubin") --- + // memref.global "private" constant @cuda_blob 
: memref<16xi8> = dense< + // [99, 117, 100, 97, 95, 116, 105, 108, 101, 46, 99, 117, 98, 105, 110, 0] + // > : memref<16xi8> + + // // --- Kernel name as a C string (NUL-terminated) --- + // // 注意:如果 driver 侧用 name 查找函数,这个字符串必须以 0 结尾。 + // memref.global "private" constant @kname : memref<22xi8> = dense<[ + // 111,117,116,108,105,110,101,100,95,103,112,117,95,107,101,114,110,101,108,95,48,0 + // ]> : memref<22xi8> + + memref.global "private" constant @cuda_blob : memref<16xi8> = + dense<"0x637564615f74696c652e637562696e00"> + + memref.global "private" constant @kname : memref<22xi8> = + dense<"0x6f75746c696e65645f6770755f6b65726e656c5f3000"> + + func.func @main() { + // ---------- Host buffers (after bufferization) ---------- + %hA = memref.alloc() : memref<2x4xf32> + %hB = memref.alloc() : memref<2x4xf32> + %hOut = memref.alloc() : memref<2x4xf32> + + // Fill constants (为了示例直接用 store 展开;真实 pipeline 通常会从 memref.global copy) + // A + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + + %cf1 = arith.constant 1.0 : f32 + %cf2 = arith.constant 2.0 : f32 + %cf3 = arith.constant 3.0 : f32 + %cf9 = arith.constant 9.0 : f32 + + %cf4 = arith.constant 4.0 : f32 + %cf5 = arith.constant 5.0 : f32 + %cf6 = arith.constant 6.0 : f32 + %cf10 = arith.constant 10.0 : f32 + + %cf11 = arith.constant 11.0 : f32 + %cf12 = arith.constant 12.0 : f32 + %cf13 = arith.constant 13.0 : f32 + %cf14 = arith.constant 14.0 : f32 + %cf15 = arith.constant 15.0 : f32 + %cf16 = arith.constant 16.0 : f32 + %cf17 = arith.constant 17.0 : f32 + %cf18 = arith.constant 18.0 : f32 + + // row0 + memref.store %cf1, %hA[%c0, %c0] : memref<2x4xf32> + memref.store %cf2, %hA[%c0, %c1] : memref<2x4xf32> + memref.store %cf3, %hA[%c0, %c2] : memref<2x4xf32> + memref.store %cf9, %hA[%c0, %c3] : 
memref<2x4xf32> + + // row1 + memref.store %cf4, %hA[%c1, %c0] : memref<2x4xf32> + memref.store %cf5, %hA[%c1, %c1] : memref<2x4xf32> + memref.store %cf6, %hA[%c1, %c2] : memref<2x4xf32> + memref.store %cf10, %hA[%c1, %c3] : memref<2x4xf32> + + // B = %1 in your original (这里假设 %1 是第二个输入;你原 op 里是 (%0, %2, %1),请按你 kernel 的真实语义对齐) + memref.store %cf11, %hB[%c0, %c0] : memref<2x4xf32> + memref.store %cf12, %hB[%c0, %c1] : memref<2x4xf32> + memref.store %cf13, %hB[%c0, %c2] : memref<2x4xf32> + memref.store %cf14, %hB[%c0, %c3] : memref<2x4xf32> + memref.store %cf15, %hB[%c1, %c0] : memref<2x4xf32> + memref.store %cf16, %hB[%c1, %c1] : memref<2x4xf32> + memref.store %cf17, %hB[%c1, %c2] : memref<2x4xf32> + memref.store %cf18, %hB[%c1, %c3] : memref<2x4xf32> + + // ---------- Load module ---------- + %blob = memref.get_global @cuda_blob : memref<16xi8> + %blob_ptr_idx = memref.extract_aligned_pointer_as_index %blob : memref<16xi8> -> index + %blob_ptr_i64 = arith.index_cast %blob_ptr_idx : index to i64 + %blobSize = arith.constant 16 : i64 + %mod = func.call @cuda_shim_load_module_from_file(%blob_ptr_i64, %blobSize) : (i64, i64) -> i64 + + // kernel name pointer + %kn = memref.get_global @kname : memref<22xi8> + %kname_ptr_idx = memref.extract_aligned_pointer_as_index %kn : memref<22xi8> -> index + %kname_ptr_i64 = arith.index_cast %kname_ptr_idx : index to i64 + + // ---------- Stream + device alloc ---------- + %stream = func.call @cuda_shim_stream_create() : () -> i64 + %isHostShared = arith.constant 0 : i1 + + %nElems = arith.constant 8 : i32 + %nbytes = arith.constant 32 : i64 // 2*4*f32 = 8 * 4 = 32 bytes + + %dA = func.call @cuda_shim_malloc(%nbytes, %stream, %isHostShared) : (i64, i64, i1) -> i64 + %dB = func.call @cuda_shim_malloc(%nbytes, %stream, %isHostShared) : (i64, i64, i1) -> i64 + %dOut = func.call @cuda_shim_malloc(%nbytes, %stream, %isHostShared) : (i64, i64, i1) -> i64 + + // host ptrs (as i64) + %hAptr = memref.extract_aligned_pointer_as_index %hA : 
memref<2x4xf32> -> index + %hBptr = memref.extract_aligned_pointer_as_index %hB : memref<2x4xf32> -> index + %hOutptr = memref.extract_aligned_pointer_as_index %hOut : memref<2x4xf32> -> index + + // host memrefs -> i64 + %hA_ptr_i64 = arith.index_cast %hAptr : index to i64 + %hB_ptr_i64 = arith.index_cast %hBptr : index to i64 + %hOut_ptr_i64 = arith.index_cast %hOutptr : index to i64 + + func.call @cuda_shim_memcpy_h2d(%dA, %hA_ptr_i64, %nbytes) : (i64, i64, i64) -> () + func.call @cuda_shim_memcpy_h2d(%dB, %hB_ptr_i64, %nbytes) : (i64, i64, i64) -> () + + // ---------- Build argSlots / argSizes (方案 A) ---------- + // 这里 num_args=4: (A, B, Out, N) + // 注意:参数顺序必须和 @outlined_gpu_kernel_0 的 PTX param_0.. 一致 + %numArgs = arith.constant 4 : index + %argSlots = memref.alloc() : memref<4xi64> + %argSizes = memref.alloc() : memref<4xi64> + %c8 = arith.constant 8 : i64 + %ci4 = arith.constant 4 : i64 + + // num_args = 4 + // i=0 a0 + memref.store %c8, %argSizes[%c0] : memref<4xi64> + memref.store %dA, %argSlots[%c0] : memref<4xi64> + + // i=1 a1 + memref.store %c8, %argSizes[%c1] : memref<4xi64> + memref.store %dB, %argSlots[%c1] : memref<4xi64> + + // i=2 a2 (你需要一个 dC,对应第三个输入) + memref.store %c8, %argSizes[%c2] : memref<4xi64> + memref.store %dB, %argSlots[%c2] : memref<4xi64> + + // i=3 out + memref.store %c8, %argSizes[%c3] : memref<4xi64> + memref.store %dOut, %argSlots[%c3] : memref<4xi64> + + // pointers to argSlots/argSizes (as i64) + %argSlotsptr = memref.extract_aligned_pointer_as_index %argSlots : memref<4xi64> -> index + %argSlots_ptr_i64 = arith.index_cast %argSlotsptr : index to i64 + %argSizesptr = memref.extract_aligned_pointer_as_index %argSizes : memref<4xi64> -> index + %argSizes_ptr_i64 = arith.index_cast %argSizesptr : index to i64 + + // ---------- Launch ---------- + %gridX = arith.constant 1 : i32 + %gridY = arith.constant 1 : i32 + %gridZ = arith.constant 1 : i32 + %blockX = arith.constant 8 : i32 + %blockY = arith.constant 1 : i32 + %blockZ = 
arith.constant 1 : i32 + %shmem = arith.constant 0 : i32 + %numArgsI32 = arith.constant 4 : i32 + + func.call @cuda_shim_launch_packed( + %mod, %kname_ptr_i64, + %gridX, %gridY, %gridZ, + %blockX, %blockY, %blockZ, + %shmem, %stream, + %argSlots_ptr_i64, %argSizes_ptr_i64, %numArgsI32 + ) : (i64, i64, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32) -> () + + func.call @cuda_shim_stream_synchronize(%stream) : (i64) -> () + func.call @cuda_shim_memcpy_d2h(%hOut_ptr_i64, %dOut, %nbytes) : (i64, i64, i64) -> () + + %ci8 = arith.constant 8 : i32 + func.call @cuda_debug_dump_float(%hOut_ptr_i64, %ci8) : (i64, i32) -> () + + // ---------- Cleanup ---------- + func.call @cuda_shim_free(%dOut, %stream) : (i64, i64) -> () + func.call @cuda_shim_free(%dA, %stream) : (i64, i64) -> () + func.call @cuda_shim_free(%dB, %stream) : (i64, i64) -> () + func.call @cuda_shim_stream_destroy(%stream) : (i64) -> () + func.call @cuda_shim_unload_module(%mod) : (i64) -> () + + return + } +} diff --git a/mlir/cuda-tile/sample/gpu.mlir b/mlir/cuda-tile/sample/gpu.mlir new file mode 100644 index 0000000..1149b17 --- /dev/null +++ b/mlir/cuda-tile/sample/gpu.mlir @@ -0,0 +1,13 @@ +toy.gpu_func @my_kernel(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> tensor<2x2xf32> { + %2 = toy.matmul(%arg0 : tensor<2x3xf32>, %arg1 : tensor<3x2xf32>) to tensor<2x2xf32> + toy.return %2 : tensor<2x2xf32> +} + +toy.func @main() { + %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> + %3 = toy.constant dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf32> + %4 = toy.launch_gpu @my_kernel(%1, %3) {grid = [16, 16, 1]} + : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32> + toy.print %4 : tensor<2x2xf32> + toy.return +} \ No newline at end of file diff --git a/mlir/cuda-tile/sample/lowering-llvm.sh b/mlir/cuda-tile/sample/lowering-llvm.sh new file mode 100644 
index 0000000..31275da --- /dev/null +++ b/mlir/cuda-tile/sample/lowering-llvm.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +./third_party/llvm/bin/mlir-opt sample/gpu-func.mlir \ + -canonicalize -cse \ + -convert-scf-to-cf \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -finalize-memref-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o lowered-llvm-dialect.mlir + +./third_party/llvm/bin/mlir-translate lowered-llvm-dialect.mlir --mlir-to-llvmir -o lowered.ll + +clang++ -O2 lowered.ll cuda_shim/cuda_shim.cc \ + -I/usr/local/cuda/include \ + -L/usr/lib/x86_64-linux-gnu \ + -lcuda -ldl -lpthread -o cuda_shim/a.out diff --git a/mlir/cuda-tile/sample/matmul.toy b/mlir/cuda-tile/sample/matmul.toy new file mode 100644 index 0000000..a3a7406 --- /dev/null +++ b/mlir/cuda-tile/sample/matmul.toy @@ -0,0 +1,16 @@ +def main() { + # Define a variable `a` with shape <2, 3>, initialized with the literal value. + # The shape is inferred from the supplied literal. + var a = [[1, 2, 3], [4, 5, 6]]; + + # b is identical to a, the literal tensor is implicitly reshaped: defining new + # variables is the way to reshape tensors (element count must match). + var b<2, 3> = [11, 12, 13, 14, 15, 16]; + + # transpose() and print() are the only builtin, the following will transpose + # a and b and perform an element-wise multiplication before printing the result. 
+ # print(a * b + b); + print(matmul(a, transpose(b))); + var c<2, 3> = [[7, 8, 9], [10, 11, 12]]; + print(a * c + b); +} diff --git a/mlir/cuda-tile/sample/matmul.toy.mlir b/mlir/cuda-tile/sample/matmul.toy.mlir new file mode 100644 index 0000000..5a0cd7e --- /dev/null +++ b/mlir/cuda-tile/sample/matmul.toy.mlir @@ -0,0 +1,16 @@ +toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> + toy.return %2 : tensor<*xf64> +} + +toy.func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> + %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> + toy.print %4 : tensor<*xf64> + toy.return +} diff --git a/mlir/cuda-tile/sample/validation.py b/mlir/cuda-tile/sample/validation.py new file mode 100644 index 0000000..0dfaa8f --- /dev/null +++ b/mlir/cuda-tile/sample/validation.py @@ -0,0 +1,68 @@ +import numpy as np + +a = np.array([2.925513671, 6.013753211, 3.436855551, 4.960371591, + 4.681581191, 5.326281361, 5.998438671, 8.386758501, + 9.752656342, 7.606886601, 8.787891511, 5.022086611, + 5.778500111, 1.956275721, 3.159872631, 4.753185801, + 4.316574111, 9.194122231, 2.881049471, 3.916830801, + 3.263586051, 4.109766771, 5.528311631, 1.539308991, + 5.302321521, 1.503789272, 2.398510571, 5.142972961, + 1.797829151, 8.899659632, 3.695569372, 8.029364891, + 1.884190321, 3.671349811, 9.706177531, 2.866185471, + 1.073066151, 2.093449371, 3.606837071, 5.202953681, + 
6.973824941, 9.923411781, 4.807603021, 6.136584111, + 9.463426811, 9.934010851, 9.836562501, 2.048052841, + 8.976619421, 4.930650072, 5.020764221, 1.101745381, + 6.731029581, 3.154495471, 2.699037381, 4.032752141, + 1.474981521, 4.788043021, 7.608503651, 6.209790112, + 5.128346451, 9.150372381, 3.007619561, 3.416143261, + 1.826820941, 7.537852171, 5.316156761, 5.950802521, + 1.518586911, 1.001132911, 2.004818211, 2.139243582, + 9.806134011, 1.351495721, 4.017863671, 4.146281171, + 1.592400301, 9.376376831, 1.980771871, 6.190763001, + 8.168091671, 2.600631121, 2.321023891, 5.794502551, + 7.501876551, 5.399619291, 2.130964981, 4.139401911, + 5.462168121, 6.526603071, 4.334487231, 2.192541331, + 3.127341681, 6.315157581, 5.794951491, 7.532202441, + 8.384152712, 1.147242581, 1.996822871, 7.156064043, + 7.863002391, 5.663554241, 4.847668712, 8.658924581, + 7.971906902, 1.862331661, 2.062700981, 5.801351942, + 9.898728741, 6.012050671, 3.713753551, 9.637241861, + 8.075792551, 5.958895871, 9.693631161, 7.412511441, + 2.318546031, 3.472298001, 6.612168271, 4.795551511, + 7.590619801, 3.205424901, 4.464052241, 8.846670731, + 8.922903701, 2.567556341, 2.973210511, 2.289122891, + 1.104611521, 1.238846451, 4.810462721, 7.478858471, + 6.446008071, 4.210157321, 9.251681432, 6.314803181, + 7.613188801, 7.390693661, 9.741470081, 6.972811172, + 6.071724461, 4.497292381, 7.952869721, 8.657016631, + 1.621084282, 7.315808571, 7.997440491, 8.171035271, + 1.666892991, 9.494730721, 8.183356871, 8.251520681, + 3.532738031, 9.830194771, 3.670915731, 6.443815381, + 5.592103051, 6.126956221, 1.812470391, 5.291468291, + 9.935006871, 1.537468861, 4.185508861, 1.559862371, + 6.173374171, 5.120825331, 9.526368891, 7.001111811, + 2.550614511, 8.532620091, 6.003902941, 6.713298151, + 9.577618433, 8.392393061, 7.069470191, 9.896941411, + 9.897729712, 6.371231981, 6.560693141, 7.791094681, + 1.016464861, 3.457840971, 4.575334621, 7.167314761, + 2.742804381, 5.896408001, 6.789680541, 6.004492881, + 
1.284152471, 5.443332191, 4.528292101, 1.636873071, + 5.688349201, 5.126366571, 2.749963801, 8.144933001, + 9.489360822, 5.326660891, 7.052969211, 8.349262171, + 4.898597061, 6.857033311, 6.328954281, 7.333406851, + 2.588288651, 5.066354511, 9.736569781, 2.309512701, + 5.342955001, 7.282803981, 4.335288321, 6.845356621, + 4.755166231, 9.859116961, 2.263494621, 3.661889861, + 7.990642211, 1.168159651, 4.607092851, 5.881411521, + 5.437404021, 3.550808271, 6.298701342, 7.403695201, + 8.327089262, 8.217354791, 2.330936821, 3.294671993, + 1.249379871, 5.940774332, 7.761280541, 2.763552581, + 6.695427572, 7.014880361, 1.449679711, 7.297992291, + 1.819770801, 8.467591604, 6.751986351, 5.163341051, + 2.313086431, 7.699937171, 4.172433101, 8.029572921, + 8.581235361, 7.306871671, 7.148568021, 5.411956301, + 2.408045811, 1.700581061, 5.210921901, 6.552318091, + 3.749144091, 1.089704561, 3.245265091, 5.736190581]).reshape(16, 16) + +print(a @ a.T) diff --git a/mlir/cuda-tile/scripts/apply_patch.sh b/mlir/cuda-tile/scripts/apply_patch.sh new file mode 100644 index 0000000..bdc4b67 --- /dev/null +++ b/mlir/cuda-tile/scripts/apply_patch.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +rm -rf Ch8 +cp -R Ch7 Ch8 +cd Ch8 +git apply ../scripts/patch/matmul.back.patch diff --git a/mlir/cuda-tile/scripts/build_cuda_tile.sh b/mlir/cuda-tile/scripts/build_cuda_tile.sh new file mode 100644 index 0000000..c74c728 --- /dev/null +++ b/mlir/cuda-tile/scripts/build_cuda_tile.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [[ -f "/usr/bin/git" ]]; then + WORKSPACEROOT=$(git rev-parse --show-toplevel)/mlir/cuda-tile || WORKSPACEROOT=`pwd` +fi + +echo "Building cuda-tile IR in ${WORKSPACEROOT}/third_party/cuda-tile" + +cd ${WORKSPACEROOT}/third_party/cuda-tile + +git checkout -q -- . 
+ +rm -rf build + +cmake -G Ninja -S ${WORKSPACEROOT}/third_party/cuda-tile -B build \ + -DCMAKE_BUILD_TYPE=Debug \ + -DLLVM_ENABLE_ASSERTIONS=OFF \ + -DCUDA_TILE_ENABLE_BINDINGS_PYTHON=OFF \ + -DCUDA_TILE_ENABLE_TESTING=OFF \ + -DCMAKE_INSTALL_PREFIX=${WORKSPACEROOT}/third_party/cuda \ + -DCUDA_TILE_USE_LLVM_INSTALL_DIR=${WORKSPACEROOT}/third_party/llvm + +cmake --build build + +cd build +cmake --install . diff --git a/mlir/cuda-tile/scripts/build_deps.sh b/mlir/cuda-tile/scripts/build_deps.sh new file mode 100644 index 0000000..ef5314e --- /dev/null +++ b/mlir/cuda-tile/scripts/build_deps.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +#if [[ $# -ne 2 ]] ; then +# echo "Usage: $0 " +# exit 1 +#fi + +if [[ -f "/usr/bin/git" ]]; then + WORKSPACEROOT=$(git rev-parse --show-toplevel)/mlir/cuda-tile || WORKSPACEROOT=`pwd` +fi + +cd ${WORKSPACEROOT} + +# LLVM source +LLVM_SRC_DIR="${1:-${WORKSPACEROOT}/third_party/llvm-project}" +build_dir="${LLVM_SRC_DIR}/build" +install_dir="${2:-${WORKSPACEROOT}/third_party/llvm}" + +if ! [ -f "$LLVM_SRC_DIR/llvm/CMakeLists.txt" ]; then + echo "Expected the path to LLVM to be set correctly (got '$LLVM_SRC_DIR'): can't find CMakeLists.txt" + exit 1 +fi +echo "Using LLVM source dir: $LLVM_SRC_DIR" + +# Setup directories. 
+echo "Building MLIR in $build_dir" +rm -rf "$build_dir" +mkdir -p "$build_dir" + +echo "Installing MLIR in $install_dir" +rm -rf ${install_dir} +mkdir -p ${install_dir} + +echo "Beginning build (commands will echo)" +set -x + +cd $LLVM_SRC_DIR + +cmake -GNinja \ + "-H llvm" \ + "-B $build_dir" \ + -DCMAKE_BUILD_TYPE=Debug \ + -DLLVM_ENABLE_PROJECTS=mlir \ + -DLLVM_TARGETS_TO_BUILD="X86;NVPTX;AMDGPU" \ + -DLLVM_ENABLE_LLD=OFF \ + -DLLVM_ENABLE_BACKTRACES=OFF \ + -DLLVM_INCLUDE_UTILS=ON \ + -DCMAKE_INSTALL_PREFIX=${install_dir} \ + -DLLVM_INSTALL_UTILS=ON \ + -DLLVM_BUILD_UTILS=ON \ + -DLLVM_INCLUDE_TOOLS=ON \ + -DLLVM_BUILD_TOOLS=ON \ + -DLLVM_BUILD_LLVM_DYLIB=ON \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DMLIR_ENABLE_CUDA_RUNNER=ON \ + -DCMAKE_C_COMPILER=clang \ + -DLLVM_LINK_LLVM_DYLIB=ON + + # -DLLVM_ENABLE_RTTI=ON \ + # -DLLVM_ENABLE_LIBEDIT=OFF \ + # -DLLVM_ENABLE_BINDINGS=OFF \ + # -DLLVM_INCLUDE_DOCS=OFF \ + # -DLLVM_INCLUDE_TESTS=ON \ + # -DLLVM_INCLUDE_BENCHMARKS=OFF \ + # -DLLVM_ENABLE_BACKTRACES=ON \ + # -DLLVM_INCLUDE_EXAMPLES=OFF \ + # -DLLVM_ENABLE_ASSERTIONS=On + # -DBUILD_SHARED_LIBS=ON \ + +# cmake --build "$build_dir" +cmake --build "$build_dir" + +pushd "$build_dir" +ninja install +popd + +# bash scripts/build_cuda_tile.sh diff --git a/mlir/cuda-tile/scripts/make_patch.sh b/mlir/cuda-tile/scripts/make_patch.sh new file mode 100644 index 0000000..d82ebdd --- /dev/null +++ b/mlir/cuda-tile/scripts/make_patch.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Run under the workspace root dir + +diff -urN Ch7 Ch8 > scripts/patch/matmul.patch diff --git a/mlir/cuda-tile/scripts/patch/matmul.back.patch b/mlir/cuda-tile/scripts/patch/matmul.back.patch new file mode 100644 index 0000000..6632148 --- /dev/null +++ b/mlir/cuda-tile/scripts/patch/matmul.back.patch @@ -0,0 +1,373 @@ +diff -urN Ch7/CMakeLists.txt Ch8/CMakeLists.txt +--- Ch7/CMakeLists.txt 2023-12-06 04:57:18.788273480 +0000 ++++ Ch8/CMakeLists.txt 2024-10-01 13:51:09.920421616 +0000 +@@ -6,10 +6,10 @@ + + 
set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) + mlir_tablegen(ToyCombine.inc -gen-rewriters) +-add_public_tablegen_target(ToyCh7CombineIncGen) ++add_public_tablegen_target(ToyCh8CombineIncGen) + + add_executable( +- mlir-example-ch7 ++ mlir-example-ch8 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp +@@ -19,8 +19,8 @@ + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp) + +-add_dependencies(mlir-example-ch7 ToyCh7ShapeInferenceInterfaceIncGen +- ToyCh7OpsIncGen ToyCh7CombineIncGen) ++add_dependencies(mlir-example-ch8 ToyCh8ShapeInferenceInterfaceIncGen ++ ToyCh8OpsIncGen ToyCh8CombineIncGen) + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +@@ -28,7 +28,7 @@ + get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) + get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + target_link_libraries( +- mlir-example-ch7 ++ mlir-example-ch8 + PRIVATE ${dialect_libs} + ${conversion_libs} + ${extension_libs} +diff -urN Ch7/include/toy/AST.h Ch8/include/toy/AST.h +--- Ch7/include/toy/AST.h 2024-09-22 10:55:44.710339034 +0000 ++++ Ch8/include/toy/AST.h 2024-10-01 13:51:14.420421786 +0000 +@@ -20,9 +20,9 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/StringRef.h" + #include "llvm/Support/Casting.h" ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/include/toy/CMakeLists.txt Ch8/include/toy/CMakeLists.txt +--- Ch7/include/toy/CMakeLists.txt 2023-12-06 04:57:18.788273480 +0000 ++++ Ch8/include/toy/CMakeLists.txt 2024-10-01 13:51:15.848421840 +0000 +@@ -4,10 +4,10 @@ + mlir_tablegen(Ops.cpp.inc -gen-op-defs) + mlir_tablegen(Dialect.h.inc -gen-dialect-decls) + mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) +-add_public_tablegen_target(ToyCh7OpsIncGen) ++add_public_tablegen_target(ToyCh8OpsIncGen) + + # Most dialects should use add_mlir_interfaces(). 
+ set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) + mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) + mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +-add_public_tablegen_target(ToyCh7ShapeInferenceInterfaceIncGen) ++add_public_tablegen_target(ToyCh8ShapeInferenceInterfaceIncGen) +diff -urN Ch7/include/toy/Ops.td Ch8/include/toy/Ops.td +--- Ch7/include/toy/Ops.td 2024-09-22 10:55:44.710339034 +0000 ++++ Ch8/include/toy/Ops.td 2024-10-01 13:51:17.112421888 +0000 +@@ -450,4 +450,31 @@ + let hasVerifier = 1; + } + ++//===----------------------------------------------------------------------===// ++// MatMul Op ++//===----------------------------------------------------------------------===// ++ ++def MatMulOp : Toy_Op<"matmul", ++ [Pure, DeclareOpInterfaceMethods]> { ++ let summary = "matrix multiplication operation"; ++ let description = [{ ++ The "matmul" operation performs Matrix multiplication between two ++ tensors. The shapes of the tensor operands are expected to match. ++ }]; ++ ++ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); ++ let results = (outs F64Tensor); ++ ++ let assemblyFormat = [{ ++ `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results) ++ }]; ++ ++ // Allow building a MatMulOp with from the two input operands. 
++ let builders = [ ++ OpBuilder<(ins "Value":$lhs, "Value":$rhs)> ++ ]; ++ ++ let hasVerifier = 1; ++} ++ + #endif // TOY_OPS +diff -urN Ch7/include/toy/Parser.h Ch8/include/toy/Parser.h +--- Ch7/include/toy/Parser.h 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/include/toy/Parser.h 2024-10-01 13:51:18.412421937 +0000 +@@ -22,9 +22,9 @@ + #include "llvm/Support/raw_ostream.h" + + #include ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/matmul.toy Ch8/matmul.toy +--- Ch7/matmul.toy 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy 2024-10-01 13:51:11.744421685 +0000 +@@ -0,0 +1,14 @@ ++def main() { ++ # Define a variable `a` with shape <2, 3>, initialized with the literal value. ++ # The shape is inferred from the supplied literal. ++ var a = [[1, 2, 3], [4, 5, 6]]; ++ ++ # b is identical to a, the literal tensor is implicitly reshaped: defining new ++ # variables is the way to reshape tensors (element count must match). ++ var b<2, 3> = [1, 2, 3, 4, 5, 6]; ++ ++ # transpose() and print() are the only builtin, the following will transpose ++ # a and b and perform an element-wise multiplication before printing the result. 
++ # print(a * b + b); ++ print(matmul(a, transpose(b))); ++} +diff -urN Ch7/matmul.toy.mlir Ch8/matmul.toy.mlir +--- Ch7/matmul.toy.mlir 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy.mlir 2024-10-01 13:51:13.056421735 +0000 +@@ -0,0 +1,16 @@ ++toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { ++ %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> ++ %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> ++ %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> ++ toy.return %2 : tensor<*xf64> ++} ++ ++toy.func @main() { ++ %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> ++ %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> ++ %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> ++ %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> ++ %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> ++ toy.print %4 : tensor<*xf64> ++ toy.return ++} +diff -urN Ch7/mlir/Dialect.cpp Ch8/mlir/Dialect.cpp +--- Ch7/mlir/Dialect.cpp 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/mlir/Dialect.cpp 2024-10-01 13:51:19.988421996 +0000 +@@ -13,6 +13,7 @@ + + #include "toy/Dialect.h" + ++#include "mlir/Dialect/Arith/Utils/Utils.h" + #include "mlir/IR/Attributes.h" + #include "mlir/IR/Builders.h" + #include "mlir/IR/BuiltinAttributes.h" +@@ -429,7 +430,8 @@ + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+- if (inputType == resultType || llvm::isa(inputType) || ++ if (inputType == resultType || ++ llvm::isa(inputType) || + llvm::isa(resultType)) + return mlir::success(); + +@@ -497,6 +499,58 @@ + return mlir::success(); + } + ++//===----------------------------------------------------------------------===// ++// MatMulOp ++//===----------------------------------------------------------------------===// ++ ++void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, ++ mlir::Value lhs, mlir::Value rhs) { ++ state.addTypes(UnrankedTensorType::get(builder.getF64Type())); ++ state.addOperands({lhs, rhs}); ++} ++ ++/// Infer the output shape of the MatMulOp, this is required by the shape ++/// inference interface. ++void MatMulOp::inferShapes() { ++ RankedTensorType lhsType = ++ llvm::dyn_cast(getLhs().getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(getRhs().getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]}, ++ lhsType.getElementType()); ++ getResult().setType(res_type); ++} ++ ++llvm::LogicalResult MatMulOp::verify() { ++ auto lhsType = llvm::dyn_cast(getLhs().getType()); ++ auto rhsType = llvm::dyn_cast(getRhs().getType()); ++ auto resultType = llvm::dyn_cast(getType()); ++ ++ if (!lhsType || !rhsType || !resultType) ++ return mlir::success(); ++ ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ if (lhsShape.size() != 2 || rhsShape.size() != 2) { ++ return emitOpError() << "expected 2D matrix"; ++ } ++ ++ if (lhsShape[1] != rhsShape[0]) { ++ return emitOpError() << "expected dimension to match" ++ << "the shape of lhs is [" << lhsShape[0] << ", " ++ << lhsShape[1] << "] " ++ << "the shape of rhs is [" << rhsShape[0] << ", " ++ << rhsShape[1] << "] " ++ << "but the dimension " << lhsShape[1] ++ << "!=" << rhsShape[0] << '\n'; ++ } ++ ++ return mlir::success(); ++} ++ + 
//===----------------------------------------------------------------------===// + // Toy Types + //===----------------------------------------------------------------------===// +diff -urN Ch7/mlir/LowerToAffineLoops.cpp Ch8/mlir/LowerToAffineLoops.cpp +--- Ch7/mlir/LowerToAffineLoops.cpp 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/mlir/LowerToAffineLoops.cpp 2024-10-01 13:51:21.668422059 +0000 +@@ -19,6 +19,7 @@ + #include "mlir/IR/Diagnostics.h" + #include "mlir/IR/DialectRegistry.h" + #include "mlir/IR/PatternMatch.h" ++#include "mlir/IR/Value.h" + #include "mlir/IR/ValueRange.h" + #include "mlir/Support/LLVM.h" + #include "mlir/Support/TypeID.h" +@@ -31,6 +32,7 @@ + #include "mlir/Dialect/MemRef/IR/MemRef.h" + #include "mlir/Pass/Pass.h" + #include "mlir/Transforms/DialectConversion.h" ++#include "llvm/ADT/APFloat.h" + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/STLExtras.h" + #include "llvm/ADT/Sequence.h" +@@ -315,6 +317,91 @@ + } + }; + ++//===----------------------------------------------------------------------===// ++// ToyToAffine RewritePatterns: MatMul operations ++//===----------------------------------------------------------------------===// ++ ++struct MatMulOpLowering : public ConversionPattern { ++ MatMulOpLowering(MLIRContext *ctx) ++ : ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {} ++ ++ LogicalResult ++ matchAndRewrite(Operation *op, ArrayRef operands, ++ ConversionPatternRewriter &rewriter) const final { ++ auto loc = op->getLoc(); ++ ++ RankedTensorType lhsType = ++ llvm::dyn_cast(op->getOperand(0).getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(op->getOperand(1).getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ auto tensorType = ++ llvm::dyn_cast((*op->result_type_begin())); ++ ++ auto elemType = llvm::dyn_cast(tensorType.getElementType()); ++ ++ // Insert an allocation and deallocation for the result of this operation. 
++ auto memRefType = convertTensorToMemRef(tensorType); ++ auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); ++ ++ SmallVector lowerBounds(tensorType.getRank() + 1, /*Value=*/0); ++ SmallVector steps(tensorType.getRank() + 1, /*Value=*/1); ++ SmallVector upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]}; ++ ++ // add initialization of result tensor. ++ // Create a nest of affine loops to initialize the result tensor to 0. ++ affine::buildAffineLoopNest( ++ rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1}, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Create a constant float value of 0.0. ++ auto valueToStore = nestedBuilder.create( ++ loc, llvm::APFloat(0.0), elemType); ++ // Store the constant value into the allocated memory. ++ nestedBuilder.create(loc, valueToStore, alloc, ++ ivs); ++ }); ++ ++ // Create a nest of affine loops for matrix multiplication. ++ affine::buildAffineLoopNest( ++ rewriter, loc, lowerBounds, upperBounds, steps, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Extract loop induction variables. ++ Value m = ivs[0]; ++ Value k = ivs[1]; ++ Value n = ivs[2]; ++ ++ // Create an adaptor for the remapped operands of the MatMulOp. ++ toy::MatMulOpAdaptor matmulAdaptor(operands); ++ ++ // Load elements from the left-hand side and right-hand side matrices. ++ auto loadedLhs = nestedBuilder.create( ++ loc, matmulAdaptor.getLhs(), ValueRange{m, k}); ++ auto loadedRhs = nestedBuilder.create( ++ loc, matmulAdaptor.getRhs(), ValueRange{k, n}); ++ // Load elements from the result tensor from initial process above. ++ auto loadedRes = nestedBuilder.create( ++ loc, alloc, ValueRange{m, n}); ++ ++ // Perform the multiplication and addition operations. ++ auto mulop = ++ nestedBuilder.create(loc, loadedLhs, loadedRhs); ++ auto valueToStore = ++ nestedBuilder.create(loc, loadedRes, mulop); ++ ++ // Store the result back into the allocated memory. 
++ nestedBuilder.create(loc, valueToStore, alloc, ++ ValueRange{m, n}); ++ }); ++ ++ // Replace this operation with the generated alloc. ++ rewriter.replaceOp(op, alloc); ++ ++ return success(); ++ } ++}; ++ + } // namespace + + //===----------------------------------------------------------------------===// +@@ -365,8 +452,8 @@ + // the set of patterns that will lower the Toy operations. + RewritePatternSet patterns(&getContext()); + patterns.add( +- &getContext()); ++ PrintOpLowering, ReturnOpLowering, TransposeOpLowering, ++ MatMulOpLowering>(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` +diff -urN Ch7/mlir/MLIRGen.cpp Ch8/mlir/MLIRGen.cpp +--- Ch7/mlir/MLIRGen.cpp 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/mlir/MLIRGen.cpp 2024-10-01 13:51:23.564422131 +0000 +@@ -525,6 +525,14 @@ + return builder.create(location, operands[0]); + } + ++ if (callee == "matmul") { ++ if (call.getArgs().size() != 2) { ++ emitError(location, "MLIR codegen encountered an error: toy.matmul " ++ "expected 2 arguments"); ++ } ++ return builder.create(location, operands[0], operands[1]); ++ } ++ + // Otherwise this is a call to a user-defined function. Calls to + // user-defined functions are mapped to a custom call that takes the callee + // name as an attribute. 
diff --git a/mlir/cuda-tile/scripts/patch/matmul.patch b/mlir/cuda-tile/scripts/patch/matmul.patch new file mode 100644 index 0000000..d3f09d6 --- /dev/null +++ b/mlir/cuda-tile/scripts/patch/matmul.patch @@ -0,0 +1,375 @@ +diff -urN Ch7/CMakeLists.txt Ch8/CMakeLists.txt +--- Ch7/CMakeLists.txt 2025-12-29 12:11:15.106203203 +0000 ++++ Ch8/CMakeLists.txt 2025-12-29 12:11:15.110203203 +0000 +@@ -6,10 +6,10 @@ + + set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) + mlir_tablegen(ToyCombine.inc -gen-rewriters) +-add_public_tablegen_target(ToyCh7CombineIncGen) ++add_public_tablegen_target(ToyCh8CombineIncGen) + + add_executable( +- mlir-example-ch7 ++ mlir-example-ch8 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp +@@ -19,8 +19,8 @@ + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp) + +-add_dependencies(mlir-example-ch7 ToyCh7ShapeInferenceInterfaceIncGen +- ToyCh7OpsIncGen ToyCh7CombineIncGen) ++add_dependencies(mlir-example-ch8 ToyCh8ShapeInferenceInterfaceIncGen ++ ToyCh8OpsIncGen ToyCh8CombineIncGen) + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +@@ -28,7 +28,7 @@ + get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) + get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + target_link_libraries( +- mlir-example-ch7 ++ mlir-example-ch8 + PRIVATE ${dialect_libs} + ${conversion_libs} + ${extension_libs} +diff -urN Ch7/include/toy/AST.h Ch8/include/toy/AST.h +--- Ch7/include/toy/AST.h 2025-12-29 12:11:15.107203203 +0000 ++++ Ch8/include/toy/AST.h 2025-12-29 12:11:15.110203203 +0000 +@@ -20,9 +20,9 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/StringRef.h" + #include "llvm/Support/Casting.h" ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/include/toy/CMakeLists.txt Ch8/include/toy/CMakeLists.txt +--- Ch7/include/toy/CMakeLists.txt 2025-12-29 12:11:15.107203203 +0000 ++++ Ch8/include/toy/CMakeLists.txt 2025-12-29 12:11:15.110203203 
+0000 +@@ -4,10 +4,10 @@ + mlir_tablegen(Ops.cpp.inc -gen-op-defs) + mlir_tablegen(Dialect.h.inc -gen-dialect-decls) + mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) +-add_public_tablegen_target(ToyCh7OpsIncGen) ++add_public_tablegen_target(ToyCh8OpsIncGen) + + # Most dialects should use add_mlir_interfaces(). + set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) + mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) + mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +-add_public_tablegen_target(ToyCh7ShapeInferenceInterfaceIncGen) ++add_public_tablegen_target(ToyCh8ShapeInferenceInterfaceIncGen) +diff -urN Ch7/include/toy/Ops.td Ch8/include/toy/Ops.td +--- Ch7/include/toy/Ops.td 2025-12-29 12:11:15.108203203 +0000 ++++ Ch8/include/toy/Ops.td 2025-12-29 12:11:15.111203203 +0000 +@@ -450,4 +450,33 @@ + let hasVerifier = 1; + } + ++//===----------------------------------------------------------------------===// ++// MatMul Op ++//===----------------------------------------------------------------------===// ++ ++def MatMulOp : Toy_Op<"matmul", ++ [Pure, DeclareOpInterfaceMethods, MemoryEffectsOpInterface]> { ++ let summary = "matrix multiplication operation"; ++ let description = [{ ++ The "matmul" operation performs Matrix multiplication between two ++ tensors. The shapes of the tensor operands are expected to match. ++ }]; ++ ++ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); ++ let results = (outs Res, ++ MemAlloc]>:$output); ++ ++ let assemblyFormat = [{ ++ `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results) ++ }]; ++ ++ // Allow building a MatMulOp with from the two input operands. 
++ let builders = [ ++ OpBuilder<(ins "Value":$lhs, "Value":$rhs)> ++ ]; ++ ++ let hasVerifier = 1; ++} ++ + #endif // TOY_OPS +diff -urN Ch7/include/toy/Parser.h Ch8/include/toy/Parser.h +--- Ch7/include/toy/Parser.h 2025-12-29 12:11:15.108203203 +0000 ++++ Ch8/include/toy/Parser.h 2025-12-29 12:11:15.111203203 +0000 +@@ -22,9 +22,9 @@ + #include "llvm/Support/raw_ostream.h" + + #include ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/matmul.toy Ch8/matmul.toy +--- Ch7/matmul.toy 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy 2025-12-29 12:11:15.111203203 +0000 +@@ -0,0 +1,14 @@ ++def main() { ++ # Define a variable `a` with shape <2, 3>, initialized with the literal value. ++ # The shape is inferred from the supplied literal. ++ var a = [[1, 2, 3], [4, 5, 6]]; ++ ++ # b is identical to a, the literal tensor is implicitly reshaped: defining new ++ # variables is the way to reshape tensors (element count must match). ++ var b<2, 3> = [1, 2, 3, 4, 5, 6]; ++ ++ # transpose() and print() are the only builtin, the following will transpose ++ # a and b and perform an element-wise multiplication before printing the result. 
++ # print(a * b + b); ++ print(matmul(a, transpose(b))); ++} +diff -urN Ch7/matmul.toy.mlir Ch8/matmul.toy.mlir +--- Ch7/matmul.toy.mlir 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy.mlir 2025-12-29 12:11:15.111203203 +0000 +@@ -0,0 +1,16 @@ ++toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { ++ %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> ++ %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> ++ %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> ++ toy.return %2 : tensor<*xf64> ++} ++ ++toy.func @main() { ++ %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> ++ %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> ++ %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> ++ %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> ++ %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> ++ toy.print %4 : tensor<*xf64> ++ toy.return ++} +diff -urN Ch7/mlir/Dialect.cpp Ch8/mlir/Dialect.cpp +--- Ch7/mlir/Dialect.cpp 2025-12-29 12:11:15.108203203 +0000 ++++ Ch8/mlir/Dialect.cpp 2025-12-29 12:11:15.111203203 +0000 +@@ -13,6 +13,7 @@ + + #include "toy/Dialect.h" + ++#include "mlir/Dialect/Arith/Utils/Utils.h" + #include "mlir/IR/Attributes.h" + #include "mlir/IR/Builders.h" + #include "mlir/IR/BuiltinAttributes.h" +@@ -429,7 +430,8 @@ + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+- if (inputType == resultType || llvm::isa(inputType) || ++ if (inputType == resultType || ++ llvm::isa(inputType) || + llvm::isa(resultType)) + return mlir::success(); + +@@ -497,6 +499,58 @@ + return mlir::success(); + } + ++//===----------------------------------------------------------------------===// ++// MatMulOp ++//===----------------------------------------------------------------------===// ++ ++void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, ++ mlir::Value lhs, mlir::Value rhs) { ++ state.addTypes(UnrankedTensorType::get(builder.getF64Type())); ++ state.addOperands({lhs, rhs}); ++} ++ ++/// Infer the output shape of the MatMulOp, this is required by the shape ++/// inference interface. ++void MatMulOp::inferShapes() { ++ RankedTensorType lhsType = ++ llvm::dyn_cast(getLhs().getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(getRhs().getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]}, ++ lhsType.getElementType()); ++ getResult().setType(res_type); ++} ++ ++llvm::LogicalResult MatMulOp::verify() { ++ auto lhsType = llvm::dyn_cast(getLhs().getType()); ++ auto rhsType = llvm::dyn_cast(getRhs().getType()); ++ auto resultType = llvm::dyn_cast(getType()); ++ ++ if (!lhsType || !rhsType || !resultType) ++ return mlir::success(); ++ ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ if (lhsShape.size() != 2 || rhsShape.size() != 2) { ++ return emitOpError() << "expected 2D matrix"; ++ } ++ ++ if (lhsShape[1] != rhsShape[0]) { ++ return emitOpError() << "expected dimension to match" ++ << "the shape of lhs is [" << lhsShape[0] << ", " ++ << lhsShape[1] << "] " ++ << "the shape of rhs is [" << rhsShape[0] << ", " ++ << rhsShape[1] << "] " ++ << "but the dimension " << lhsShape[1] ++ << "!=" << rhsShape[0] << '\n'; ++ } ++ ++ return mlir::success(); ++} ++ + 
//===----------------------------------------------------------------------===// + // Toy Types + //===----------------------------------------------------------------------===// +diff -urN Ch7/mlir/LowerToAffineLoops.cpp Ch8/mlir/LowerToAffineLoops.cpp +--- Ch7/mlir/LowerToAffineLoops.cpp 2025-12-29 12:11:15.109203203 +0000 ++++ Ch8/mlir/LowerToAffineLoops.cpp 2025-12-29 12:11:15.112203203 +0000 +@@ -19,6 +19,7 @@ + #include "mlir/IR/Diagnostics.h" + #include "mlir/IR/DialectRegistry.h" + #include "mlir/IR/PatternMatch.h" ++#include "mlir/IR/Value.h" + #include "mlir/IR/ValueRange.h" + #include "mlir/Support/LLVM.h" + #include "mlir/Support/TypeID.h" +@@ -31,6 +32,7 @@ + #include "mlir/Dialect/MemRef/IR/MemRef.h" + #include "mlir/Pass/Pass.h" + #include "mlir/Transforms/DialectConversion.h" ++#include "llvm/ADT/APFloat.h" + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/STLExtras.h" + #include "llvm/ADT/Sequence.h" +@@ -315,6 +317,91 @@ + } + }; + ++//===----------------------------------------------------------------------===// ++// ToyToAffine RewritePatterns: MatMul operations ++//===----------------------------------------------------------------------===// ++ ++struct MatMulOpLowering : public ConversionPattern { ++ MatMulOpLowering(MLIRContext *ctx) ++ : ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {} ++ ++ LogicalResult ++ matchAndRewrite(Operation *op, ArrayRef operands, ++ ConversionPatternRewriter &rewriter) const final { ++ auto loc = op->getLoc(); ++ ++ RankedTensorType lhsType = ++ llvm::dyn_cast(op->getOperand(0).getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(op->getOperand(1).getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ auto tensorType = ++ llvm::dyn_cast((*op->result_type_begin())); ++ ++ auto elemType = llvm::dyn_cast(tensorType.getElementType()); ++ ++ // Insert an allocation and deallocation for the result of this operation. 
++ auto memRefType = convertTensorToMemRef(tensorType); ++ auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); ++ ++ SmallVector lowerBounds(tensorType.getRank() + 1, /*Value=*/0); ++ SmallVector steps(tensorType.getRank() + 1, /*Value=*/1); ++ SmallVector upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]}; ++ ++ // add initialization of result tensor. ++ // Create a nest of affine loops to initialize the result tensor to 0. ++ affine::buildAffineLoopNest( ++ rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1}, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Create a constant float value of 0.0. ++ auto valueToStore = nestedBuilder.create( ++ loc, llvm::APFloat(0.0), elemType); ++ // Store the constant value into the allocated memory. ++ nestedBuilder.create(loc, valueToStore, alloc, ++ ivs); ++ }); ++ ++ // Create a nest of affine loops for matrix multiplication. ++ affine::buildAffineLoopNest( ++ rewriter, loc, lowerBounds, upperBounds, steps, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Extract loop induction variables. ++ Value m = ivs[0]; ++ Value k = ivs[1]; ++ Value n = ivs[2]; ++ ++ // Create an adaptor for the remapped operands of the MatMulOp. ++ toy::MatMulOpAdaptor matmulAdaptor(operands); ++ ++ // Load elements from the left-hand side and right-hand side matrices. ++ auto loadedLhs = nestedBuilder.create( ++ loc, matmulAdaptor.getLhs(), ValueRange{m, k}); ++ auto loadedRhs = nestedBuilder.create( ++ loc, matmulAdaptor.getRhs(), ValueRange{k, n}); ++ // Load elements from the result tensor from initial process above. ++ auto loadedRes = nestedBuilder.create( ++ loc, alloc, ValueRange{m, n}); ++ ++ // Perform the multiplication and addition operations. ++ auto mulop = ++ nestedBuilder.create(loc, loadedLhs, loadedRhs); ++ auto valueToStore = ++ nestedBuilder.create(loc, loadedRes, mulop); ++ ++ // Store the result back into the allocated memory. 
++ nestedBuilder.create(loc, valueToStore, alloc, ++ ValueRange{m, n}); ++ }); ++ ++ // Replace this operation with the generated alloc. ++ rewriter.replaceOp(op, alloc); ++ ++ return success(); ++ } ++}; ++ + } // namespace + + //===----------------------------------------------------------------------===// +@@ -365,8 +452,8 @@ + // the set of patterns that will lower the Toy operations. + RewritePatternSet patterns(&getContext()); + patterns.add( +- &getContext()); ++ PrintOpLowering, ReturnOpLowering, TransposeOpLowering, ++ MatMulOpLowering>(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` +diff -urN Ch7/mlir/MLIRGen.cpp Ch8/mlir/MLIRGen.cpp +--- Ch7/mlir/MLIRGen.cpp 2025-12-29 12:11:15.109203203 +0000 ++++ Ch8/mlir/MLIRGen.cpp 2025-12-29 12:11:15.112203203 +0000 +@@ -525,6 +525,14 @@ + return builder.create(location, operands[0]); + } + ++ if (callee == "matmul") { ++ if (call.getArgs().size() != 2) { ++ emitError(location, "MLIR codegen encountered an error: toy.matmul " ++ "expected 2 arguments"); ++ } ++ return builder.create(location, operands[0], operands[1]); ++ } ++ + // Otherwise this is a call to a user-defined function. Calls to + // user-defined functions are mapped to a custom call that takes the callee + // name as an attribute. 
diff --git a/mlir/cuda-tile/scripts/sync_deps.sh b/mlir/cuda-tile/scripts/sync_deps.sh new file mode 100644 index 0000000..1a383b3 --- /dev/null +++ b/mlir/cuda-tile/scripts/sync_deps.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +mkdir -p third_party + +git clone --filter=blob:none --no-checkout https://github.com/llvm/llvm-project.git third_party/llvm-project +cd third_party/llvm-project + +git fetch --depth=1 origin cfbb4cc31215d615f605466aef0bcfb42aa9faa5 +git checkout --detach cfbb4cc31215d615f605466aef0bcfb42aa9faa5 + +cd - + +git clone https://github.com/Alwaysproblem/cuda-tile third_party/cuda-tile diff --git a/mlir/cuda-tile/scripts/update.sh b/mlir/cuda-tile/scripts/update.sh new file mode 100644 index 0000000..e6cf698 --- /dev/null +++ b/mlir/cuda-tile/scripts/update.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +WORKSPACE=`pwd` + +_llvm_branch=${1:-"release/19.x"} + +_dirs="Ch1 Ch2 Ch3 Ch4 Ch5 Ch6 Ch7" +_transform_dirs="Ch2 Ch3 Ch4" + +_example_in_llvm_project="third_party/llvm-project/mlir/examples" + +_mlir_example_dir="${_example_in_llvm_project}/toy" +_mlir_transform_dir="${_example_in_llvm_project}/transform" + +[[ -d "third_party/llvm-project" ]] || git clone -b $_llvm_branch https://github.com/llvm/llvm-project.git third_party/llvm-project + +# update the mlir Toy examples + +for dir in $_dirs; do + + pushd "$WORKSPACE/$dir" + rm -rf $(find ./ -name "*.cpp") + rm -rf $(find ./ -name "*.h") + rm -rf $(find ./ -name "*.td") + popd + + pushd "$WORKSPACE/${_mlir_example_dir}/$dir" + + for cpps in $(find ./ -name "*.cpp"); do + cp ${cpps} "$WORKSPACE/$dir/${cpps}" + done + + for hs in $(find ./ -name "*.h"); do + cp ${hs} "$WORKSPACE/$dir/${hs}" + done + + for tds in $(find ./ -name "*.td"); do + cp ${tds} "$WORKSPACE/$dir/${tds}" + done + + popd + +done + +# update the mlir transform examples + +for tdir in $_transform_dirs; do + + pushd "$WORKSPACE/transform_$tdir" + rm -rf $(find ./ -name "*.cpp") + rm -rf $(find ./ -name "*.h") + rm -rf $(find ./ -name "*.td") + popd + 
+ pushd "$WORKSPACE/${_mlir_transform_dir}/$tdir" + + for cpps in $(find ./ -name "*.cpp"); do + cp ${cpps} "$WORKSPACE/transform_$tdir/${cpps}" + # echo "cp ${cpps} $WORKSPACE/transform_$tdir/${cpps}" + done + + for hs in $(find ./ -name "*.h"); do + cp ${hs} "$WORKSPACE/transform_$tdir/${hs}" + # echo "cp ${hs} $WORKSPACE/transform_$tdir/${hs}" + done + + for tds in $(find ./ -name "*.td"); do + cp ${tds} "$WORKSPACE/transform_$tdir/${tds}" + # echo "cp ${tds} $WORKSPACE/transform_$tdir/${tds}" + done + + popd + +done diff --git a/mlir/cuda-tile/vscode/.container_zsh_history b/mlir/cuda-tile/vscode/.container_zsh_history new file mode 100644 index 0000000..b53f250 --- /dev/null +++ b/mlir/cuda-tile/vscode/.container_zsh_history @@ -0,0 +1,598 @@ +: 1766924709:0;lsa +: 1766924743:0;cd dockerVolumn +: 1766924744:0;lsa +: 1766924749:0;rm -rf cuda-tile +: 1766924758:0;setup_new_user 1000 1000 +: 1766924767:0;git config --global --add safe.directory '*' +: 1766924791:0;git clone https://github.com/Alwaysproblem/cuda-tile +: 1766924807:0;git checkout dev +: 1766924812:0;cd cuda-tile +: 1766924814:0;git checkout dev +: 1766924904:0;apt install -yq software-properties-common +: 1766924921:0;apt update -y && apt install -yq software-properties-common +: 1766924955:0;apt install -yq gcc-13 g++-13 +: 1766924972:0;g++ --version +: 1766924994:0;add-apt-repository -y ppa:ubuntu-toolchain-r/test +: 1766925008:0;apt update -y +: 1766925020:0;apt install -yq gcc-13 g++-13 +: 1766925151:0;apt install -y \\ + python3 python3-dev python3-setuptools python3-pip \\ + libtinfo-dev zlib1g-dev \\ + build-essential cmake ninja-build +: 1766925174:0;apt install -yq libfmt-dev libspdlog-dev +: 1766925183:0;apt install -yq gcc-13 g++-13 +: 1766925221:0;update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 20 +: 1766925227:0;update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 20 +: 1766925256:0;python3 --version +: 1766925275:0;sudo vim /etc/apt/sources.list.d/llvm.list +: 
1766925279:0;vim /etc/apt/sources.list.d/llvm.list +: 1766925344:0;wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc +: 1766925351:0;apt update -y +: 1766925388:0;python3 -m pip install pre-commit compdb +: 1766925427:0;cmake --version +: 1766925470:0;apt install -yq clang-20 clang-tidy-20 clangd-20 cmake-format \\ + clang-format-20 lldb-20 +: 1766926219:0;update-alternatives --install /usr/bin/clang clang /usr/bin/clang-20 100 +: 1766926230:0;update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-20 100 +: 1766926240:0;update-alternatives --install /usr/bin/clangd clangd /usr/bin/clangd-20 100 +: 1766926250:0;update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-20 100 +: 1766926259:0;update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-20 100 +: 1766926269:0;update-alternatives --install /usr/bin/lld lld /usr/bin/lld-20 100 +: 1766926289:0;apt install -yq lld-20 +: 1766926295:0;update-alternatives --install /usr/bin/lld lld /usr/bin/lld-20 100 +: 1766926306:0;update-alternatives --install /usr/bin/lldb lldb /usr/bin/lldb-20 100 +: 1766926321:0;update-alternatives --install /usr/bin/lldb-dap lldb-dap /usr/bin/lldb-dap-20 100 +: 1766926327:0;clang++ --version +: 1766926448:0;lsa +: 1766926460:0;chown -R scotty:scotty ./ +: 1766926461:0;lsa +: 1766926704:0;nvidia-smi +: 1766926730:0;bash build.sh +: 1766926743:0;vim build.sh +: 1766926765:0;bash build.sh +: 1766926778:0;vim build.sh +: 1766926798:0;mkdir 3rdparty +: 1766926820:0;lsa +: 1766926825:0;chown -R scotty:scotty ./ +: 1766926826:0;lsa +: 1766926840:0;cd .. +: 1766926842:0;lsa +: 1766926852:0;cd dockerVolumn +: 1766926853:0;lsa +: 1766926860:0;mv * ../ +: 1766926862:0;cd .. 
+: 1766926863:0;lsa +: 1766926917:0;cp cuda-tile llvm-project MLcompiler-tutorial dockerVolumn/ -R +: 1766927013:0;lsa +: 1766927028:0;cd dockerVolumn +: 1766927029:0;lsa +: 1766927038:0;chown -R scotty:scotty ./ +: 1766927040:0;lsa +: 1766927075:0;cd .. +: 1766927076:0;lsa +: 1766927083:0;rm -rf example.cubin +: 1766927084:0;lsa +: 1766927179:0;cd .. +: 1766927184:0;lsa +: 1766927209:0;cd dockerVolumn +: 1766927215:0;lsa +: 1766927218:0;cd cuda-tile +: 1766927219:0;lsa +: 1766927224:0;cd 3rdparty/llvm-project +: 1766927233:0;git status +: 1766927251:0;git switch -c cfbb4cc3 +: 1766927257:0;cd .. +: 1766927258:0;lsa +: 1766927260:0;cd .. +: 1766927260:0;lsa +: 1766927281:0;bash build.sh +: 1766927509:0;cd .. +: 1766927511:0;lsa +: 1766927515:0;mv * ../ +: 1766928047:0;cd map +: 1766928049:0;lsa +: 1766928054:0;chown -R scotty:scotty ./ +: 1766928055:0;lsa +: 1766928060:0;git status +: 1766928065:0;lsa +: 1766928074:0;cd examples +: 1766928075:0;lsa +: 1766928078:0;cd map +: 1766928080:0;lsa +: 1766928085:0;bash build_app.sh +: 1766928088:0;lsa +: 1766928092:0;./example +: 1766928110:0;nvidia-smi +: 1766928158:0;ldconfig -p | grep -i ptxjit +: 1766928184:0;ls -l /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so* 2>/dev/null +: 1766928193:0;echo $LD_LIBRARY_PATH +: 1766928223:0;ldconfig -p | grep -i ptxjit || true +: 1766928228:0;ls -l /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so* 2>/dev/null || true +: 1766928266:0;ls -l /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so* +: 1766928278:0;sudo ln -sf /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so +: 1766928289:0;ln -sf /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so +: 1766928295:0;ldconfig +: 1766928303:0;./example +: 1766928325:0;ldconfig -p | grep -i ptxjit +: 1766928402:0;ldconfig -p | grep -E 'libcuda\.so|libnvidia-nvvm|libnvrtc' || true +: 1766928412:0;ls -l 
/usr/lib/x86_64-linux-gnu/libcuda.so* /lib/x86_64-linux-gnu/libcuda.so* 2>/dev/null || true +: 1766928435:0;lsa /lib/x86_64-linux-gnu/libcuda.so +: 1766928495:0;export LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH} +: 1766928499:0;./example +: 1766928602:0;ldd /lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1 | sed -n '1,200p'\ +ldd /lib/x86_64-linux-gnu/libcuda.so.590.48.01 | sed -n '1,200p' +: 1766928623:0;strace -f -e trace=file -o /tmp/trace.log ./map 2>/dev/null || true +: 1766928630:0;grep -nE 'ptxjit|nvvm|libcuda|nvidia' /tmp/trace.log | tail -n 200 +: 1766928670:0;lsa +: 1766930165:0;cd map +: 1766930166:0;lsa +: 1766930170:0;bash build_app.sh +: 1766930949:0;cd ../map2d +: 1766930952:0;bash build_app.sh +: 1766932113:0;vim ~/.gitconfig +: 1766970861:0;bash build_app.sh +: 1766974775:0;cd examples/matmul +: 1766974777:0;bash build_app.sh +: 1766979197:0;python3 -m pip install numpy +: 1766979205:0;python3 +: 1766979327:0;bash build_app.sh +: 1766982363:0;cd .. +: 1766982364:0;.. +: 1766982369:0;grep -RIn --exclude-dir=.git "pipel|pipeline|peel|prologue|epilogue|modulo" .\ + +: 1766982383:0;grep -RIn --exclude-dir=.git "pass.*pipeline|--pass|addPass|PassPipeline" . +: 1766982412:0;./build/bin/cuda-tile-opt --help +: 1766982641:0;cd examples/matmul +: 1766982644:0;bash build_app.sh +: 1766982734:0;../../build/bin/cuda-tile-opt --help +: 1766986985:0;cd mat +: 1766986987:0;lsa +: 1766986994:0;bash build_app.sh +: 1767007534:0;vim .git/info/exclude +: 1767007865:0;cd examples/CrossTileBlockCommunication +: 1767007868:0;bash build_app.sh +: 1767008901:0;cd .. +: 1767008913:0;mv fp8 tf32 +: 1767008939:0;cd tf32 +: 1767008941:0;bash build_app.sh +: 1767009250:0;cd .. 
+: 1767009251:0;lsa +: 1767009260:0;mv tf32 bf16 +: 1767009261:0;lsa +: 1767009267:0;chown -R scotty:scotty ./ +: 1767009268:0;lsa +: 1767009362:0;bash build_app.sh +: 1767009366:0;cd bf16 +: 1767009368:0;bash build_app.sh +: 1767010475:0;bash scripts/sync_deps.sh +: 1767010531:0;cd third_party/llvm-project +: 1767010532:0;lsa +: 1767010541:0;git switch -c cfbb4cc3 +: 1767010544:0;cd .. +: 1767010545:0;lsa +: 1767010700:0;git clone https://github.com/Alwaysproblem/cuda-tile +: 1767010732:0;lsa +: 1767010776:0;apt install -yq ccache +: 1767010792:0;apt update -y +: 1767010801:0;apt install -yq ccache +: 1767010916:0;cd .. +: 1767010936:0;bash scripts/make_patch.sh +: 1767011004:0;bash scripts/update.sh +: 1767011070:0;cd Ch6 +: 1767011076:0;cd .. +: 1767011118:0;cp -R Ch6 Ch7 +: 1767011133:0;bash scripts/apply_patch.sh +: 1767011437:0;pwd +: 1767011446:0;cd third_party/cuda-tile +: 1767011448:0;lsa +: 1767011456:0;rg "CUDA_TILE_INSTALL_DIR" +: 1767011492:0;lsa +: 1767011494:0;cd .. +: 1767011494:0;lsa +: 1767011742:0;cd .. +: 1767011743:0;lsa +: 1767011747:0;chown -R scotty:scotty ./ +: 1767011753:0;bash scripts/build_deps.sh +: 1767011910:0;cd third_party/llvm-project +: 1767011914:0;git status +: 1767011935:0;git checkout -q -- . +: 1767011940:0;cd .. +: 1767011973:0;bash scripts/build_deps.sh +: 1767012572:0;git rev-parse HEAD +: 1767012758:0;cd .. +: 1767012762:0;WORKSPACEROOT=`git rev-parse --show-toplevel` || WORKSPACEROOT=`pwd` +: 1767012769:0;WORKSPACEROOT +: 1767012773:0;echo $WORKSPACEROOT +: 1767012787:0;cd .. 
+: 1767012799:0;WORKSPACEROOT=`git rev-parse --show-toplevel` || WORKSPACEROOT=`pwd` +: 1767012803:0;echo $WORKSPACEROOT +: 1767012866:0;cd cuda-tile +: 1767012868:0;pwd +: 1767012877:0;cd ../MLcompiler-tutorial/mlir/cuda-tile +: 1767012984:0;lsa +: 1767012991:0;bash scripts/build_cuda_tile.sh +: 1767013166:0;lsa +: 1767013174:0;cd third_party/cuda-tile +: 1767013175:0;lsa +: 1767013186:0;pwd +: 1767013202:0;bash scripts/build_cuda_tile.sh +: 1767013654:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm/include +: 1767013690:0;bash scripts/build_cuda_tile.sh +: 1767013790:0;git status +: 1767013797:0;cd third_party/cuda-tile +: 1767013798:0;lsa +: 1767013801:0;git status +: 1767013813:0;git checkout -q -- . +: 1767013816:0;cd .. +: 1767013825:0;pwd +: 1767013834:0;bash scripts/build_cuda_tile.sh +: 1767013877:0;cd third_party/llvm-project +: 1767013880:0;git log +: 1767013943:0;cd .. +: 1767013949:0;rm -rf third_party +: 1767013954:0;lsa +: 1767013962:0;bash scripts/sync_deps.sh +: 1767014037:0;lsa +: 1767014041:0;chown -R scotty:scotty ./ +: 1767014043:0;lsa +: 1767014057:0;rm -rf third_party/llvm-project +: 1767014102:0;cp -R ../../../cuda-tile/3rdparty/llvm-project ./third_party +: 1767014161:0;lsa +: 1767014175:0;cd third_party +: 1767014178:0;git clone https://github.com/Alwaysproblem/cuda-tile +: 1767014184:0;cd .. +: 1767014185:0;lsa +: 1767014204:0;pwd +: 1767014208:0;bash scripts/update.sh +: 1767014248:0;bash scripts/build_deps.sh +: 1767014279:0;cd third_party +: 1767014283:0;cd llvm-project +: 1767014292:0;git status +: 1767014301:0;git checkout -q -- . +: 1767014304:0;cd .. +: 1767014309:0;lsa +: 1767014353:0;bash scripts/update.sh +: 1767014392:0;cd llvm-project +: 1767014397:0;git checkout -q -- . +: 1767014400:0;cd .. +: 1767014403:0;lsa +: 1767014424:0;bash scripts/build_deps.sh +: 1767015527:0;bash scripts/build_cuda_tile.sh +: 1767015541:0;third_party/llvm-project +: 1767015543:0;cd .. 
+: 1767015544:0;lsa +: 1767015547:0;cd cuda-tile +: 1767015549:0;lsa +: 1767015553:0;git status +: 1767015623:0;git checkout -q -- . +: 1767015677:0;bash scripts/build_cuda_tile.sh +: 1767015684:0;cd ... +: 1767015686:0;lsa +: 1767015688:0;bash scripts/build_cuda_tile.sh +: 1767015733:0;cd third_party +: 1767015734:0;lsa +: 1767015739:0;cd cuda-tile +: 1767015740:0;lsa +: 1767015749:0;ninja install +: 1767015872:0;cd build +: 1767015878:0;cmake --install . +: 1767016239:0;which cmake-format +: 1767016258:0;which clang-tidy +: 1767016671:0;cd .. +: 1767016672:0;.. +: 1767016682:0;find ./third_party -name "mlir-tblgen" +: 1767016905:0;pwd +: 1767017012:0;./build/Toy/toy-cuda sample/example.toy +: 1767017022:0;./build/Toy/toy-cuda sample/example.toy -emit=jit +: 1767017134:0;pwd +: 1767017545:0;git log +: 1767017569:0;git show 3165b0ccf63b333e7b90382b25ce76b8f998ce35 +: 1767017775:0;cd Toy +: 1767017787:0;git apply ../scripts/patch/matmul.patch +: 1767019507:0;僰烺。。 +: 1767019509:0;。。 +: 1767019510:0;.. +: 1767019512:0;lsa +: 1767019516:0;chown -R scotty:scotty ./ +: 1767019518:0;lsa +: 1767019520:0;pwd +: 1767019537:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767019560:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767019607:0;lsa +: 1767019656:0;python3 +: 1767019793:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767019810:0;python3 +: 1767020030:0;... 
+: 1767020033:0;pwd +: 1767020072:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767020252:0;python3 sample/validation.py +: 1767020283:0;sa +: 1767020285:0;lsa +: 1767054330:0;git log +: 1767071940:0;find ./ -name "libmlir_cuda_runtime.* " +: 1767071942:0;find ./ -name "libmlir_cuda_runtime.*" +: 1767072294:0;bash scripts/build_deps.sh +: 1767072320:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project/third_party/llvm-project/llvm +: 1767072341:0;bash scripts/build_deps.sh +: 1767072393:0;nvcc --version +: 1767072402:0;which nvcc +: 1767072464:0;ld --version +: 1767072536:0;which clang +: 1767072558:0;update-alternatives --list +: 1767072563:0;update-alternatives --list lld +: 1767072674:0;update-alternatives --install /usr/bin/ld.lld ld.lld /usr/bin/lldb-20 100 +: 1767072696:0;ld.lld --version +: 1767072705:0;lld --version +: 1767072728:0;bash scripts/build_deps.sh +: 1767072770:0;g++ -fuse-ld=lld -Wl,--version 2>&1 | head -n 5 +: 1767072936:0;bash scripts/build_deps.sh +: 1767072982:0;g++ -fno-use-linker-plugin -fuse-ld=lld -Wl,--version 2>&1 | head -n 20 +: 1767073000:0;update-alternatives --help +: 1767073021:0;update-alternatives --list lld +: 1767073026:0;update-alternatives --list ld.lld +: 1767073033:0;update-alternatives --remove ld.lld +: 1767073045:0;update-alternatives --remove ld.lld /usr/bin/ld.lld +: 1767073050:0;ld.lld --version +: 1767073068:0;which ld.lld +: 1767073084:0;update-alternatives --remove ld.lld +: 1767073089:0;update-alternatives --list ld.lld +: 1767073110:0;rm -rf /usr/bin/ld.lld +: 1767073113:0;update-alternatives --list ld.lld +: 1767073137:0;update-alternatives --remove ld.lld /usr/bin/ld.lld +: 1767073150:0;lldb --version +: 1767073155:0;update-alternatives --list ld.lld +: 1767073188:0;ld --version +: 1767073196:0;update-alternatives --display ld.lld +: 1767073212:0;update-alternatives --remove ld.lld /usr/bin/lldb-20 +: 1767073214:0;update-alternatives --display ld.lld +: 
1767077436:0;bash scripts/build_deps.sh +: 1767077457:0;apt-get install -y binutils +: 1767077470:0;bash scripts/build_deps.sh +: 1767077556:0;ld --version +: 1767077622:0;cd third_party +: 1767077626:0;cd cuda-tile +: 1767077628:0;cd .. +: 1767077631:0;lsa +: 1767077697:0;ld.lld +: 1767077720:0;apt install -yq lld +: 1767077777:0;pwd +: 1767077780:0;bash scripts/build_deps.sh +: 1767077838:0;cd ./third_party/llvm-project/build +: 1767077840:0;lsa +: 1767077843:0;grep -R "color-diagnostics" -n build/CMakeCache.txt 2>/dev/null | head -n 20\ +env | grep -E "CFLAGS|CXXFLAGS|LDFLAGS" || true +: 1767077846:0;cd .. +: 1767077848:0;grep -R "color-diagnostics" -n build/CMakeCache.txt 2>/dev/null | head -n 20\ +env | grep -E "CFLAGS|CXXFLAGS|LDFLAGS" || true +: 1767077864:0;lsa +: 1767077881:0;cd third_party +: 1767077882:0;lsa +: 1767077885:0;cd llvm-project +: 1767077886:0;lsa +: 1767077892:0;cd build +: 1767077893:0;ls +: 1767077896:0;cd .. +: 1767077898:0;lsa +: 1767077952:0;cd .. +: 1767077954:0;lsa +: 1767077959:0;rm -rf third_party +: 1767077961:0;cd .. +: 1767077962:0;lsa +: 1767077966:0;cd .. +: 1767077967:0;lsa +: 1767078003:0;bash scripts/build_deps.sh +: 1767078013:0;lsa third_party/llvm-project/build +: 1767078023:0;lsa third_party/llvm +: 1767078096:0;bash scripts/build_deps.sh +: 1767078168:0;cd third_party +: 1767078169:0;lsa +: 1767078172:0;cd cuda-tile +: 1767078173:0;lsa +: 1767078176:0;cd .. +: 1767079230:0;cd third_party +: 1767079233:0;cd cuda-tile +: 1767079237:0;cd .. +: 1767079241:0;bash scripts/build_cuda_tile.sh +: 1767079278:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm/lib/cmake/llvm +: 1767079304:0;lsa +: 1767079308:0;chown -R scotty:scotty ./ +: 1767079311:0;cd third_party +: 1767079314:0;cd llvm +: 1767079315:0;lsa +: 1767079317:0;cd .. +: 1767079320:0;rm -rf llvm +: 1767079323:0;lsa +: 1767079347:0;mv llvm-project/third_party/llvm ./ +: 1767079362:0;rm -rf llvm-project/third_party +: 1767079562:0;cd .. 
+: 1767079563:0;lsa +: 1767079571:0;bash scripts/build_deps.sh +: 1767079594:0;cd third_party/llvm-project +: 1767079595:0;lsa +: 1767079650:0;bash scripts/build_deps.sh +: 1767079652:0;cd .. +: 1767079656:0;bash scripts/build_deps.sh +: 1767079723:0;cd third_party/llvm-project +: 1767079727:0;... +: 1767079728:0;lsa +: 1767079754:0;bash scripts/build_deps.sh +: 1767079770:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project +: 1767079780:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project/build +: 1767079887:0;lsa +: 1767079893:0;cd third_party/llvm-project +: 1767079894:0;lsa +: 1767079916:0;cd .. +: 1767079919:0;bash scripts/build_deps.sh +: 1767080057:0;ld --version +: 1767080075:0;bash scripts/build_deps.sh +: 1767080192:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project/third_party/llvm-project/build +: 1767080196:0;bash scripts/build_deps.sh +: 1767082332:0;bash scripts/build_cuda_tile.sh +: 1767082420:0;find ./ -name "libmlir_cuda_runtime.*" +: 1767082447:0;cd explore +: 1767082448:0;lsa +: 1767082498:0;export MLIR_CUDA_RUNTIME=../../cuda-tile/third_party/llvm/lib/libmlir_cuda_runtime.so +: 1767082510:0;find ../ -name "libmlir_cuda_runtime.*" +: 1767082537:0;find ../ -name "libmlir_runner_utils.*" +: 1767082559:0;export MLIR_RUNNER_UTILS=../third_party/llvm/lib/libmlir_runner_utils.so +: 1767082573:0;mlir-opt vec_add_gpu.mlir \\ + | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" \\ + | mlir-runner \\ + --shared-libs=$MLIR_CUDA_RUNTIME \\ + --shared-libs=$MLIR_RUNNER_UTILS \\ + --entry-point-result=void +: 1767082681:0;bash run.sh +: 1767082780:0;cd /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && ls -la && echo '---' && command -v cuda-tile-opt || true && command -v mlir-opt || true && command -v mlir-cpu-runner || true && command -v mlir-runner || true +: 1767082790:0;cd 
/root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && pwd && ls -უუ && echo '---' && ../third_party/llvm/bin/mlir-opt --version && echo '---' && ../third_party/llvm/bin/mlir-opt --help | grep -E "(print|memref)" | head -n 50 && echo '---' && ../third_party/llvm/bin/mlir-runner --help | head -n 80 +: 1767082798:0;echo TEST && uname -a && ls -la /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore | head +: 1767082874:0;cd /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && bash -lc 'set -euo pipefail; bash run.sh' +: 1767082911:0;cd /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && ../third_party/llvm/bin/mlir-runner --help | head -n 120 +: 1767083120:0;cd explore +: 1767083123:0;../third_party/llvm/bin/mlir-opt gpu.mlir +: 1767083341:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" +: 1767083444:0;../third_party/llvm/bin/mlir-opt gpu.mlir \\ + | ../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" +: 1767083480:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" gpu.mlir +: 1767084870:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_120" gpu.mlir +: 1767085381:0;../third_party/llvm/bin/mlir-opt gpu.mlir +: 1767085388:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_120" gpu.mlir +: 1767085453:0;bash run.sh +: 1767085885:0;../third_party/llvm/bin/mlir-opt gpu.mlir +: 1767085962:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_120" gpu.mlir +: 1767086084:0;file gpu.mlir\ +head -n 3 gpu.mlir | cat -A\ +xxd -g 1 -l 16 gpu.mlir\ + +: 1767086180:0;../third_party/llvm/bin/mlir-opt --version +: 1767086193:0;cd llvm-project +: 1767086198:0;- +: 1767086233:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_80" gpu.mlir +: 
1767086346:0;../third_party/llvm/bin/mlir-opt --gpu-to-llvm gpu.mlir +: 1767086602:0;bash run.sh +: 1767087204:0;../third_party/llvm/bin/mlir-translate example-nvvm.mlir \\ + --mlir-to-llvmir \\ + -o example.ll +: 1767087684:0;../third_party/llvm/bin/mlir-opt --show-dialect +: 1767087688:0;../third_party/llvm/bin/mlir-opt --show-dialects +: 1767087753:0;bash run.sh +: 1767088328:0;../third_party/llvm/bin/mlir-opt --help | grep -n "lower-host-to-llvm" +: 1767088361:0;bash run.sh +: 1767089170:0;history +: 1767089551:0;bash run.sh +: 1767092289:0;../third_party/llvm/bin/mlir-opt --help | grep -E "lower-host-to-llvm|convert-memref-to-llvm|finalize-memref-to-llvm" -n +: 1767092365:0;../third_party/llvm/bin/mlir-opt gpu.mlir \\ + --pass-pipeline="builtin.module(\ + nvvm-attach-target{chip=sm_80 O=3},\ + gpu.module(convert-gpu-to-nvvm),\ + gpu-module-to-binary,\ + lower-host-to-llvm\ + )" \\ + -print-ir-after-all -verify-each 2>&1 | grep -n "unrealized_conversion_cast" | head -n 50 +: 1767092396:0;bash run.sh +: 1767092722:0;../third_party/llvm/bin/mlir-opt --help | grep -nE "(^| )--lower-to-llvm( |$)|lower-to-llvm"\ +../third_party/llvm/bin/mlir-opt --help | grep -nE "convert-func-to-llvm|convert-cf-to-llvm|convert-arith-to-llvm"\ + +: 1767092809:0;bash run.sh +: 1767094172:0;cd llvm-project +: 1767094294:0;- +: 1767094298:0;bash run.sh +: 1767148594:0;cd /usr/local/cuda/lib64 +: 1767148595:0;lsa +: 1767148605:0;cd cmake +: 1767148606:0;lsa +: 1767148614:0;cd libcudacxx +: 1767148615:0;lsa +: 1767148622:0;cat libcudacxx-config.cmake +: 1767152044:0;git stash push +: 1767152135:0;git log +: 1767152965:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767152982:0;./build/Toy/toy-cuda sample/matmul.toy +: 1767153492:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767153543:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767153581:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767153607:0;./build/Toy/toy-cuda sample/matmul.toy 
-emit=mlir-affine +: 1767153730:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767153792:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767153800:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir-affine -opt +: 1767153884:0;git stash push +: 1767153914:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767153986:0;git stash pop +: 1767154234:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767154356:0;./build/Toy/toy-cuda sample/matmul.toy -emit=llvm --mlir-print-ir-after-all | tee a.log +: 1767154374:0;./build/Toy/toy-cuda sample/matmul.toy -emit=llvm --mlir-print-ir-after-all 2>&1 | tee a.log +: 1767156049:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767156506:0;git stash pop +: 1767163424:0;./third_party/llvm/bin/mlir-translate --mlir-to-llvmir explore/extern_fun.mlir > fun.ll +: 1767163510:0;clang -O2 fun.ll -Lbuild/Toy -lcuda_shim -lcudart_static -ldl -pthread -o fun +: 1767185426:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767186892:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767187900:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767190577:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767233188:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767233201:0;./build/Toy/toy-cuda sample/gpu.mlir -emit=mlir +: 1767238309:0;./build/Toy/toy-cuda --help +: 1767238334:0;./build/Toy/toy-cuda --help G grid +: 1767238439:0;./build/Toy/toy-cuda sample/gpu.mlir --grid=1,2,1 +: 1767238499:0;./build/Toy/toy-cuda sample/gpu.mlir +: 1767239646:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu +: 1767239659:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767259878:0;apt update -y && apt install -yq gdb +: 1767262830:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767262886:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767262901:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767263012:0;./build/Toy/toy-cuda sample/matmul.toy 
-emit=mlir +: 1767263016:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767263078:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767263439:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir -deubg=toy-gpu-outline +: 1767263450:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --deubg=toy-gpu-outline +: 1767263459:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug=toy-gpu-outline +: 1767263474:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767314974:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu +: 1767314983:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767315241:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug=toy-gpu-outline +: 1767315260:0;./build/Toy/toy-cuda --help G debug +: 1767315294:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767316318:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767317299:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767317548:0;git push --force-with-lease +: 1767318062:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767318405:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir-affine -opt +: 1767318476:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767318771:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-affine --grid=4,2,1 +: 1767340440:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767340896:0;cd third_party/cuda +: 1767340905:0;find ../ -name "lib*.a" +: 1767340913:0;find ./ -name "lib*.a" +: 1767341975:0;cd .. +: 1767341978:0;cd cuda-tile +: 1767341978:0;lsa +: 1767341985:0;... +: 1767341988:0;chown -R scotty:scotty ./ +: 1767341993:0;- +: 1767341995:0;lsa +: 1767341998:0;cd build +: 1767342000:0;cmake --install . +: 1767342180:0;cd .. +: 1767342185:0;lsa +: 1767342192:0;cd .. 
+: 1767342231:0;bash scripts/build_cuda_tile.sh +: 1767342672:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/cuda-tile/build/include +: 1767342681:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/cuda-tile/build/include/cuda_tile/Dialect/CudaTile/IR/Dialect.h +: 1767343260:0;- +: 1767343262:0;.. +: 1767343263:0;lsa +: 1767343268:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767343276:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767349256:0;cd examples +: 1767349259:0;cd map2d +: 1767349259:0;lsa +: 1767349262:0;bash build_app.sh +: 1767349264:0;lsa +: 1767349282:0;${WORKDIR}/build/bin/cuda-tile-opt example.mlir --mlir-print-ir-after-all -cse +: 1767349292:0;../..//build/bin/cuda-tile-opt example.mlir --mlir-print-ir-after-all -cse +: 1767349432:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767401976:0;lsa +: 1767401985:0;history +: 1767402311:0;cp ~/.zsh_history vscode/container_zsh_history diff --git a/mlir/cuda-tile/vscode/.initial_container.sh b/mlir/cuda-tile/vscode/.initial_container.sh new file mode 100644 index 0000000..a8555fa --- /dev/null +++ b/mlir/cuda-tile/vscode/.initial_container.sh @@ -0,0 +1,9 @@ +docker run -d --gpus all \ + --privileged -ti \ + --cap-add=SYS_ADMIN --cap-add=SYS_PTRACE \ + --shm-size 4G \ + --ulimit memlock=-1:-1 \ + --security-opt seccomp=unconfined --ipc=host \ + -v $PWD:/work -w /work \ + nvidia/cuda:13.0.0-devel-ubuntu22.04 bash + # bash -lc 'nsys --version && nsys profile --trace=cuda,nvtx,osrt --stats=true -o sysrep ./matmul' diff --git a/mlir/cuda-tile/vscode/.zsh_history b/mlir/cuda-tile/vscode/.zsh_history new file mode 100644 index 0000000..0b08425 --- /dev/null +++ b/mlir/cuda-tile/vscode/.zsh_history @@ -0,0 +1,267 @@ +: 1766810530:0;bash +: 1766822431:0;cat ~/.zshrc +: 1766822464:0;cat ~/.oh-my-zsh/custom/themes/grape.zsh-theme +: 1766822525:0;rm -rf ~/gitstatus +: 1766822564:0;lsa +: 
1766830123:0;./example +: 1766830127:0;lsa +: 1766830193:0;mv example example.tilebc Desktop +: 1766830195:0;cd Desktop +: 1766830196:0;lsa +: 1766830203:0;./example +: 1766837019:0;mkdir ~/.ssh +: 1766837033:0;vim ~/ssh/authroized_keys +: 1766837064:0;vim ~/.ssh/authroized_keys +: 1766837086:0;lsa .ssh +: 1766837123:0;chmod 700 ~/.ssh +: 1766837126:0;lsa .ssh +: 1766837160:0;cd .ssh +: 1766837163:0;mv authroized_keys authorized_keys +: 1766837177:0;lsa +: 1766838186:0;cd Desktop +: 1766838187:0;lsa +: 1766838192:0;rm -rf example_with_2inputs +: 1766838217:0;lsa +: 1766896433:0;cd Desktop +: 1766896438:0;cuobjdump example.cubin +: 1766896443:0;example.cubin +: 1766899405:0;vim /etc/systemd/resolved.conf +: 1766899421:0;sudo vim /etc/systemd/resolved.conf +: 1766899453:0;systemctl restart systemd-resolved.service +: 1766899621:0;ping www.google.com +: 1766901388:0;sudo reboot +: 1766901493:0;ping www.google.com +: 1766901518:0;docker pull alwaysproblem/fastdev-u2204:zsh +: 1766901899:0;cd Desktop +: 1766901901:0;lsa +: 1766901944:0;git clone https://github.com/llvm/llvm-project +: 1766902277:0;docker pull nvidia/cuda:13.1.0-devel-ubuntu22.04 +: 1766902571:0;docker pull alwaysproblem/fastdev-u2204:nv13.1.0 +: 1766902588:0;lsa +: 1766902611:0;btm +: 1766902673:0;lsa +: 1766902687:0;mkdir dockerVolumn +: 1766902688:0;lsa +: 1766902725:0;docker run --gpus all --privileged -ti --net=host --shm-size 4G --ulimit memlock=-1:-1 -d -it --name yyx-cuda-ir -v `pwd`:/root/Desktop/dockerVolumn alwaysproblem/fastdev-u2204:nv13.1.0 /bin/bash +: 1766902780:0;nvidia-smi +: 1766902796:0;bash +: 1766902817:0;lsmod | grep -E '^nvidia|nouveau' +: 1766902822:0;sudo dmesg | grep -iE 'nvrm|nvidia|nouveau' | tail -n 80\ + +: 1766902840:0;sudo modprobe nvidia +: 1766902847:0;sudo modprobe nvidia_uvm +: 1766902851:0;sudo modprobe nvidia_drm +: 1766902865:0;mokutil --sb-state +: 1766902876:0;sudo reboot +: 1766902915:0;nvidia-smi +: 1766902941:0;lsmod | grep -E '^nvidia|nouveau'\ + +: 
1766902958:0;sudo apt update +: 1766902982:0;sudo apt install -y build-essential dkms linux-headers-$(uname -r) +: 1766902997:0;dkms status +: 1766903014:0;which nvidia-smi +: 1766903019:0;dpkg -l | grep -E 'nvidia|libnvidia' | head +: 1766903031:0;modinfo nvidia | head +: 1766903107:0;nvidia-smi +: 1766903134:0;sudo dmesg | grep -iE 'nvrm|nvidia|nouveau|secure|dkms' | tail -n 120 +: 1766903150:0;nvidia-smi\ + +: 1766903155:0;mokutil --sb-state\ + +: 1766903160:0;lsmod | grep -E '^nvidia|nouveau'\ + +: 1766903168:0;modinfo nvidia || echo "nvidia module not found" +: 1766903175:0;sudo modprobe nvidia || echo "modprobe nvidia failed" +: 1766903186:0;sudo dmesg | grep -iE 'nvrm|nvidia|nouveau|secure|dkms' | tail -n 120 +: 1766903232:0;sudo nvidia-smi +: 1766903398:0;dkms status +: 1766903410:0;sudo dkms autoinstall\ + +: 1766903420:0;sudo dkms build nvidia/590.48.01 -k $(uname -r) +: 1766903448:0;sudo dkms install nvidia/590.48.01 -k $(uname -r) +: 1766903459:0;ls /lib/modules/$(uname -r)/updates/dkms | grep nvidia\ + +: 1766903467:0;sudo modprobe nvidia +: 1766903472:0;sudo modprobe nvidia_uvm +: 1766903477:0;sudo modprobe nvidia_drm +: 1766903482:0;nvidia-smi\ + +: 1766903516:0;cd Desk +: 1766903518:0;lsa +: 1766903525:0;mv llvm-project dockerVolumn +: 1766903529:0;docker run --gpus all --privileged -ti --net=host --shm-size 4G --ulimit memlock=-1:-1 -d -it --name yyx-cuda-ir -v `pwd`:/root/Desktop/dockerVolumn alwaysproblem/fastdev-u2204:nv13.1.0 /bin/bash +: 1766903536:0;docker rm -f /yyx-cuda-ir +: 1766903537:0;docker run --gpus all --privileged -ti --net=host --shm-size 4G --ulimit memlock=-1:-1 -d -it --name yyx-cuda-ir -v `pwd`:/root/Desktop/dockerVolumn alwaysproblem/fastdev-u2204:nv13.1.0 /bin/bash +: 1766924647:0;lsa +: 1766924666:0;git clone https://github.com/NVIDIA/cuda-tile.git +: 1766924671:0;cd cuda-tile +: 1766924672:0;lsa +: 1766924703:0;docker exec -ti yyx-cuda-ir zsh +: 1766926364:0;lsa +: 1766926373:0;cd dockerV +: 1766926379:0;cd 
Desktop/dockerVolumn +: 1766926381:0;lsa +: 1766926395:0;git clone https://github.com/Alwaysproblem/MLcompiler-tutorial +: 1766926544:0;vim ~/.ssh/authroized_keys +: 1766926555:0;vim ~/.ssh/authorized_keys +: 1766926879:0;cd dockerV +: 1766926880:0;lsa +: 1766927052:0;cd cuda-tile +: 1766927052:0;lsa +: 1766927063:0;cp -R ../llvm-project 3rdparty +: 1766927109:0;cd Desktop/dockerVolumn +: 1766927110:0;lsa +: 1766927113:0;cd .. +: 1766927114:0;lsa +: 1766927171:0;rm -rf cuda-tile/ llvm-project/ MLcompiler-tutorial +: 1766927174:0;lsa +: 1766928551:0;cd examples +: 1766928551:0;lsa +: 1766928554:0;cd map +: 1766928554:0;lsa +: 1766928558:0;./example +: 1766928677:0;lsa +: 1766928680:0;./example +: 1766931383:0;cd ../map2d +: 1766931387:0;./example +: 1766932138:0;git push +: 1766971092:0;cd map2d +: 1766971096:0;./example +: 1766978487:0;cd ../matmul +: 1766978489:0;./example +: 1766983673:0;ls /usr/local/cuda/bin | grep -E 'nsys|ncu' +: 1766983777:0;/usr/local/cuda/bin/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766983786:0;lsa +: 1766983797:0;sudo chown -R cheng:cheng ./ +: 1766983806:0;/usr/local/cuda/bin/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766983821:0;lsa +: 1766983914:0;/usr/local/cuda/bin/nsys profile --trace=cuda --stats=true -o sysrep ./example.cpp +: 1766983925:0;/usr/local/cuda/bin/nsys profile --trace=cuda --stats=true -o sysrep ./example +: 1766983939:0;lsa +: 1766984070:0;nvidia-smi -l 1\ + +: 1766984097:0;ldd ./matmul | grep -E 'libcuda|libcudart|libnvrtc' || true +: 1766984104:0;ldd ./example | grep -E 'libcuda|libcudart|libnvrtc' || true +: 1766984126:0;strace -f -e trace=openat,access -o /tmp/trace.log ./matmul 2>/dev/null || true +: 1766984132:0;strace -f -e trace=openat,access -o /tmp/trace.log ./example 2>/dev/null || true +: 1766984141:0;grep -nE 'libcuda|libcudart|nvidia|/dev/nvidia|nvrtc|ptxjit' /tmp/trace.log | head -n 80 +: 1766984195:0;nsys --version +: 1766984210:0;ls -l 
/usr/local/cuda/extras/CUPTI/lib64/libcupti.so* 2>/dev/null || true +: 1766984218:0;ldconfig -p | grep -i cupti || true +: 1766984299:0;sudo apt install -y nsight-systems +: 1766984319:0;sudo apt install -y nsight-systems-2025.5.2 +: 1766984348:0;nsys --version +: 1766984370:0;nsys profile --trace=cuda --stats=true -o sysrep ./example +: 1766984516:0;apt-cache search nsight | head -n 50 +: 1766984595:0;vim c/.zshrc +: 1766984608:0;vim ~/.zshrc +: 1766984633:0;ncu --version +: 1766984673:0;nsys --version +: 1766984840:0;nvidia-smi +: 1766984958:0;lsa +: 1766984965:0;ps -a G nv +: 1766984969:0;ps -a G example +: 1766984974:0;nvidia-smi +: 1766985007:0;sudo reboot +: 1766985235:0;nvidia-smi +: 1766985298:0;lsa +: 1766985304:0;cd matmul +: 1766985305:0;lsa +: 1766985308:0;docker image ls +: 1766985323:0;docker ps -a +: 1766985329:0;docker start yyx-cuda-ir +: 1766985334:0;lsa +: 1766985344:0;rm -rf sysrep.* +: 1766985347:0;lsa +: 1766985354:0;vim ~/.zshrc +: 1766985383:0;apt install nsight-compute +: 1766985393:0;sudo apt install -y nsight-compute +: 1766985407:0;sudo apt install -y nsight-compute-2025.4.0 +: 1766985464:0;ncu --set full --target-processes all -o matmul_kernel ./example +: 1766985493:0;lsa +: 1766985521:0;sudo nvidia-smi -pm 1 +: 1766985527:0;sudo nvidia-smi -i 0 -rgc +: 1766985532:0;sudo nvidia-smi -i 0 -rci\ + +: 1766985552:0;sudo tee /etc/modprobe.d/nvidia-prof.conf << 'EOF'\ +options nvidia NVreg_RestrictProfilingToAdminUsers=0\ +EOF +: 1766985559:0;sudo update-initramfs -u +: 1766985608:0;sudo reboot +: 1766985646:0;cd matmul +: 1766985664:0;ncu --set full --target-processes all -o matmul_kernel ./example +: 1766985677:0;nvidia-smi +: 1766985697:0;sudo apt install -y nsight-compute-2025.4.0 +: 1766985752:0;zsh: command not found: ncu +: 1766985759:0;sudo find / -type f -name ncu -o -name ncu-cli 2>/dev/null | head -n 50 +: 1766985817:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 
1766985837:0;vim ~/.zshrc +: 1766985849:0;lsa +: 1766987050:0;sudo chown -R cheng:cheng ./ +: 1766987054:0;lsa +: 1766987067:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766987073:0;lsa +: 1766987090:0;mv matmul_kernel.ncu-rep matmul_kernel.ncu-rep.baseline +: 1766987092:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766987987:0;mv matmul_kernel.ncu-rep matmul_kernel.ncu-rep.token +: 1766987988:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766988180:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel -f ./example +: 1767007315:0;lsa +: 1767007318:0;sudo chown -R cheng:cheng ./ +: 1767007349:0;git statsu +: 1767007352:0;git status +: 1767007879:0;cd ../CrossTileBlockCommunication +: 1767007882:0;./example +: 1767008718:0;lsa +: 1767008721:0;cd .. +: 1767008723:0;sudo chown -R cheng:cheng ./ +: 1767009097:0;cd tf32 +: 1767009099:0;./example +: 1767009274:0;lsa +: 1767009280:0;./example +: 1767009377:0;cd .. +: 1767009379:0;cd bf16 +: 1767009381:0;lsa +: 1767009383:0;./example +: 1767010202:0;lsa +: 1767010207:0;.. +: 1767010208:0;lsa +: 1767010210:0;c... +: 1767010213:0;... +: 1767010214:0;lsa +: 1767088913:0;cd cuda-tile +: 1767088916:0;cd .. +: 1767088919:0;cd dockerVolumn +: 1767088923:0;cd .. +: 1767088924:0;lsa +: 1767088933:0;cd cuda-tile +: 1767088934:0;lsa +: 1767088936:0;cd .. 
+: 1767088937:0;cd MLcompiler-tutorial +: 1767088942:0;cd mlir/cuda-tile +: 1767089068:0;../third_party/llvm/bin/mlir-runner example-nvvm.mlir \\ + --entry-point-result=void \\ + --shared-libs=${MLIR_RUNNER_UTILS} \\ + --shared-libs=${MLIR_CUDA_RUNTIME} +: 1767089081:0;cd explore +: 1767089083:0;export MLIR_RUNNER_UTILS=`pwd`/../third_party/llvm/lib/libmlir_runner_utils.so\ +export MLIR_CUDA_RUNTIME=`pwd`/../third_party/llvm/lib/libmlir_cuda_runtime.so +: 1767089086:0;../third_party/llvm/bin/mlir-runner example-nvvm.mlir \\ + --entry-point-result=void \\ + --shared-libs=${MLIR_RUNNER_UTILS} \\ + --shared-libs=${MLIR_CUDA_RUNTIME} +: 1767089105:0;apt install g++13 +: 1767089123:0;sudo apt update -y && apt install g++-13 gcc-13 +: 1767089152:0;sudo apt update -y && sudo apt install g++-13 gcc-13 +: 1767089182:0;sudo apt install -yq software-properties-common +: 1767089214:0;sudo apt update -y && sudo apt install g++-13 gcc-13 +: 1767089237:0;sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +: 1767089432:0;sudo apt update -y && sudo apt install g++-13 gcc-13 +: 1767089478:0;../third_party/llvm/bin/mlir-runner example-nvvm.mlir \\ + --entry-point-result=void \\ + --shared-libs=${MLIR_RUNNER_UTILS} \\ + --shared-libs=${MLIR_CUDA_RUNTIME} +: 1767401874:0;lsa +: 1767401877:0;docker image ls +: 1767402034:0;history +: 1767402502:0;cp ~/.zsh_history ~/Desktop/MLcompiler-tutorial/mlir/cuda-tile/vscode diff --git a/mlir/cuda-tile/vscode/c_cpp_properties.json b/mlir/cuda-tile/vscode/c_cpp_properties.json new file mode 100644 index 0000000..6cdddd8 --- /dev/null +++ b/mlir/cuda-tile/vscode/c_cpp_properties.json @@ -0,0 +1,17 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c11", + "cppStandard": "c++17", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.cmake-tools", + "mergeConfigurations": true, + "compileCommands": 
"${workspaceFolder}/build/compile_commands.json" + } + ], + "version": 4 +} diff --git a/mlir/cuda-tile/vscode/cmake-kits.json b/mlir/cuda-tile/vscode/cmake-kits.json new file mode 100644 index 0000000..da177bf --- /dev/null +++ b/mlir/cuda-tile/vscode/cmake-kits.json @@ -0,0 +1,13 @@ +[ + { + "name": "GCC in conda", + "compilers": { + "C": "/usr/bin/gcc", + "CXX": "/usr/bin/g++" + }, + "environmentSetupScript": "${workspaceFolder}/.envsetup.sh", + "preferredGenerator": { + "name": "Ninja" + } + } +] diff --git a/mlir/cuda-tile/vscode/launch.json b/mlir/cuda-tile/vscode/launch.json new file mode 100644 index 0000000..be1aeed --- /dev/null +++ b/mlir/cuda-tile/vscode/launch.json @@ -0,0 +1,33 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(gdb) 启动", + "type": "cppdbg", + "request": "launch", + "program": "${command:cmake.launchTargetPath}", + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb", + "setupCommands": [ + { + "description": "为 gdb 启用整齐打印", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "将反汇编风格设置为 Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + } + ] + } + ] +} diff --git a/mlir/cuda-tile/vscode/settings.json b/mlir/cuda-tile/vscode/settings.json new file mode 100644 index 0000000..58237aa --- /dev/null +++ b/mlir/cuda-tile/vscode/settings.json @@ -0,0 +1,118 @@ +{ + "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", + "cmake.debugConfig": { + "cwd": "${workspaceFolder}", + "args": [ + ] + }, + + "cmake.cmakePath": "/root/miniconda3/envs/mlir/bin/cmake", + "files.associations": { + "*.py": "python", + "*.mmd": "mermaid", + "*.dockfile": "dockerfile", + ".style.yapf": "ini", + "*.inc": "cpp", + "array": "cpp", + "atomic": 
"cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "complex": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "ostream": "cpp", + "shared_mutex": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "compare": "cpp", + "concepts": "cpp", + "numbers": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "any": "cpp", + "executor": "cpp", + "netfwd": "cpp" + }, + "cmake.configureArgs": [ + "-Wno-dev", + + // This is for non-conda users. 
+ // "-DMLIR_DIR=${workspaceFolder}/third_party/llvm/lib/cmake/mlir", + // "-DLLVM_DIR=${workspaceFolder}/third_party/llvm/lib/cmake/llvm", + // "-DMHLO_DIR=${workspaceFolder}/third_party/mhlo/lib/cmake/mlir-hlo", + // "-DCMAKE_MODULE_PATH=${workspaceFolder}/third_party/llvm/lib/cmake/mlir;${workspaceFolder}/third_party/llvm/lib/cmake/llvm;${workspaceFolder}/third_party/mhlo/lib/cmake/mlir-hlo", + // "-DMLIR_TABLEGEN_EXE=${workspaceFolder}/third_party/llvm/bin/mlir-tblgen", + + // This is for conda users. + "-DMLIR_TABLEGEN_EXEUTABLE:FILEPATH=/root/miniconda3/envs/mlir/bin/mlir-tblgen", + "-DCMAKE_MODULE_PATH=/root/miniconda3/envs/mlir/lib/cmake/mlir;/root/miniconda3/envs/mlir/lib/cmake/llvm", + ], + // "cmake.environment": { + // "LD_LIBRARY_PATH": "/root/miniconda3/envs/mlir/x86_64-conda-linux-gnu/lib:${env.LD_LIBRARY_PATH}" + // }, + "C_Cpp.clang_format_path": "${env.HOME}/miniconda3/envs/mlir/bin/clang-format", + "C_Cpp.codeAnalysis.clangTidy.path": "${env.HOME}/miniconda3/envs/mlir/bin/clang-tidy", + "cmakeFormat.exePath": "/root/miniconda3/envs/mlir/bin/cmake-format", + "C_Cpp.errorSquiggles": "enabled", + "C_Cpp.clang_format_sortIncludes": true, + "C_Cpp.codeAnalysis.clangTidy.enabled": true, + "C_Cpp.codeAnalysis.clangTidy.codeAction.formatFixes": true, + "C_Cpp.codeAnalysis.clangTidy.useBuildPath": true, + "C_Cpp.codeAnalysis.clangTidy.args": [ + "-p", + "${workspaceFolder}/build/compile_commands.json" + ], + "cmakeFormat.args": ["--config=${workspaceFolder}/.cmake-lint.yaml"], +}