diff --git a/mlir/cuda-tile/.clang-format b/mlir/cuda-tile/.clang-format new file mode 100644 index 0000000..34ac64a --- /dev/null +++ b/mlir/cuda-tile/.clang-format @@ -0,0 +1,5 @@ +BasedOnStyle: LLVM +LineEnding: LF +IndentWidth: 2 +TabWidth: 2 +UseTab: Never diff --git a/mlir/cuda-tile/.devcontainer/Dockerfile b/mlir/cuda-tile/.devcontainer/Dockerfile new file mode 100644 index 0000000..5ffc399 --- /dev/null +++ b/mlir/cuda-tile/.devcontainer/Dockerfile @@ -0,0 +1,41 @@ +FROM alwaysproblem/fastdev-u2204:nv13.1.0 + +ARG UID=1000 +ARG GID=1000 + +RUN echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" > /etc/apt/sources.list.d/llvm.list \ + && echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "# 20" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-20 main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "# 21" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-21 main" >> /etc/apt/sources.list.d/llvm.list \ + && echo "deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-21 main" >> /etc/apt/sources.list.d/llvm.list \ + && wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc \ + && apt update -y && \ + apt install -y \ + python3 python3-dev python3-setuptools python3-pip \ + libtinfo-dev zlib1g-dev \ + build-essential cmake ninja-build \ + clang-20 clang-tidy-20 clangd-20 cmake-format \ + clang-format-20 lldb-20 lld-20 libfmt-dev libspdlog-dev \ + && apt clean -y && rm -rf /var/lib/apt/lists/* \ + && update-alternatives --install /usr/bin/clang clang /usr/bin/clang-20 100 \ + && update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-20 100 \ + && update-alternatives --install /usr/bin/clangd clangd 
/usr/bin/clangd-20 100 \ + && update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-20 100 \ + && update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-20 100 \ + && update-alternatives --install /usr/bin/lld lld /usr/bin/lld-20 100 \ + && update-alternatives --install /usr/bin/lldb lldb /usr/bin/lldb-20 100 + +RUN apt update -y && apt install -yq software-properties-common \ + && add-apt-repository -y ppa:ubuntu-toolchain-r/test \ + && apt update -yq \ + && apt install -yq gcc-13 g++-13 gdb \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 100 \ + && apt clean -y && rm -rf /var/lib/apt/lists/* + +RUN git config --global --add safe.directory '*' && \ + /root/.local/bin/setup_new_user ${UID} ${GID} && \ + python3 -m pip install pre-commit compdb diff --git a/mlir/cuda-tile/.devcontainer/devcontainer.json b/mlir/cuda-tile/.devcontainer/devcontainer.json new file mode 100644 index 0000000..927324f --- /dev/null +++ b/mlir/cuda-tile/.devcontainer/devcontainer.json @@ -0,0 +1,77 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda +{ + "remoteUser": "root", + "name": "mlir-example", + "workspaceMount": "source=${localWorkspaceFolder},target=${localWorkspaceFolder}/../../../MLcompiler-tutorial/mlir/${localWorkspaceFolderBasename},type=bind", + "workspaceFolder": "/root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/${localWorkspaceFolderBasename}", + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "Dockerfile", + "options": [ + "--network=host" + ], + "args": { + "UID": "1000", + "GID": "1000" + } + }, + // Features to add to the dev container. More info: https://containers.dev/features. 
+ // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "python --version", + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" + "privileged": true, + // "capAdd": ["SYS_PTRACE"], + "mounts": [ + { + "source": "${localWorkspaceFolder}/../../../", + "target": "/root/Desktop/dockerVolumn", + "type": "bind" + } + ], + "runArgs": [ + // "--cap-add=SYS_PTRACE", + // "--security-opt", + // "seccomp=unconfined", + "--name", + // "${localEnv:USER}-tvm", + "yyx-cuda-tile", + // "-v", + // "/data/rech/yongxiy/Desktop/dockerVolumn:/root/Desktop/dockerVolumn" + ], + "customizations": { + "vscode": { + "extensions": [ + "jeff-hykin.better-cpp-syntax", + "aaron-bond.better-comments", + "ms-vscode.cpptools-themes", + "revng.llvm-ir", + "jakob-erzar.llvm-tablegen", + "MomenAbdelkarim-WyattCalandro-LuisPrieto.mlir", + "ms-vscode.cpptools", + "ms-vscode.cpptools-extension-pack", + "twxs.cmake", + "josetr.cmake-language-support-vscode", + "ms-vscode.cmake-tools", + "cheshirekow.cmake-format", + "yzhang.markdown-all-in-one", + "bierner.markdown-preview-github-styles", + "bierner.markdown-mermaid", + "DavidAnson.vscode-markdownlint", + "llvm-vs-code-extensions.vscode-mlir", + "llvm-vs-code-extensions.vscode-clangd", + "llvm-vs-code-extensions.lldb-dap", + "mutantdino.resourcemonitor", + "hoovercj.vscode-power-mode", + "GitHub.copilot-chat", + "Codereviewforgithubcopilot.github-copilot-code-review" + ] + } + } +} diff --git a/mlir/cuda-tile/.devcontainer/noop.txt b/mlir/cuda-tile/.devcontainer/noop.txt new file mode 100644 index 0000000..49de88d --- /dev/null +++ b/mlir/cuda-tile/.devcontainer/noop.txt @@ -0,0 +1,3 @@ +This file copied into the container along with 
environment.yml* from the parent +folder. This file is included to prevent the Dockerfile COPY instruction from +failing if no environment.yml is found. diff --git a/mlir/cuda-tile/.envsetup.sh b/mlir/cuda-tile/.envsetup.sh new file mode 100644 index 0000000..b4f37a3 --- /dev/null +++ b/mlir/cuda-tile/.envsetup.sh @@ -0,0 +1 @@ +source /root/miniconda3/etc/profile.d/conda.sh && conda activate mlir diff --git a/mlir/cuda-tile/.gitignore b/mlir/cuda-tile/.gitignore new file mode 100644 index 0000000..288c272 --- /dev/null +++ b/mlir/cuda-tile/.gitignore @@ -0,0 +1,3 @@ +*.ptx +*.cubin +*.fatbin diff --git a/mlir/cuda-tile/.pre-commit-config.yaml b/mlir/cuda-tile/.pre-commit-config.yaml new file mode 100644 index 0000000..5736549 --- /dev/null +++ b/mlir/cuda-tile/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: check-yaml + - id: trailing-whitespace + - id: end-of-file-fixer + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: 'v14.0.6' + hooks: + - id: clang-format + types_or: [c++, c] + +- repo: https://github.com/cheshirekow/cmake-format-precommit + rev: v0.6.10 + hooks: + - id: cmake-format + - id: cmake-lint diff --git a/mlir/cuda-tile/CMakeLists.txt b/mlir/cuda-tile/CMakeLists.txt new file mode 100644 index 0000000..9eb4fb1 --- /dev/null +++ b/mlir/cuda-tile/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.10) + +# note: fix ztd terminfo not found +project(cuda-tile LANGUAGES C CXX) + +# ############## For conda users.################################ +find_package(LLVM CONFIG REQUIRED) +find_package(MLIR CONFIG REQUIRED) +# set(MLIR_TABLEGEN_EXE /root/anaconda3/envs/mlir/bin/mlir-tblgen) +# ############################################################################## + +message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") +message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") +message(STATUS "Found MLIR ${MLIR_PACKAGE_VERSION}")
+message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") +message(STATUS "Found MLIRTableGen: ${MLIR_TABLEGEN_EXE}") +message(STATUS "LLVM_INCLUDE_DIR include dir: ${LLVM_INCLUDE_DIR}") +message(STATUS "MLIR_INCLUDE_DIR include dir: ${MLIR_INCLUDE_DIR}") + +# This is for non-conda users. +find_package(LLVM CONFIG PATHS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib/cmake/llvm) +find_package(MLIR CONFIG PATHS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/lib/cmake/mlir) +find_package(CUDAToolkit REQUIRED) +# set(MLIR_TABLEGEN_EXE ${CMAKE_CURRENT_SOURCE_DIR}/third_party/bin/mlir-tblgen) +message(STATUS "CUDA Toolkit found: ${CUDAToolkit_INCLUDE_DIRS}") +message(STATUS "CUDA_TILE_SOURCE_DIR include dir: ${CUDA_TILE_SOURCE_DIR}") +message(STATUS "CUDA_TILE_BINARY_DIR include dir: ${CUDA_TILE_BINARY_DIR}") + +include_directories(${LLVM_INCLUDE_DIR}) +include_directories(${MLIR_INCLUDE_DIR}) +include_directories(${CUDAToolkit_INCLUDE_DIRS}) +include_directories(${CUDA_TILE_SOURCE_DIR}/include) +include_directories(${CUDA_TILE_BINARY_DIR}/include) + +include(LLVMDistributionSupport) +include(TableGen) +include(AddMLIR) +include(AddLLVM) +# include(HandleLLVMOptions) + +# note: fix the llvm::cl undefined reference problem +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -fno-rtti") +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + +add_subdirectory(Toy) diff --git a/mlir/cuda-tile/README.md b/mlir/cuda-tile/README.md new file mode 100644 index 0000000..7ca5b87 --- /dev/null +++ b/mlir/cuda-tile/README.md @@ -0,0 +1,569 @@ +# Standalone environment for MLIR tutorial. + +**NB: The code of this tutorial is from the [mlir-Toy-Example-tutorial](https://mlir.llvm.org/docs/Tutorials/Toy/Ch-1/) and [mlir-transform-tutorial](https://mlir.llvm.org/docs/Tutorials/transform/). +This repo only provide a simple way to setting up the environment. 
The toy file used in mlir-example all be in [example directory](../example/) and `Ch1-Ch7` is the Toy tutorial example code `Ch8` is an naive example to add `toy.matmul` operation and `transform_Ch2-H` is for transform dialect tutorials** + +## Environment Setup + +### Environment Preparation with conda (Optional) + +- OS must be higher than ubuntu 22.04. +- install gcc-13 and g++-13 + +```bash +apt update -y && \ +apt install -yq gcc-13 g++-13 +# apt install -yq software-properties-common \ +# add-apt-repository -y ppa:ubuntu-toolchain-r/test \ +# apt update -y +# apt install -yq gcc-11 g++-11 +update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 20 +update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 20 +``` + +- install cmake and ninja you can choose one way you like. conda is best for me. + +```bash +conda create -n mlir -y +conda activate mlir +# conda install cmake ninja clang-format clang lld ncurses mlir llvm -c conda-forge +conda install cmake ninja clang-format clang clang-tools mlir zlib spdlog fmt lit llvm=19.* -c conda-forge -y +# create -n mlir cmake ninja clang-format clang mlir zlib spdlog fmt lit llvm -c conda-forge -y +``` + +- build example with conda + +```bash +cd example +bash build_with_conda.sh all +``` + +### Environment Preparation with dev containers + +Please choose the `Dev Containers: Open Folder in Container...` + +- build example with dev containers + +```bash +cd example +bash scripts/sync_deps.sh +bash scripts/build_deps.sh +bash scripts/build_cuda_tile.sh +bash build.sh all +``` + +## Configure the Clangd + +```bash +cd example +# after you configure the project with cmake, you can configure the clangd by run the following command +compdb -p build list > compile_commands.json +``` + +## Run These code and understand mlir + +### Toy Examples + +- Ch1 + +```bash +$./build/Ch1/mlir-example-ch1 Ch1/example.toy -emit=ast +# Module: +# Function +# Proto 'main' @Ch1/example.toy:1:1 +# Params: [] +# Block { +# VarDecl 
a<> @Ch1/example.toy:4:3 +# Literal: <2, 3>[ <3>[ 1.000000e+00, 2.000000e+00, 3.000000e+00], <3>[ 4.000000e+00, 5.000000e+00, 6.000000e+00]] @Ch1/example.toy:4:11 +# VarDecl b<2, 3> @Ch1/example.toy:8:3 +# Literal: <6>[ 1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00] @Ch1/example.toy:8:17 +# Print [ @Ch1/example.toy:12:3 +# BinOp: * @Ch1/example.toy:12:24 +# Call 'transpose' [ @Ch1/example.toy:12:9 +# var: a @Ch1/example.toy:12:19 +# ] +# Call 'transpose' [ @Ch1/example.toy:12:24 +# var: b @Ch1/example.toy:12:34 +# ] +# ] +# } // Block +``` + +- Ch2 + +```bash +$./build/Ch2/mlir-example-ch2 Ch2/codegen.toy -emit=mlir +# module { +# toy.func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.mul %0, %1 : tensor<*xf64> +# toy.return %2 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> +# %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# toy.print %5 : tensor<*xf64> +# toy.return +# } +# } +``` + +- Ch3 + +```bash +$./build/Ch3/mlir-example-ch3 Ch3/opt.toy -emit=mlir +# module { +# toy.func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%0 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.transpose(%1 : tensor<*xf64>) to 
tensor<*xf64> +# %3 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.return %4 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> +# %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %6 = toy.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64> +# %7 = toy.reshape(%6 : tensor<2xf64>) to tensor<2x1xf64> +# %8 = toy.reshape(%7 : tensor<2x1xf64>) to tensor<2x1xf64> +# %9 = toy.reshape(%8 : tensor<2x1xf64>) to tensor<2x1xf64> +# toy.print %5 : tensor<*xf64> +# toy.return +# } +# } +$./build/Ch3/mlir-example-ch3 Ch3/opt.toy -emit=mlir -opt +# module { +# toy.func @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.mul %0, %1 : tensor<*xf64> +# toy.return %2 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.generic_call @multiply_transpose(%0, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %3 = toy.generic_call @multiply_transpose(%1, %0) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# toy.print %3 : tensor<*xf64> +# 
toy.return +# } +# } +``` + +- Ch4 + +```bash +$./build/Ch4/mlir-example-ch4 Ch4/opt.toy -emit=mlir +# module { +# toy.func private @multiply_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%0 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.transpose(%1 : tensor<*xf64>) to tensor<*xf64> +# %3 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.return %4 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<2x3xf64> +# %4 = toy.generic_call @multiply_transpose(%1, %3) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %5 = toy.generic_call @multiply_transpose(%3, %1) : (tensor<2x3xf64>, tensor<2x3xf64>) -> tensor<*xf64> +# %6 = toy.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64> +# %7 = toy.reshape(%6 : tensor<2xf64>) to tensor<2x1xf64> +# %8 = toy.reshape(%7 : tensor<2x1xf64>) to tensor<2x1xf64> +# %9 = toy.reshape(%8 : tensor<2x1xf64>) to tensor<2x1xf64> +# toy.print %5 : tensor<*xf64> +# toy.return +# } +# } +$./build/Ch4/mlir-example-ch4 Ch4/opt.toy -emit=mlir -opt +# module { +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %2 = toy.mul %1, %1 : tensor<3x2xf64> +# toy.print %2 : tensor<3x2xf64> +# toy.return +# } +# } +``` + +- Ch5 + +```bash +$ ./build/Ch5/mlir-example-ch5 Ch5/example.toy -emit=mlir-affine +# module { +# func.func @main() { +# %cst = 
arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %2[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %2[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %2[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %2[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %2[1, 1] : memref<2x3xf64> +# affine.store %cst, %2[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = arith.mulf %3, %3 : f64 +# affine.store %4, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } +# } +$ ./build/Ch5/mlir-example-ch5 Ch5/example.toy -emit=mlir-affine -opt +# module { +# func.func @main() { +# %cst = arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %1[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %1[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %1[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %1[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %1[1, 1] : memref<2x3xf64> +# affine.store %cst, 
%1[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %2 = affine.load %1[%arg1, %arg0] : memref<2x3xf64> +# %3 = arith.mulf %2, %2 : f64 +# affine.store %3, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %1 : memref<2x3xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } +# } +``` + +- Ch6 + +```bash +$ ./build/Ch6/mlir-example-ch6 Ch6/example.toy -emit=jit +# 1.000000 16.000000 +# 4.000000 25.000000 +# 9.000000 36.000000 + +$ ./build/Ch6/mlir-example-ch6 Ch6/example.toy -emit=llvm --mlir-print-ir-after-all +# // -----// IR Dump After Canonicalizer (canonicalize) //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<*xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.print %4 : tensor<*xf64> +# toy.return +# } + +# // -----// IR Dump After Inliner (inline) //----- // +# module { +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<*xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<*xf64> +# %4 = toy.mul %2, %3 : tensor<*xf64> +# toy.print %4 : tensor<*xf64> +# toy.return +# } +# } + + +# // -----// IR Dump After {anonymous}::ShapeInferencePass () //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# 
%1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<3x2xf64> +# %4 = toy.mul %2, %3 : tensor<3x2xf64> +# toy.print %4 : tensor<3x2xf64> +# toy.return +# } + +# // -----// IR Dump After Canonicalizer (canonicalize) //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %2 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %3 = toy.transpose(%1 : tensor<2x3xf64>) to tensor<3x2xf64> +# %4 = toy.mul %2, %3 : tensor<3x2xf64> +# toy.print %4 : tensor<3x2xf64> +# toy.return +# } + +# // -----// IR Dump After CSE (cse) //----- // +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.transpose(%0 : tensor<2x3xf64>) to tensor<3x2xf64> +# %2 = toy.mul %1, %1 : tensor<3x2xf64> +# toy.print %2 : tensor<3x2xf64> +# toy.return +# } + +# // -----// IR Dump After {anonymous}::ToyToAffineLoweringPass () //----- // +# module { +# func.func @main() { +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# %c0 = arith.constant 0 : index +# %c1 = arith.constant 1 : index +# %c2 = arith.constant 2 : index +# %cst = arith.constant 1.000000e+00 : f64 +# affine.store %cst, %2[%c0, %c0] : memref<2x3xf64> +# %cst_0 = arith.constant 2.000000e+00 : f64 +# affine.store %cst_0, %2[%c0, %c1] : memref<2x3xf64> +# %cst_1 = arith.constant 3.000000e+00 : f64 +# affine.store %cst_1, %2[%c0, %c2] : memref<2x3xf64> +# %cst_2 = arith.constant 4.000000e+00 : f64 +# 
affine.store %cst_2, %2[%c1, %c0] : memref<2x3xf64> +# %cst_3 = arith.constant 5.000000e+00 : f64 +# affine.store %cst_3, %2[%c1, %c1] : memref<2x3xf64> +# %cst_4 = arith.constant 6.000000e+00 : f64 +# affine.store %cst_4, %2[%c1, %c2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %5 = arith.mulf %3, %4 : f64 +# affine.store %5, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } +# } + + +# // -----// IR Dump After Canonicalizer (canonicalize) //----- // +# func.func @main() { +# %cst = arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %2[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %2[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %2[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %2[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %2[1, 1] : memref<2x3xf64> +# affine.store %cst, %2[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = affine.load 
%1[%arg0, %arg1] : memref<3x2xf64> +# %5 = arith.mulf %3, %4 : f64 +# affine.store %5, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } + +# // -----// IR Dump After CSE (cse) //----- // +# func.func @main() { +# %cst = arith.constant 6.000000e+00 : f64 +# %cst_0 = arith.constant 5.000000e+00 : f64 +# %cst_1 = arith.constant 4.000000e+00 : f64 +# %cst_2 = arith.constant 3.000000e+00 : f64 +# %cst_3 = arith.constant 2.000000e+00 : f64 +# %cst_4 = arith.constant 1.000000e+00 : f64 +# %0 = memref.alloc() : memref<3x2xf64> +# %1 = memref.alloc() : memref<3x2xf64> +# %2 = memref.alloc() : memref<2x3xf64> +# affine.store %cst_4, %2[0, 0] : memref<2x3xf64> +# affine.store %cst_3, %2[0, 1] : memref<2x3xf64> +# affine.store %cst_2, %2[0, 2] : memref<2x3xf64> +# affine.store %cst_1, %2[1, 0] : memref<2x3xf64> +# affine.store %cst_0, %2[1, 1] : memref<2x3xf64> +# affine.store %cst, %2[1, 2] : memref<2x3xf64> +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %2[%arg1, %arg0] : memref<2x3xf64> +# affine.store %3, %1[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# affine.for %arg0 = 0 to 3 { +# affine.for %arg1 = 0 to 2 { +# %3 = affine.load %1[%arg0, %arg1] : memref<3x2xf64> +# %4 = arith.mulf %3, %3 : f64 +# affine.store %4, %0[%arg0, %arg1] : memref<3x2xf64> +# } +# } +# toy.print %0 : memref<3x2xf64> +# memref.dealloc %2 : memref<2x3xf64> +# memref.dealloc %1 : memref<3x2xf64> +# memref.dealloc %0 : memref<3x2xf64> +# return +# } + +# // -----// IR Dump After {anonymous}::ToyToLLVMLoweringPass () //----- // +# module { +# llvm.func @free(!llvm.ptr) +# llvm.mlir.global internal constant @nl("\0A\00") +# llvm.mlir.global internal constant @frmt_spec("%f \00") +# llvm.func @printf(!llvm.ptr, ...) -> i32 +# llvm.func @malloc(i64) -> !llvm.ptr +# llvm.func @main() { +# ... 
+``` + +- Ch7 + +```bash +$ ./build/Ch7/mlir-example-ch7 Ch7/struct-codegen.toy -emit=jit +# 1.000000 16.000000 +# 4.000000 25.000000 +# 9.000000 36.000000 +``` + +- Ch8 + +```bash +$ ./build/Ch8/mlir-example-ch8 Ch8/matmul.toy.mlir -emit=mlir +# module { +# toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { +# %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> +# %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> +# %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> +# toy.return %2 : tensor<*xf64> +# } +# toy.func @main() { +# %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> +# %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> +# %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> +# %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> +# %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> +# toy.print %4 : tensor<*xf64> +# toy.return +# } +# } +``` + +```bash +$ ./build/Ch8/mlir-example-ch8 Ch8/matmul.toy -emit=jit +# 14.000000 32.000000 +# 32.000000 77.000000 +``` + +### Transform Dialect + +Please follow the [mlir-transform-tutorial](https://mlir.llvm.org/docs/Tutorials/transform/). If you have some questions about the way to run these examples, please check the top lines of each mlir file.
+ +- transform Ch2 + +```bash +$ ./build/transform_Ch2/transform-opt-ch2 --transform-interpreter transform_Ch2/ops.mlir +# module { +# func.func private @orig() +# func.func private @updated() +# func.func @test() { +# call @updated() : () -> () # <---- This will be changed to @updated from @orig +# return +# } +# module attributes {transform.with_named_sequence} { +# transform.named_sequence @__transform_main(%arg0: !transform.any_op) { +# %0 = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op +# transform.my.change_call_target %0, "updated" : !transform.any_op +# transform.yield +# } +# } +# } +``` + +- transform Ch3 + +```bash +$ ./build/transform_Ch3/transform-opt-ch3 --transform-interpreter transform_Ch3/ops.mlir --allow-unregistered-dialect --split-input-file +# module { +# func.func private @orig() +# func.func private @updated() +# func.func @test1() { +# call @updated() : () -> () +# return +# } +# module attributes {transform.with_named_sequence} { +# transform.named_sequence @__transform_main(%arg0: !transform.any_op) { +# %0 = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> +# transform.my.change_call_target %0, "updated" : !transform.op<"func.call"> +# transform.yield +# } +# } +# } + +# // ----- +# module { +# func.func private @orig() +# func.func @test2() { +# "my.mm4"() : () -> () +# return +# } +# module attributes {transform.with_named_sequence} { +# transform.named_sequence @__transform_main(%arg0: !transform.any_op) { +# %0 = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface +# %1 = transform.my.call_to_op %0 : (!transform.my.call_op_interface) -> !transform.any_op +# transform.yield +# } +# } +# } +``` diff --git a/mlir/cuda-tile/Toy/CMakeLists.txt b/mlir/cuda-tile/Toy/CMakeLists.txt new file mode 100644 index 0000000..0f4f6b9 --- /dev/null +++ 
b/mlir/cuda-tile/Toy/CMakeLists.txt @@ -0,0 +1,70 @@ +# For a better template to copy, see examples/standalone +include_directories(include) +add_subdirectory(include) + +set(LLVM_LINK_COMPONENTS + Core + Support + nativecodegen + OrcJIT + ) + +set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) +mlir_tablegen(ToyCombine.inc -gen-rewriters) +add_public_tablegen_target(ToyCudaCombineIncGen) + +add_subdirectory(cuda_wrapper) + +add_executable( + toy-cuda + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp + mlir/Dialect.cpp + mlir/LowerToAffineLoops.cpp + mlir/LowerToLLVM.cpp + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp + mlir/LowerToGpu.cpp + mlir/LowerToCudaTile.cpp + mlir/EmitCudaTile.cpp + ) + +add_dependencies(toy-cuda + ToyCudaShapeInferenceInterfaceIncGen + ToyCudaOpsIncGen + ToyCudaCombineIncGen + ) + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +target_link_directories(toy-cuda PRIVATE ${CUDA_TILE_BINARY_DIR}/lib) + +target_link_libraries(toy-cuda + PRIVATE + MLIRAnalysis + MLIRBuiltinToLLVMIRTranslation + MLIRCallInterfaces + MLIRCastInterfaces + MLIRExecutionEngine + MLIRFunctionInterfaces + MLIRIR + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRLLVMToLLVMIRTranslation + MLIRMemRefDialect + MLIRParser + MLIRPass + MLIRRegisterAllDialects + MLIRRegisterAllExtensions + MLIRRegisterAllPasses + MLIRSideEffectInterfaces + MLIRSupport + MLIRTargetLLVMIRExport + MLIRTransforms + CudaTileDialect + CudaTileTransforms + CudaTileBytecodeWriter + CudaTileBytecodeCommon + cuda_shim + ) diff --git a/mlir/cuda-tile/Toy/cuda_wrapper/CMakeLists.txt b/mlir/cuda-tile/Toy/cuda_wrapper/CMakeLists.txt new file mode 100644 index 0000000..39b0a8a --- /dev/null +++ b/mlir/cuda-tile/Toy/cuda_wrapper/CMakeLists.txt @@ -0,0 +1,3 @@ +add_library(cuda_shim STATIC cuda_shim.cpp) +target_include_directories(cuda_shim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) +target_link_libraries(cuda_shim PRIVATE CUDA::cuda_driver 
CUDA::cudart_static) diff --git a/mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp b/mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp new file mode 100644 index 0000000..3b5d351 --- /dev/null +++ b/mlir/cuda-tile/Toy/cuda_wrapper/cuda_shim.cpp @@ -0,0 +1,528 @@ +//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements C wrappers around the CUDA library for easy linking in ORC jit. +// Also adds some debugging helpers that are helpful when writing MLIR code to +// run on GPUs. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "cuda.h" +#include "cuda_bf16.h" +#include "cuda_fp16.h" +#include + +// We assume the program runs on the linux platform if not on Windows. +// Copy from +// third_party/llvm-project/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp + +#if CUDA_VERSION >= 13000 + +#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default"))) + +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) + +thread_local static int32_t defaultDevice = 0; + +/// Helper method that checks environment value for debugging. +static bool isDebugEnabled() { + const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG"; + static bool isEnabled = getenv(kDebugEnvironmentVariable) != nullptr; + return isEnabled; +} + +#define debug_print(fmt, ...) 
\ + do { \ + if (isDebugEnabled()) \ + fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \ + __func__, __VA_ARGS__); \ + } while (0) + +// Returns default CUdevice +static CUdevice getDefaultCuDevice() { + CUdevice device; + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + return device; +} + +// Make the primary context of the current default device current for the +// duration +// of the instance and restore the previous context on destruction. +class ScopedContext { +public: + ScopedContext() { + // Static reference to CUDA primary context for device ordinal + // defaultDevice. + static CUcontext context = [] { + CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); + CUcontext ctx; + // Note: this does not affect the current context. + CUDA_REPORT_IF_ERROR( + cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice())); + return ctx; + }(); + + CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); + } + + ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } +}; + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { + ScopedContext scopedContext; + CUmodule module = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data, + int optLevel) { + ScopedContext scopedContext; + CUmodule module = nullptr; + char jitErrorBuffer[4096] = {0}; + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_OPTIMIZATION_LEVEL}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast(sizeof(jitErrorBuffer)), + reinterpret_cast(optLevel)}; + + CUresult result = + cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals); + if (result) { + fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer); + CUDA_REPORT_IF_ERROR(result); + } + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 
mgpuModuleUnload(CUmodule module) { + CUDA_REPORT_IF_ERROR(cuModuleUnload(module)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction +mgpuModuleGetFunction(CUmodule module, const char *name) { + CUfunction function = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); + return function; +} + +// The wrapper uses intptr_t instead of CUDA's unsigned int to match +// the type of MLIR's index type. This avoids the need for casts in the +// generated MLIR code. +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, + intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, CUstream stream, void **params, + void **extra, size_t /*paramsCount*/) { + ScopedContext scopedContext; + if (smem > 0) { + // Avoid checking driver as it's more expensive than if statement + int32_t maxShmem = 0; + CUdevice device = getDefaultCuDevice(); + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( + &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + if (maxShmem < smem) { + fprintf(stderr, + "Requested shared memory (%dkb) is larger than maximum allowed " + "shared memory (%dkb) for this device\n", + smem, maxShmem); + } + CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( + function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); + } + debug_print("Launching kernel, grid=%ld,%ld,%ld, " + "threads: %ld, %ld, %ld, " + "smem: %dkb\n", + gridX, gridY, gridZ, blockX, blockY, blockZ, smem); + CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, + blockY, blockZ, smem, stream, params, + extra)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { + ScopedContext scopedContext; + CUstream stream = nullptr; + CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + return stream; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 
mgpuStreamDestroy(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuStreamSynchronize(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, + CUevent event) { + CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { + ScopedContext scopedContext; + CUevent event = nullptr; + CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); + return event; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, + CUstream stream) { + CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) { + ScopedContext scopedContext; + CUdeviceptr ptr = 0; + if (sizeBytes == 0) + return reinterpret_cast(ptr); + + if (isHostShared) { + CUDA_REPORT_IF_ERROR( + cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL)); + return reinterpret_cast(ptr); + } + CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); + return reinterpret_cast(ptr); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr, + CUstream /*stream*/) { + CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(ptr))); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast(dst), + reinterpret_cast(src), + sizeBytes, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream 
stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { + defaultDevice = device; +} + +// ===----------------------------------------------------------------------===// + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCtxSynchronize() { + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuCtxSynchronize()); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyHtoD(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyHtoD(reinterpret_cast(dst), src, sizeBytes)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyDtoH(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyDtoH(dst, reinterpret_cast(src), sizeBytes)); +} + +//===----------------------------------------------------------------------===// + +static inline CUdeviceptr asDevPtr(uint64_t h) { + return static_cast(h); +} +static inline uint64_t asHandle(CUdeviceptr p) { + return static_cast(p); +} + +static inline CUstream asStream(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asStreamHandle(CUstream s) { + return static_cast(reinterpret_cast(s)); +} + +static inline CUevent asEvent(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asEventHandle(CUevent e) { + return static_cast(reinterpret_cast(e)); +} + +static inline void *asHostPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline const void *asHostCPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} + +// Align up helper +static inline uint64_t alignUp(uint64_t x, uint64_t a) { + return (x + (a - 1)) & ~(a - 1); +} + +// Load module from PTX 
or CUBIN image in memory. +// Driver API supports cuModuleLoadDataEx for both PTX and cubin (it +// auto-detects). +extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr, + uint64_t image_nbytes) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoad(data, image_nbytes); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr, + uint64_t image_nbytes, + int opt_level) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoadJIT(data, opt_level); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t +cuda_shim_load_module_from_file(uint64_t file_path_ptr, + uint64_t /*file_path_nbytes*/) { + auto file_path_cstr = + reinterpret_cast(asHostCPtr(file_path_ptr)); + // fprintf(stdout, "%s", file_path_cstr); + CUmodule module = nullptr; + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuModuleLoad(&module, file_path_cstr)); + return static_cast(reinterpret_cast(module)); +} + +extern "C" void cuda_shim_unload_module(uint64_t module_handle) { + CUmodule module = + reinterpret_cast(static_cast(module_handle)); + mgpuModuleUnload(module); +} + +extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream, + bool is_host_shared) { + CUstream cu_stream = asStream(stream); + if (stream == 0) + cu_stream = nullptr; + void *ptr = mgpuMemAlloc(nbytes, /*stream=*/cu_stream, + /*isHostShared=*/is_host_shared); + return static_cast(reinterpret_cast(ptr)); +} + +extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream) { + CUstream cu_stream = asStream(stream); + void *ptr = reinterpret_cast(static_cast(dptr)); + if (stream == 0) { + cu_stream = nullptr; + } + mgpuMemFree(ptr, /*stream=*/cu_stream); +} + +extern "C" void cuda_shim_memset32(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); 
+ CUstream cu_stream = asStream(stream); + mgpuMemset32(ptr, value, count_dwords, cu_stream); +} + +extern "C" void cuda_shim_memset16(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); + CUstream cu_stream = asStream(stream); + mgpuMemset16(ptr, value, count_dwords, cu_stream); +} + +extern "C" uint64_t cuda_shim_stream_create(void) { + CUstream stream = mgpuStreamCreate(); + return asStreamHandle(stream); +} + +extern "C" void cuda_shim_stream_destroy(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamDestroy(cu_stream); +} + +extern "C" void cuda_shim_stream_synchronize(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamSynchronize(cu_stream); +} + +extern "C" uint64_t cuda_shim_event_create(void) { + CUevent event = mgpuEventCreate(); + return asEventHandle(event); +} + +extern "C" void cuda_shim_event_destroy(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventDestroy(event); +} + +extern "C" void cuda_shim_event_record(uint64_t ev, uint64_t stream) { + CUevent event = asEvent(ev); + CUstream cu_stream = asStream(stream); + mgpuEventRecord(event, cu_stream); +} + +extern "C" void cuda_shim_event_synchronize(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventSynchronize(event); +} + +extern "C" void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev) { + CUstream cu_stream = asStream(stream); + CUevent event = asEvent(ev); + mgpuStreamWaitEvent(cu_stream, event); +} + +// ----------------------------- Memcpy (raw ABI) -------------------------- +// Host pointers are passed as uint64_t. This is the key of 2A. 
+ +extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_dptr); + auto src = asHostPtr(src_hptr); + mgpuMemcpyHtoD(dst, src, static_cast(nbytes)); +} + +extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_hptr); + auto src = asHostPtr(src_dptr); + mgpuMemcpyDtoH(dst, src, static_cast(nbytes)); +} + +// ----------------------------- Kernel launch ----------------------------- +// The hardest part is kernelParams (void**). +// We avoid building it in MLIR. Instead MLIR passes: +// - arg_data_ptr: host pointer to a packed buffer containing raw argument bytes +// - arg_sizes_ptr: host pointer to uint64_t[num_args], each is the byte-size of +// that argument The shim constructs kernelParams[i] = &arg_data[offset_i] with +// 8-byte alignment. This matches typical ABI expectations for scalar/pointer +// args. If you have special alignment requirements, extend this (e.g., per-arg +// alignment array). 
+ +extern "C" void cuda_shim_launch_packed( + uint64_t module_handle, uint64_t kernel_name_ptr, uint32_t gridX, + uint32_t gridY, uint32_t gridZ, uint32_t blockX, uint32_t blockY, + uint32_t blockZ, uint32_t sharedMemBytes, uint64_t stream, + uint64_t arg_data_ptr, uint64_t arg_sizes_ptr, uint32_t num_args) { + + auto mh = reinterpret_cast(static_cast(module_handle)); + if (!mh) { + fprintf(stderr, "[cuda_shim] launch_packed: invalid module handle\n"); + abort(); + } + + const char *kname = + reinterpret_cast(asHostCPtr(kernel_name_ptr)); + if (!kname) { + fprintf(stderr, "[cuda_shim] launch_packed: null kernel name\n"); + abort(); + } + + CUfunction fn = mgpuModuleGetFunction(mh, kname); + + auto *argData = reinterpret_cast(asHostPtr(arg_data_ptr)); + auto *argSizes = + reinterpret_cast(asHostCPtr(arg_sizes_ptr)); + + if (num_args > 0 && (!argData || !argSizes)) { + fprintf(stderr, "[cuda_shim] launch_packed: argData/argSizes null\n"); + abort(); + } + + // Build kernelParams array on heap (safe for large num_args). + std::vector params; + params.resize(num_args); + + uint64_t off = 0; + for (uint32_t i = 0; i < num_args; ++i) { + // 8-byte align each argument start (common safe default). 
+ off = alignUp(off, 8); + params[i] = argData + off; + off += argSizes[i]; + } + + auto cu_stream = asStream(stream); + + if (stream == 0) { + cu_stream = nullptr; + } + + mgpuLaunchKernel(fn, static_cast(gridX), + static_cast(gridY), static_cast(gridZ), + static_cast(blockX), static_cast(blockY), + static_cast(blockZ), + static_cast(sharedMemBytes), cu_stream, + params.data(), nullptr, static_cast(num_args)); +} + +// Convenience: 1D launch, shared=0, stream optional +extern "C" void +cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr, + uint32_t blockX, uint32_t blockY, uint32_t blockZ, + uint64_t stream, uint64_t arg_data_ptr, + uint64_t arg_sizes_ptr, uint32_t num_args) { + cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX, + blockY, blockZ, 0, stream, arg_data_ptr, + arg_sizes_ptr, num_args); +} + +// Optional: global sync (avoid in async pipeline; prefer event/stream sync) +extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); } + +// only for debugging +// extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) { +// auto *p = reinterpret_cast(static_cast(dptr)); +// for (uint32_t i = 0; i < n; ++i) { +// fprintf(stderr, "i=%u v=%f\n", i, p[i]); +// } +// } + +#endif diff --git a/mlir/cuda-tile/Toy/include/CMakeLists.txt b/mlir/cuda-tile/Toy/include/CMakeLists.txt new file mode 100644 index 0000000..37c89d0 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(toy) diff --git a/mlir/cuda-tile/Toy/include/toy/AST.h b/mlir/cuda-tile/Toy/include/toy/AST.h new file mode 100644 index 0000000..d2ba101 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/AST.h @@ -0,0 +1,246 @@ +//===- AST.h - Node definition for the Toy AST ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AST for the Toy language. It is optimized for +// simplicity, not efficiency. The AST forms a tree structure where each node +// references its children using std::unique_ptr<>. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_AST_H +#define TOY_AST_H + +#include "toy/Lexer.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include + +namespace toy { + +/// A variable type with shape information. +struct VarType { + std::vector shape; +}; + +/// Base class for all expression nodes. +class ExprAST { +public: + enum ExprASTKind { + Expr_VarDecl, + Expr_Return, + Expr_Num, + Expr_Literal, + Expr_Var, + Expr_BinOp, + Expr_Call, + Expr_Print, + }; + + ExprAST(ExprASTKind kind, Location location) + : kind(kind), location(std::move(location)) {} + virtual ~ExprAST() = default; + + ExprASTKind getKind() const { return kind; } + + const Location &loc() { return location; } + +private: + const ExprASTKind kind; + Location location; +}; + +/// A block-list of expressions. +using ExprASTList = std::vector>; + +/// Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double val; + +public: + NumberExprAST(Location loc, double val) + : ExprAST(Expr_Num, std::move(loc)), val(val) {} + + double getValue() { return val; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Num; } +}; + +/// Expression class for a literal value. 
+class LiteralExprAST : public ExprAST { + std::vector> values; + std::vector dims; + +public: + LiteralExprAST(Location loc, std::vector> values, + std::vector dims) + : ExprAST(Expr_Literal, std::move(loc)), values(std::move(values)), + dims(std::move(dims)) {} + + llvm::ArrayRef> getValues() { return values; } + llvm::ArrayRef getDims() { return dims; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Literal; } +}; + +/// Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string name; + +public: + VariableExprAST(Location loc, llvm::StringRef name) + : ExprAST(Expr_Var, std::move(loc)), name(name) {} + + llvm::StringRef getName() { return name; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Var; } +}; + +/// Expression class for defining a variable. +class VarDeclExprAST : public ExprAST { + std::string name; + VarType type; + std::unique_ptr initVal; + +public: + VarDeclExprAST(Location loc, llvm::StringRef name, VarType type, + std::unique_ptr initVal) + : ExprAST(Expr_VarDecl, std::move(loc)), name(name), + type(std::move(type)), initVal(std::move(initVal)) {} + + llvm::StringRef getName() { return name; } + ExprAST *getInitVal() { return initVal.get(); } + const VarType &getType() { return type; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_VarDecl; } +}; + +/// Expression class for a return operator. +class ReturnExprAST : public ExprAST { + std::optional> expr; + +public: + ReturnExprAST(Location loc, std::optional> expr) + : ExprAST(Expr_Return, std::move(loc)), expr(std::move(expr)) {} + + std::optional getExpr() { + if (expr.has_value()) + return expr->get(); + return std::nullopt; + } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Return; } +}; + +/// Expression class for a binary operator. 
+class BinaryExprAST : public ExprAST { + char op; + std::unique_ptr lhs, rhs; + +public: + char getOp() { return op; } + ExprAST *getLHS() { return lhs.get(); } + ExprAST *getRHS() { return rhs.get(); } + + BinaryExprAST(Location loc, char op, std::unique_ptr lhs, + std::unique_ptr rhs) + : ExprAST(Expr_BinOp, std::move(loc)), op(op), lhs(std::move(lhs)), + rhs(std::move(rhs)) {} + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_BinOp; } +}; + +/// Expression class for function calls. +class CallExprAST : public ExprAST { + std::string callee; + std::vector> args; + +public: + CallExprAST(Location loc, const std::string &callee, + std::vector> args) + : ExprAST(Expr_Call, std::move(loc)), callee(callee), + args(std::move(args)) {} + + llvm::StringRef getCallee() { return callee; } + llvm::ArrayRef> getArgs() { return args; } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Call; } +}; + +/// Expression class for builtin print calls. +class PrintExprAST : public ExprAST { + std::unique_ptr arg; + +public: + PrintExprAST(Location loc, std::unique_ptr arg) + : ExprAST(Expr_Print, std::move(loc)), arg(std::move(arg)) {} + + ExprAST *getArg() { return arg.get(); } + + /// LLVM style RTTI + static bool classof(const ExprAST *c) { return c->getKind() == Expr_Print; } +}; + +/// This class represents the "prototype" for a function, which captures its +/// name, and its argument names (thus implicitly the number of arguments the +/// function takes). 
+class PrototypeAST { + Location location; + std::string name; + std::vector> args; + +public: + PrototypeAST(Location location, const std::string &name, + std::vector> args) + : location(std::move(location)), name(name), args(std::move(args)) {} + + const Location &loc() { return location; } + llvm::StringRef getName() const { return name; } + llvm::ArrayRef> getArgs() { return args; } +}; + +/// This class represents a function definition itself. +class FunctionAST { + std::unique_ptr proto; + std::unique_ptr body; + +public: + FunctionAST(std::unique_ptr proto, + std::unique_ptr body) + : proto(std::move(proto)), body(std::move(body)) {} + PrototypeAST *getProto() { return proto.get(); } + ExprASTList *getBody() { return body.get(); } +}; + +/// This class represents a list of functions to be processed together +class ModuleAST { + std::vector functions; + +public: + ModuleAST(std::vector functions) + : functions(std::move(functions)) {} + + auto begin() { return functions.begin(); } + auto end() { return functions.end(); } +}; + +void dump(ModuleAST &); + +} // namespace toy + +#endif // TOY_AST_H diff --git a/mlir/cuda-tile/Toy/include/toy/CMakeLists.txt b/mlir/cuda-tile/Toy/include/toy/CMakeLists.txt new file mode 100644 index 0000000..58f7e8e --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/CMakeLists.txt @@ -0,0 +1,13 @@ +# Most dialects should use add_mlir_dialect(). See examples/standalone. +set(LLVM_TARGET_DEFINITIONS Ops.td) +mlir_tablegen(Ops.h.inc -gen-op-decls) +mlir_tablegen(Ops.cpp.inc -gen-op-defs) +mlir_tablegen(Dialect.h.inc -gen-dialect-decls) +mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) +add_public_tablegen_target(ToyCudaOpsIncGen) + +# Most dialects should use add_mlir_interfaces(). 
+set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) +mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) +mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +add_public_tablegen_target(ToyCudaShapeInferenceInterfaceIncGen) diff --git a/mlir/cuda-tile/Toy/include/toy/Dialect.h b/mlir/cuda-tile/Toy/include/toy/Dialect.h new file mode 100644 index 0000000..5db325e --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Dialect.h @@ -0,0 +1,36 @@ +//===- Dialect.h - Dialect definition for the Toy IR ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the IR Dialect for the Toy language. +// See docs/Tutorials/Toy/Ch-2.md for more information. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_DIALECT_H_ +#define MLIR_TUTORIAL_TOY_DIALECT_H_ + +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/CastInterfaces.h" +#include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "toy/ShapeInferenceInterface.h" + +/// Include the auto-generated header file containing the declaration of the toy +/// dialect. +#include "toy/Dialect.h.inc" + +/// Include the auto-generated header file containing the declarations of the +/// toy operations. 
+#define GET_OP_CLASSES +#include "toy/Ops.h.inc" + +#endif // MLIR_TUTORIAL_TOY_DIALECT_H_ diff --git a/mlir/cuda-tile/Toy/include/toy/Lexer.h b/mlir/cuda-tile/Toy/include/toy/Lexer.h new file mode 100644 index 0000000..22822cc --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Lexer.h @@ -0,0 +1,233 @@ +//===- Lexer.h - Lexer for the Toy language -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple Lexer for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_LEXER_H +#define TOY_LEXER_H + +#include "llvm/ADT/StringRef.h" + +#include +#include +#include + +namespace toy { + +/// Structure definition a location in a file. +struct Location { + std::shared_ptr file; ///< filename. + int line; ///< line number. + int col; ///< column number. +}; + +// List of Token returned by the lexer. +enum Token : int { + tok_semicolon = ';', + tok_parenthese_open = '(', + tok_parenthese_close = ')', + tok_bracket_open = '{', + tok_bracket_close = '}', + tok_sbracket_open = '[', + tok_sbracket_close = ']', + + tok_eof = -1, + + // commands + tok_return = -2, + tok_var = -3, + tok_def = -4, + + // primary + tok_identifier = -5, + tok_number = -6, +}; + +/// The Lexer is an abstract base class providing all the facilities that the +/// Parser expects. It goes through the stream one token at a time and keeps +/// track of the location in the file for debugging purpose. +/// It relies on a subclass to provide a `readNextLine()` method. The subclass +/// can proceed by reading the next line from the standard input or from a +/// memory mapped file. 
+class Lexer { +public: + /// Create a lexer for the given filename. The filename is kept only for + /// debugging purpose (attaching a location to a Token). + Lexer(std::string filename) + : lastLocation( + {std::make_shared(std::move(filename)), 0, 0}) {} + virtual ~Lexer() = default; + + /// Look at the current token in the stream. + Token getCurToken() { return curTok; } + + /// Move to the next token in the stream and return it. + Token getNextToken() { return curTok = getTok(); } + + /// Move to the next token in the stream, asserting on the current token + /// matching the expectation. + void consume(Token tok) { + assert(tok == curTok && "consume Token mismatch expectation"); + getNextToken(); + } + + /// Return the current identifier (prereq: getCurToken() == tok_identifier) + llvm::StringRef getId() { + assert(curTok == tok_identifier); + return identifierStr; + } + + /// Return the current number (prereq: getCurToken() == tok_number) + double getValue() { + assert(curTok == tok_number); + return numVal; + } + + /// Return the location for the beginning of the current token. + Location getLastLocation() { return lastLocation; } + + // Return the current line in the file. + int getLine() { return curLineNum; } + + // Return the current column in the file. + int getCol() { return curCol; } + +private: + /// Delegate to a derived class fetching the next line. Returns an empty + /// string to signal end of file (EOF). Lines are expected to always finish + /// with "\n" + virtual llvm::StringRef readNextLine() = 0; + + /// Return the next character from the stream. This manages the buffer for the + /// current line and request the next line buffer to the derived class as + /// needed. + int getNextChar() { + // The current line buffer should not be empty unless it is the end of file. 
+ if (curLineBuffer.empty()) + return EOF; + ++curCol; + auto nextchar = curLineBuffer.front(); + curLineBuffer = curLineBuffer.drop_front(); + if (curLineBuffer.empty()) + curLineBuffer = readNextLine(); + if (nextchar == '\n') { + ++curLineNum; + curCol = 0; + } + return nextchar; + } + + /// Return the next token from standard input. + Token getTok() { + // Skip any whitespace. + while (isspace(lastChar)) + lastChar = Token(getNextChar()); + + // Save the current location before reading the token characters. + lastLocation.line = curLineNum; + lastLocation.col = curCol; + + // Identifier: [a-zA-Z][a-zA-Z0-9_]* + if (isalpha(lastChar)) { + identifierStr = (char)lastChar; + while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') + identifierStr += (char)lastChar; + + if (identifierStr == "return") + return tok_return; + if (identifierStr == "def") + return tok_def; + if (identifierStr == "var") + return tok_var; + return tok_identifier; + } + + // Number: [0-9.]+ + if (isdigit(lastChar) || lastChar == '.') { + std::string numStr; + do { + numStr += lastChar; + lastChar = Token(getNextChar()); + } while (isdigit(lastChar) || lastChar == '.'); + + numVal = strtod(numStr.c_str(), nullptr); + return tok_number; + } + + if (lastChar == '#') { + // Comment until end of line. + do { + lastChar = Token(getNextChar()); + } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); + + if (lastChar != EOF) + return getTok(); + } + + // Check for end of file. Don't eat the EOF. + if (lastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + Token thisChar = Token(lastChar); + lastChar = Token(getNextChar()); + return thisChar; + } + + /// The last token read from the input. + Token curTok = tok_eof; + + /// Location for `curTok`. + Location lastLocation; + + /// If the current Token is an identifier, this string contains the value. 
+ std::string identifierStr; + + /// If the current Token is a number, this contains the value. + double numVal = 0; + + /// The last value returned by getNextChar(). We need to keep it around as we + /// always need to read ahead one character to decide when to end a token and + /// we can't put it back in the stream after reading from it. + Token lastChar = Token(' '); + + /// Keep track of the current line number in the input stream + int curLineNum = 0; + + /// Keep track of the current column number in the input stream + int curCol = 0; + + /// Buffer supplied by the derived class on calls to `readNextLine()` + llvm::StringRef curLineBuffer = "\n"; +}; + +/// A lexer implementation operating on a buffer in memory. +class LexerBuffer final : public Lexer { +public: + LexerBuffer(const char *begin, const char *end, std::string filename) + : Lexer(std::move(filename)), current(begin), end(end) {} + +private: + /// Provide one line at a time to the Lexer, return an empty string when + /// reaching the end of the buffer. + llvm::StringRef readNextLine() override { + auto *begin = current; + while (current <= end && *current && *current != '\n') + ++current; + if (current <= end && *current) + ++current; + llvm::StringRef result{begin, static_cast(current - begin)}; + return result; + } + const char *current, *end; +}; +} // namespace toy + +#endif // TOY_LEXER_H diff --git a/mlir/cuda-tile/Toy/include/toy/MLIRGen.h b/mlir/cuda-tile/Toy/include/toy/MLIRGen.h new file mode 100644 index 0000000..fe9dbe5 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/MLIRGen.h @@ -0,0 +1,35 @@ +//===- MLIRGen.h - MLIR Generation from a Toy AST -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a simple interface to perform IR generation targeting MLIR +// from a Module AST for the Toy language. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_MLIRGEN_H +#define TOY_MLIRGEN_H + +#include + +namespace mlir { +class MLIRContext; +template +class OwningOpRef; +class ModuleOp; +} // namespace mlir + +namespace toy { +class ModuleAST; + +/// Emit IR for the given Toy moduleAST, returns a newly created MLIR module +/// or nullptr on failure. +mlir::OwningOpRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST); +} // namespace toy + +#endif // TOY_MLIRGEN_H diff --git a/mlir/cuda-tile/Toy/include/toy/Ops.td b/mlir/cuda-tile/Toy/include/toy/Ops.td new file mode 100644 index 0000000..5aa524c --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Ops.td @@ -0,0 +1,498 @@ +//===- Ops.td - Toy dialect operation definitions ----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef TOY_OPS +#define TOY_OPS + +include "mlir/Interfaces/FunctionInterfaces.td" +include "mlir/IR/SymbolInterfaces.td" +include "mlir/Interfaces/CallInterfaces.td" +include "mlir/Interfaces/CastInterfaces.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "toy/ShapeInferenceInterface.td" + +def F32ElementsAttr : FloatElementsAttr<32>; + +// Provide a definition of the 'toy' dialect in the ODS framework so that we +// can define our operations. 
+def Toy_Dialect : Dialect { + let name = "toy"; + let cppNamespace = "::mlir::toy"; +} + +// Base class for toy dialect operations. This operation inherits from the base +// `Op` class in OpBase.td, and provides: +// * The parent dialect of the operation. +// * The mnemonic for the operation, or the name without the dialect prefix. +// * A list of traits for the operation. +class Toy_Op traits = []> : + Op; + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ConstantOp +//===----------------------------------------------------------------------===// + +// We define a toy operation by inheriting from our base 'Toy_Op' class above. +// Here we provide the mnemonic and a list of traits for the operation. The +// constant operation is marked as 'Pure' as it is a pure operation +// and may be removed if dead. +def ConstantOp : Toy_Op<"constant", [Pure]> { + // Provide a summary and description for this operation. This can be used to + // auto-generate documentation of the operations within our dialect. + let summary = "constant"; + let description = [{ + Constant operation turns a literal into an SSA value. The data is attached + to the operation as an attribute. For example: + + ```mlir + %0 = toy.constant dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> + : tensor<2x3xf32> + ``` + }]; + + // The constant operation takes an attribute as the only input. + let arguments = (ins F32ElementsAttr:$value); + + // The constant operation returns a single value of TensorType. + let results = (outs F32Tensor); + + // Indicate that the operation has a custom parser and printer method. + let hasCustomAssemblyFormat = 1; + + // Add custom build methods for the constant operation. These method populates + // the `state` that MLIR uses to create operations, i.e. 
these are used when + // using `ConstantOp::create(builder, ...)`. + let builders = [ + // Build a constant with a given constant tensor value. + OpBuilder<(ins "DenseElementsAttr":$value), [{ + build($_builder, $_state, value.getType(), value); + }]>, + + // Build a constant with a given constant floating-point value. + OpBuilder<(ins "float":$value)> + ]; + + // Indicate that additional verification for this operation is necessary. + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// AddOp +//===----------------------------------------------------------------------===// + +def AddOp : Toy_Op<"add", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "element-wise addition operation"; + let description = [{ + The "add" operation performs element-wise addition between two tensors. + The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F32Tensor:$lhs, F32Tensor:$rhs); + let results = (outs F32Tensor); + + // Indicate that the operation has a custom parser and printer method. + let hasCustomAssemblyFormat = 1; + + // Allow building an AddOp with from the two input operands. + let builders = [ + OpBuilder<(ins "Value":$lhs, "Value":$rhs)> + ]; +} + +//===----------------------------------------------------------------------===// +// CastOp +//===----------------------------------------------------------------------===// + +def CastOp : Toy_Op<"cast", [ + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + Pure, + SameOperandsAndResultShape + ]> { + let summary = "shape cast operation"; + let description = [{ + The "cast" operation converts a tensor from one type to an equivalent type + without changing any data elements. The source and destination types must + both be tensor types with the same element type. If both are ranked, then + shape is required to match. The operation is invalid if converting to a + mismatching constant dimension. 
+ }]; + + let arguments = (ins F32Tensor:$input); + let results = (outs F32Tensor:$output); + + let assemblyFormat = "$input attr-dict `:` type($input) `to` type($output)"; +} + +//===----------------------------------------------------------------------===// +// FuncOp +//===----------------------------------------------------------------------===// + +def FuncOp : Toy_Op<"func", [ + FunctionOpInterface, IsolatedFromAbove + ]> { + let summary = "user defined function operation"; + let description = [{ + The "toy.func" operation represents a user defined function. These are + callable SSA-region operations that contain toy computations. + + Example: + + ```mlir + toy.func @main() { + %0 = toy.constant dense<5.500000e+00> : tensor + %1 = toy.reshape(%0 : tensor) to tensor<2x2xf32> + toy.print %1 : tensor<2x2xf32> + toy.return + } + ``` + }]; + + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttrOf:$function_type, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + let regions = (region AnyRegion:$body); + + let builders = [OpBuilder<(ins + "StringRef":$name, "FunctionType":$type, + CArg<"ArrayRef", "{}">:$attrs) + >]; + let extraClassDeclaration = [{ + //===------------------------------------------------------------------===// + // FunctionOpInterface Methods + //===------------------------------------------------------------------===// + + /// Returns the argument types of this function. + ArrayRef getArgumentTypes() { return getFunctionType().getInputs(); } + + /// Returns the result types of this function. + ArrayRef getResultTypes() { return getFunctionType().getResults(); } + + /// Returns the region on the function operation that is callable. 
+ Region *getCallableRegion() { return &getBody(); } + }]; + let hasCustomAssemblyFormat = 1; + let skipDefaultBuilders = 1; +} + +//===----------------------------------------------------------------------===// +// GenericCallOp +//===----------------------------------------------------------------------===// + +def GenericCallOp : Toy_Op<"generic_call", + [DeclareOpInterfaceMethods]> { + let summary = "generic call operation"; + let description = [{ + Generic calls represent calls to a user defined function that needs to + be specialized for the shape of its arguments. The callee name is attached + as a symbol reference via an attribute. The arguments list must match the + arguments expected by the callee. For example: + + ```mlir + %4 = toy.generic_call @my_func(%1, %3) + : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<*xf32> + ``` + + This is only valid if a function named "my_func" exists and takes two + arguments. + }]; + + // The generic call operation takes a symbol reference attribute as the + // callee, and inputs for the call. + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$inputs, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + + // The generic call operation returns a single value of TensorType. + let results = (outs F32Tensor); + + // Specialize assembly printing and parsing using a declarative format. + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + + // Add custom build methods for the generic call operation. 
+ let builders = [ + OpBuilder<(ins "StringRef":$callee, "ArrayRef":$arguments)> + ]; +} + +//===----------------------------------------------------------------------===// +// MulOp +//===----------------------------------------------------------------------===// + +def MulOp : Toy_Op<"mul", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "element-wise multiplication operation"; + let description = [{ + The "mul" operation performs element-wise multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F32Tensor:$lhs, F32Tensor:$rhs); + let results = (outs F32Tensor); + + // Indicate that the operation has a custom parser and printer method. + let hasCustomAssemblyFormat = 1; + + // Allow building a MulOp with from the two input operands. + let builders = [ + OpBuilder<(ins "Value":$lhs, "Value":$rhs)> + ]; +} + +//===----------------------------------------------------------------------===// +// PrintOp +//===----------------------------------------------------------------------===// + +def PrintOp : Toy_Op<"print"> { + let summary = "print operation"; + let description = [{ + The "print" builtin operation prints a given input tensor, and produces + no results. + }]; + + // The print operation takes an input tensor to print. + // We also allow a F32MemRef to enable interop during partial lowering. + let arguments = (ins AnyTypeOf<[F32Tensor, F32MemRef]>:$input); + + let assemblyFormat = "$input attr-dict `:` type($input)"; +} + +//===----------------------------------------------------------------------===// +// ReshapeOp +//===----------------------------------------------------------------------===// + +def ReshapeOp : Toy_Op<"reshape", [Pure]> { + let summary = "tensor reshape operation"; + let description = [{ + Reshape operation is transforming its input tensor into a new tensor with + the same number of elements but different shapes. 
For example: + + ```mlir + %0 = toy.reshape (%arg1 : tensor<10xf32>) to tensor<5x2xf32> + ``` + }]; + + let arguments = (ins F32Tensor:$input); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; + + // We expect that the reshape operation returns a statically shaped tensor. + let results = (outs StaticShapeTensorOf<[F32]>); +} + +//===----------------------------------------------------------------------===// +// ReturnOp +//===----------------------------------------------------------------------===// + +def ReturnOp : Toy_Op<"return", [Pure, HasParent<"FuncOp, GPUFuncOp">, + Terminator]> { + let summary = "return operation"; + let description = [{ + The "return" operation represents a return operation within a function. + The operation takes an optional tensor operand and produces no results. + The operand type must match the signature of the function that contains + the operation. For example: + + ```mlir + toy.func @foo() -> tensor<2xf32> { + ... + toy.return %0 : tensor<2xf32> + } + ``` + }]; + + // The return operation takes an optional input operand to return. This + // value must match the return type of the enclosing function. + let arguments = (ins Variadic:$input); + + // The return operation only emits the input in the format if it is present. + let assemblyFormat = "($input^ `:` type($input))? attr-dict "; + + // Allow building a ReturnOp with no return operand. + let builders = [ + OpBuilder<(ins), [{ build($_builder, $_state, {}); }]> + ]; + + // Provide extra utility definitions on the c++ operation class definition. + let extraClassDeclaration = [{ + bool hasOperand() { return getNumOperands() != 0; } + }]; + + // Indicate that additional verification for this operation is necessary. 
+ let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +def TransposeOp : Toy_Op<"transpose", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "transpose operation"; + + let arguments = (ins F32Tensor:$input); + let results = (outs F32Tensor); + + let assemblyFormat = [{ + `(` $input `:` type($input) `)` attr-dict `to` type(results) + }]; + + // Enable registering canonicalization patterns with this operation. + let hasCanonicalizer = 1; + + // Allow building a TransposeOp with from the input operand. + let builders = [ + OpBuilder<(ins "Value":$input)> + ]; + + // Indicate that additional verification for this operation is necessary. + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// MatMul Op +//===----------------------------------------------------------------------===// + +def MatMulOp : Toy_Op<"matmul", + [Pure, DeclareOpInterfaceMethods, MemoryEffectsOpInterface]> { + let summary = "matrix multiplication operation"; + let description = [{ + The "matmul" operation performs Matrix multiplication between two + tensors. The shapes of the tensor operands are expected to match. + }]; + + let arguments = (ins F32Tensor:$lhs, F32Tensor:$rhs); + let results = (outs Res, + MemAlloc]>:$output); + + let assemblyFormat = [{ + `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results) + }]; + + // Allow building a MatMulOp with from the two input operands. 
+ let builders = [ + OpBuilder<(ins "Value":$lhs, "Value":$rhs)> + ]; + + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// lauch GPU Op +//===----------------------------------------------------------------------===// + +def LaunchGpuOp : Toy_Op<"launch_gpu", + [DeclareOpInterfaceMethods]> { + let summary = "launch gpu kernel operation"; + let description = [{ + The "launch_gpu" operation launches a GPU kernel with given grid + dimensions. + + ```mlir + %4 = toy.launch_gpu @my_func(%1, %3) {grid = [16, 16, 1]} + : (tensor<2x3xf32>, tensor<2x3xf32>) + ``` + + }]; + + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$inputs, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + + let results = (outs Variadic:$results); + + let assemblyFormat = [{ + $callee `(` $inputs `)` attr-dict `:` functional-type($inputs, results) + }]; + + let builders = [ + OpBuilder<(ins "StringRef":$callee, "ArrayRef":$arguments)> + ]; +} + +//===----------------------------------------------------------------------===// +// GPUFuncOp +//===----------------------------------------------------------------------===// + +def GPUFuncOp : Toy_Op<"gpu_func", [ + FunctionOpInterface, IsolatedFromAbove + ]> { + let summary = "GPU kernel function operation"; + let description = [{ + The "toy.gpu_func" operation represents a GPU kernel function. These are + callable SSA-region operations that contain toy computations to be run on + the GPU. + + Example: + + ```mlir + toy.gpu_func @my_kernel(tensor<*xf32> %arg0, tensor<*xf32> %arg1) { + ... 
+ toy.return + } + ``` + }]; + + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttrOf:$function_type, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs + ); + + let regions = (region AnyRegion:$body); + + let builders = [OpBuilder<(ins + "StringRef":$name, "FunctionType":$type, + CArg<"ArrayRef", "{}">:$attrs) + >]; + let extraClassDeclaration = [{ + //===------------------------------------------------------------------===// + // FunctionOpInterface Methods + //===------------------------------------------------------------------===// + + /// Returns the argument types of this function. + ArrayRef getArgumentTypes() { return getFunctionType().getInputs(); } + + /// Returns the result types of this function. + ArrayRef getResultTypes() { return getFunctionType().getResults(); } + + /// Returns the region on the function operation that is callable. + Region *getCallableRegion() { return &getBody(); } + }]; + let hasCustomAssemblyFormat = 1; + let skipDefaultBuilders = 1; +} + +#endif // TOY_OPS diff --git a/mlir/cuda-tile/Toy/include/toy/Parser.h b/mlir/cuda-tile/Toy/include/toy/Parser.h new file mode 100644 index 0000000..1f20616 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Parser.h @@ -0,0 +1,489 @@ +//===- Parser.h - Toy Language Parser -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the parser for the Toy language. It processes the Token +// provided by the Lexer and returns an AST. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TOY_PARSER_H +#define TOY_PARSER_H + +#include "toy/AST.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include + +namespace toy { + +/// This is a simple recursive parser for the Toy language. It produces a well +/// formed AST from a stream of Token supplied by the Lexer. No semantic checks +/// or symbol resolution is performed. For example, variables are referenced by +/// string and the code could reference an undeclared variable and the parsing +/// succeeds. +class Parser { +public: + /// Create a Parser for the supplied lexer. + Parser(Lexer &lexer) : lexer(lexer) {} + + /// Parse a full Module. A module is a list of function definitions. + std::unique_ptr parseModule() { + lexer.getNextToken(); // prime the lexer + + // Parse functions one at a time and accumulate in this vector. + std::vector functions; + while (auto f = parseDefinition()) { + functions.push_back(std::move(*f)); + if (lexer.getCurToken() == tok_eof) + break; + } + // If we didn't reach EOF, there was an error during parsing + if (lexer.getCurToken() != tok_eof) + return parseError("nothing", "at end of module"); + + return std::make_unique(std::move(functions)); + } + +private: + Lexer &lexer; + + /// Parse a return statement. + /// return :== return ; | return expr ; + std::unique_ptr parseReturn() { + auto loc = lexer.getLastLocation(); + lexer.consume(tok_return); + + // return takes an optional argument + std::optional> expr; + if (lexer.getCurToken() != ';') { + expr = parseExpression(); + if (!expr) + return nullptr; + } + return std::make_unique(std::move(loc), std::move(expr)); + } + + /// Parse a literal number. 
+ /// numberexpr ::= number + std::unique_ptr parseNumberExpr() { + auto loc = lexer.getLastLocation(); + auto result = + std::make_unique(std::move(loc), lexer.getValue()); + lexer.consume(tok_number); + return std::move(result); + } + + /// Parse a literal array expression. + /// tensorLiteral ::= [ literalList ] | number + /// literalList ::= tensorLiteral | tensorLiteral, literalList + std::unique_ptr parseTensorLiteralExpr() { + auto loc = lexer.getLastLocation(); + lexer.consume(Token('[')); + + // Hold the list of values at this nesting level. + std::vector> values; + // Hold the dimensions for all the nesting inside this level. + std::vector dims; + do { + // We can have either another nested array or a number literal. + if (lexer.getCurToken() == '[') { + values.push_back(parseTensorLiteralExpr()); + if (!values.back()) + return nullptr; // parse error in the nested array. + } else { + if (lexer.getCurToken() != tok_number) + return parseError(" or [", "in literal expression"); + values.push_back(parseNumberExpr()); + } + + // End of this list on ']' + if (lexer.getCurToken() == ']') + break; + + // Elements are separated by a comma. + if (lexer.getCurToken() != ',') + return parseError("] or ,", "in literal expression"); + + lexer.getNextToken(); // eat , + } while (true); + if (values.empty()) + return parseError("", "to fill literal expression"); + lexer.getNextToken(); // eat ] + + /// Fill in the dimensions now. First the current nesting level: + dims.push_back(values.size()); + + /// If there is any nested array, process all of them and ensure that + /// dimensions are uniform. 
+ if (llvm::any_of(values, [](std::unique_ptr &expr) { + return llvm::isa(expr.get()); + })) { + auto *firstLiteral = llvm::dyn_cast(values.front().get()); + if (!firstLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + + // Append the nested dimensions to the current level + auto firstDims = firstLiteral->getDims(); + dims.insert(dims.end(), firstDims.begin(), firstDims.end()); + + // Sanity check that shape is uniform across all elements of the list. + for (auto &expr : values) { + auto *exprLiteral = llvm::cast(expr.get()); + if (!exprLiteral) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + if (exprLiteral->getDims() != firstDims) + return parseError("uniform well-nested dimensions", + "inside literal expression"); + } + } + return std::make_unique(std::move(loc), std::move(values), + std::move(dims)); + } + + /// parenexpr ::= '(' expression ')' + std::unique_ptr parseParenExpr() { + lexer.getNextToken(); // eat (. + auto v = parseExpression(); + if (!v) + return nullptr; + + if (lexer.getCurToken() != ')') + return parseError(")", "to close expression with parentheses"); + lexer.consume(Token(')')); + return v; + } + + /// identifierexpr + /// ::= identifier + /// ::= identifier '(' expression ')' + std::unique_ptr parseIdentifierExpr() { + std::string name(lexer.getId()); + + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat identifier. + + if (lexer.getCurToken() != '(') // Simple variable ref. + return std::make_unique(std::move(loc), name); + + // This is a function call. 
+ lexer.consume(Token('(')); + std::vector> args; + if (lexer.getCurToken() != ')') { + while (true) { + if (auto arg = parseExpression()) + args.push_back(std::move(arg)); + else + return nullptr; + + if (lexer.getCurToken() == ')') + break; + + if (lexer.getCurToken() != ',') + return parseError(", or )", "in argument list"); + lexer.getNextToken(); + } + } + lexer.consume(Token(')')); + + // It can be a builtin call to print + if (name == "print") { + if (args.size() != 1) + return parseError("", "as argument to print()"); + + return std::make_unique(std::move(loc), std::move(args[0])); + } + + // Call to a user-defined function + return std::make_unique(std::move(loc), name, std::move(args)); + } + + /// primary + /// ::= identifierexpr + /// ::= numberexpr + /// ::= parenexpr + /// ::= tensorliteral + std::unique_ptr parsePrimary() { + switch (lexer.getCurToken()) { + default: + llvm::errs() << "unknown token '" << lexer.getCurToken() + << "' when expecting an expression\n"; + return nullptr; + case tok_identifier: + return parseIdentifierExpr(); + case tok_number: + return parseNumberExpr(); + case '(': + return parseParenExpr(); + case '[': + return parseTensorLiteralExpr(); + case ';': + return nullptr; + case '}': + return nullptr; + } + } + + /// Recursively parse the right hand side of a binary expression, the ExprPrec + /// argument indicates the precedence of the current binary operator. + /// + /// binoprhs ::= ('+' primary)* + std::unique_ptr parseBinOpRHS(int exprPrec, + std::unique_ptr lhs) { + // If this is a binop, find its precedence. + while (true) { + int tokPrec = getTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (tokPrec < exprPrec) + return lhs; + + // Okay, we know this is a binop. + int binOp = lexer.getCurToken(); + lexer.consume(Token(binOp)); + auto loc = lexer.getLastLocation(); + + // Parse the primary expression after the binary operator. 
+ auto rhs = parsePrimary(); + if (!rhs) + return parseError("expression", "to complete binary operator"); + + // If BinOp binds less tightly with rhs than the operator after rhs, let + // the pending operator take rhs as its lhs. + int nextPrec = getTokPrecedence(); + if (tokPrec < nextPrec) { + rhs = parseBinOpRHS(tokPrec + 1, std::move(rhs)); + if (!rhs) + return nullptr; + } + + // Merge lhs/RHS. + lhs = std::make_unique(std::move(loc), binOp, + std::move(lhs), std::move(rhs)); + } + } + + /// expression::= primary binop rhs + std::unique_ptr parseExpression() { + auto lhs = parsePrimary(); + if (!lhs) + return nullptr; + + return parseBinOpRHS(0, std::move(lhs)); + } + + /// type ::= < shape_list > + /// shape_list ::= num | num , shape_list + std::unique_ptr parseType() { + if (lexer.getCurToken() != '<') + return parseError("<", "to begin type"); + lexer.getNextToken(); // eat < + + auto type = std::make_unique(); + + while (lexer.getCurToken() == tok_number) { + type->shape.push_back(lexer.getValue()); + lexer.getNextToken(); + if (lexer.getCurToken() == ',') + lexer.getNextToken(); + } + + if (lexer.getCurToken() != '>') + return parseError(">", "to end type"); + lexer.getNextToken(); // eat > + return type; + } + + /// Parse a variable declaration, it starts with a `var` keyword followed by + /// and identifier and an optional type (shape specification) before the + /// initializer. 
+ /// decl ::= var identifier [ type ] = expr + std::unique_ptr parseDeclaration() { + if (lexer.getCurToken() != tok_var) + return parseError("var", "to begin declaration"); + auto loc = lexer.getLastLocation(); + lexer.getNextToken(); // eat var + + if (lexer.getCurToken() != tok_identifier) + return parseError("identified", + "after 'var' declaration"); + std::string id(lexer.getId()); + lexer.getNextToken(); // eat id + + std::unique_ptr type; // Type is optional, it can be inferred + if (lexer.getCurToken() == '<') { + type = parseType(); + if (!type) + return nullptr; + } + + if (!type) + type = std::make_unique(); + lexer.consume(Token('=')); + auto expr = parseExpression(); + return std::make_unique(std::move(loc), std::move(id), + std::move(*type), std::move(expr)); + } + + /// Parse a block: a list of expression separated by semicolons and wrapped in + /// curly braces. + /// + /// block ::= { expression_list } + /// expression_list ::= block_expr ; expression_list + /// block_expr ::= decl | "return" | expr + std::unique_ptr parseBlock() { + if (lexer.getCurToken() != '{') + return parseError("{", "to begin block"); + lexer.consume(Token('{')); + + auto exprList = std::make_unique(); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + + while (lexer.getCurToken() != '}' && lexer.getCurToken() != tok_eof) { + if (lexer.getCurToken() == tok_var) { + // Variable declaration + auto varDecl = parseDeclaration(); + if (!varDecl) + return nullptr; + exprList->push_back(std::move(varDecl)); + } else if (lexer.getCurToken() == tok_return) { + // Return statement + auto ret = parseReturn(); + if (!ret) + return nullptr; + exprList->push_back(std::move(ret)); + } else { + // General expression + auto expr = parseExpression(); + if (!expr) + return nullptr; + exprList->push_back(std::move(expr)); + } + // Ensure that elements are separated by a semicolon. 
+ if (lexer.getCurToken() != ';') + return parseError(";", "after expression"); + + // Ignore empty expressions: swallow sequences of semicolons. + while (lexer.getCurToken() == ';') + lexer.consume(Token(';')); + } + + if (lexer.getCurToken() != '}') + return parseError("}", "to close block"); + + lexer.consume(Token('}')); + return exprList; + } + + /// prototype ::= def id '(' decl_list ')' + /// decl_list ::= identifier | identifier, decl_list + std::unique_ptr parsePrototype() { + auto loc = lexer.getLastLocation(); + + if (lexer.getCurToken() != tok_def) + return parseError("def", "in prototype"); + lexer.consume(tok_def); + + if (lexer.getCurToken() != tok_identifier) + return parseError("function name", "in prototype"); + + std::string fnName(lexer.getId()); + lexer.consume(tok_identifier); + + if (lexer.getCurToken() != '(') + return parseError("(", "in prototype"); + lexer.consume(Token('(')); + + std::vector> args; + if (lexer.getCurToken() != ')') { + do { + std::string name(lexer.getId()); + auto loc = lexer.getLastLocation(); + lexer.consume(tok_identifier); + auto decl = std::make_unique(std::move(loc), name); + args.push_back(std::move(decl)); + if (lexer.getCurToken() != ',') + break; + lexer.consume(Token(',')); + if (lexer.getCurToken() != tok_identifier) + return parseError( + "identifier", "after ',' in function parameter list"); + } while (true); + } + if (lexer.getCurToken() != ')') + return parseError(")", "to end function prototype"); + + // success. + lexer.consume(Token(')')); + return std::make_unique(std::move(loc), fnName, + std::move(args)); + } + + /// Parse a function definition, we expect a prototype initiated with the + /// `def` keyword, followed by a block containing a list of expressions. 
+ /// + /// definition ::= prototype block + std::unique_ptr parseDefinition() { + auto proto = parsePrototype(); + if (!proto) + return nullptr; + + if (auto block = parseBlock()) + return std::make_unique(std::move(proto), std::move(block)); + return nullptr; + } + + /// Get the precedence of the pending binary operator token. + int getTokPrecedence() { + if (!isascii(lexer.getCurToken())) + return -1; + + // 1 is lowest precedence. + switch (static_cast(lexer.getCurToken())) { + case '-': + return 20; + case '+': + return 20; + case '*': + return 40; + default: + return -1; + } + } + + /// Helper function to signal errors while parsing, it takes an argument + /// indicating the expected token and another argument giving more context. + /// Location is retrieved from the lexer to enrich the error message. + template + std::unique_ptr parseError(T &&expected, U &&context = "") { + auto curToken = lexer.getCurToken(); + llvm::errs() << "Parse error (" << lexer.getLastLocation().line << ", " + << lexer.getLastLocation().col << "): expected '" << expected + << "' " << context << " but has Token " << curToken; + if (isprint(curToken)) + llvm::errs() << " '" << (char)curToken << "'"; + llvm::errs() << "\n"; + return nullptr; + } +}; + +} // namespace toy + +#endif // TOY_PARSER_H diff --git a/mlir/cuda-tile/Toy/include/toy/Passes.h b/mlir/cuda-tile/Toy/include/toy/Passes.h new file mode 100644 index 0000000..0b057c1 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/Passes.h @@ -0,0 +1,44 @@ +//===- Passes.h - Toy Passes Definition -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file exposes the entry points to create compiler passes for Toy. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TOY_PASSES_H +#define TOY_PASSES_H + +#include +#include + +namespace mlir { +class Pass; + +namespace toy { +std::unique_ptr createShapeInferencePass(); + +/// Create a pass for lowering to operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr createLowerToAffinePass(); + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. +std::unique_ptr createLowerToLLVMPass(); + +std::unique_ptr createGpuOutlinePass(std::string grid = "1,1,1"); + +std::unique_ptr createCudaTileLoweringPass(); + +std::unique_ptr +createEmbedCudaTileBinaryPass(std::string tileirasExe = "tileiras", + std::string gpuName = "sm_120"); + +} // namespace toy +} // namespace mlir + +#endif // TOY_PASSES_H diff --git a/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.h b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.h new file mode 100644 index 0000000..cfe5a87 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.h @@ -0,0 +1,28 @@ +//===- ShapeInferenceInterface.h - Interface definitions for ShapeInference -=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the shape inference interfaces defined +// in ShapeInferenceInterface.td. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ +#define MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace toy { + +/// Include the auto-generated declarations. +#include "toy/ShapeInferenceOpInterfaces.h.inc" + +} // namespace toy +} // namespace mlir + +#endif // MLIR_TUTORIAL_TOY_SHAPEINFERENCEINTERFACE_H_ diff --git a/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.td b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.td new file mode 100644 index 0000000..2279015 --- /dev/null +++ b/mlir/cuda-tile/Toy/include/toy/ShapeInferenceInterface.td @@ -0,0 +1,30 @@ +//===- ShapeInferenceInterface.td - Shape Inference Interface -*- tablegen -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the operations of the Shape Inference Op Interface. +// +//===----------------------------------------------------------------------===// + +#ifndef SHAPE_INFERENCE_INTERFACE +#define SHAPE_INFERENCE_INTERFACE + +include "mlir/IR/OpBase.td" + +def ShapeInferenceOpInterface : OpInterface<"ShapeInference"> { + let description = [{ + Interface to access a registered method to infer the return types for an + operation that can be used during type inference. 
+ }]; + + let methods = [ + InterfaceMethod<"Infer and set the output shape for the current operation.", + "void", "inferShapes"> + ]; +} + +#endif // SHAPE_INFERENCE_INTERFACE diff --git a/mlir/cuda-tile/Toy/mlir/Dialect.cpp b/mlir/cuda-tile/Toy/mlir/Dialect.cpp new file mode 100644 index 0000000..a1dca39 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/Dialect.cpp @@ -0,0 +1,572 @@ +//===- Dialect.cpp - Toy IR Dialect registration in MLIR ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the dialect for the Toy IR: custom type parsing and +// operation verification. +// +//===----------------------------------------------------------------------===// + +#include "toy/Dialect.h" + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include + +using namespace mlir; +using namespace mlir::toy; + +#include "toy/Dialect.cpp.inc" + +//===----------------------------------------------------------------------===// +// ToyInlinerInterface +//===----------------------------------------------------------------------===// + +/// This class defines the interface for handling inlining with Toy +/// operations. 
+struct ToyInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + /// All call operations within toy can be inlined. + bool isLegalToInline(Operation *call, Operation *callable, + bool wouldBeCloned) const final { + return true; + } + + /// All operations within toy can be inlined. + bool isLegalToInline(Operation *, Region *, bool, IRMapping &) const final { + return true; + } + + // All functions within toy can be inlined. + bool isLegalToInline(Region *, Region *, bool, IRMapping &) const final { + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + /// Handle the given inlined terminator(toy.return) by replacing it with a new + /// operation as necessary. + void handleTerminator(Operation *op, ValueRange valuesToRepl) const final { + // Only "toy.return" needs to be handled here. + auto returnOp = cast(op); + + // Replace the values directly with the return operands. + assert(returnOp.getNumOperands() == valuesToRepl.size()); + for (const auto &it : llvm::enumerate(returnOp.getOperands())) + valuesToRepl[it.index()].replaceAllUsesWith(it.value()); + } + + /// Attempts to materialize a conversion for a type mismatch between a call + /// from this dialect, and a callable region. This method should generate an + /// operation that takes 'input' as the only operand, and produces a single + /// result of 'resultType'. If a conversion can not be generated, nullptr + /// should be returned. 
+ Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type resultType, + Location conversionLoc) const final { + return CastOp::create(builder, conversionLoc, resultType, input); + } +}; + +//===----------------------------------------------------------------------===// +// ToyDialect +//===----------------------------------------------------------------------===// + +/// Dialect initialization, the instance will be owned by the context. This is +/// the point of registration of types and operations for the dialect. +void ToyDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "toy/Ops.cpp.inc" + >(); + addInterfaces(); +} + +//===----------------------------------------------------------------------===// +// Toy Operations +//===----------------------------------------------------------------------===// + +/// A generalized parser for binary operations. This parses the different forms +/// of 'printBinaryOp' below. +static mlir::ParseResult parseBinaryOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + SmallVector operands; + SMLoc operandsLoc = parser.getCurrentLocation(); + Type type; + if (parser.parseOperandList(operands, /*requiredOperandCount=*/2) || + parser.parseOptionalAttrDict(result.attributes) || + parser.parseColonType(type)) + return mlir::failure(); + + // If the type is a function type, it contains the input and result types of + // this operation. + if (FunctionType funcType = llvm::dyn_cast(type)) { + if (parser.resolveOperands(operands, funcType.getInputs(), operandsLoc, + result.operands)) + return mlir::failure(); + result.addTypes(funcType.getResults()); + return mlir::success(); + } + + // Otherwise, the parsed type is the type of both operands and results. + if (parser.resolveOperands(operands, type, result.operands)) + return mlir::failure(); + result.addTypes(type); + return mlir::success(); +} + +/// A generalized printer for binary operations. 
It prints in two different +/// forms depending on if all of the types match. +static void printBinaryOp(mlir::OpAsmPrinter &printer, mlir::Operation *op) { + printer << " " << op->getOperands(); + printer.printOptionalAttrDict(op->getAttrs()); + printer << " : "; + + // If all of the types are the same, print the type directly. + Type resultType = *op->result_type_begin(); + if (llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })) { + printer << resultType; + return; + } + + // Otherwise, print a functional type. + printer.printFunctionalType(op->getOperandTypes(), op->getResultTypes()); +} + +//===----------------------------------------------------------------------===// +// ConstantOp +//===----------------------------------------------------------------------===// + +/// Build a constant operation. +/// The builder is passed as an argument, so is the state that this method is +/// expected to fill in order to build the operation. +void ConstantOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + float value) { + auto dataType = RankedTensorType::get({}, builder.getF32Type()); + auto dataAttribute = DenseElementsAttr::get(dataType, value); + ConstantOp::build(builder, state, dataType, dataAttribute); +} + +/// The 'OpAsmParser' class provides a collection of methods for parsing +/// various punctuation, as well as attributes, operands, types, etc. Each of +/// these methods returns a `ParseResult`. This class is a wrapper around +/// `LogicalResult` that can be converted to a boolean `true` value on failure, +/// or `false` on success. This allows for easily chaining together a set of +/// parser rules. These rules are used to populate an `mlir::OperationState` +/// similarly to the `build` methods described above. 
+mlir::ParseResult ConstantOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + mlir::DenseElementsAttr value; + if (parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(value, "value", result.attributes)) + return failure(); + + result.addTypes(value.getType()); + return success(); +} + +/// The 'OpAsmPrinter' class is a stream that allows for formatting +/// strings, attributes, operands, types, etc. +void ConstantOp::print(mlir::OpAsmPrinter &printer) { + printer << " "; + printer.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{"value"}); + printer << getValue(); +} + +/// Verifier for the constant operation. This corresponds to the +/// `let hasVerifier = 1` in the op definition. +llvm::LogicalResult ConstantOp::verify() { + // If the return type of the constant is not an unranked tensor, the shape + // must match the shape of the attribute holding the data. + auto resultType = + llvm::dyn_cast(getResult().getType()); + if (!resultType) + return success(); + + // Check that the rank of the attribute type matches the rank of the constant + // result type. + auto attrType = llvm::cast(getValue().getType()); + if (attrType.getRank() != resultType.getRank()) { + return emitOpError("return type must match the one of the attached value " + "attribute: ") + << attrType.getRank() << " != " << resultType.getRank(); + } + + // Check that each of the dimensions match between the two types. 
+ for (int dim = 0, dimE = attrType.getRank(); dim < dimE; ++dim) { + if (attrType.getShape()[dim] != resultType.getShape()[dim]) { + return emitOpError( + "return type shape mismatches its attribute at dimension ") + << dim << ": " << attrType.getShape()[dim] + << " != " << resultType.getShape()[dim]; + } + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// AddOp +//===----------------------------------------------------------------------===// + +void AddOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands({lhs, rhs}); +} + +mlir::ParseResult AddOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + return parseBinaryOp(parser, result); +} + +void AddOp::print(mlir::OpAsmPrinter &p) { printBinaryOp(p, *this); } + +/// Infer the output shape of the AddOp, this is required by the shape inference +/// interface. +void AddOp::inferShapes() { getResult().setType(getLhs().getType()); } + +//===----------------------------------------------------------------------===// +// CastOp +//===----------------------------------------------------------------------===// + +/// Infer the output shape of the CastOp, this is required by the shape +/// inference interface. +void CastOp::inferShapes() { getResult().setType(getInput().getType()); } + +/// Returns true if the given set of input and result types are compatible with +/// this cast operation. This is required by the `CastOpInterface` to verify +/// this operation and provide other additional utilities. +bool CastOp::areCastCompatible(TypeRange inputs, TypeRange outputs) { + if (inputs.size() != 1 || outputs.size() != 1) + return false; + // The inputs must be Tensors with the same element type. 
+ TensorType input = llvm::dyn_cast(inputs.front()); + TensorType output = llvm::dyn_cast(outputs.front()); + if (!input || !output || input.getElementType() != output.getElementType()) + return false; + // The shape is required to match if both types are ranked. + return !input.hasRank() || !output.hasRank() || input == output; +} + +//===----------------------------------------------------------------------===// +// FuncOp +//===----------------------------------------------------------------------===// + +void FuncOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + llvm::StringRef name, mlir::FunctionType type, + llvm::ArrayRef attrs) { + // FunctionOpInterface provides a convenient `build` method that will populate + // the state of our FuncOp, and create an entry block. + buildWithEntryBlock(builder, state, name, type, attrs, type.getInputs()); +} + +mlir::ParseResult FuncOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + // Dispatch to the FunctionOpInterface provided utility method that parses the + // function operation. + auto buildFuncType = + [](mlir::Builder &builder, llvm::ArrayRef argTypes, + llvm::ArrayRef results, + mlir::function_interface_impl::VariadicFlag, + std::string &) { return builder.getFunctionType(argTypes, results); }; + + return mlir::function_interface_impl::parseFunctionOp( + parser, result, /*allowVariadic=*/false, + getFunctionTypeAttrName(result.name), buildFuncType, + getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); +} + +void FuncOp::print(mlir::OpAsmPrinter &p) { + // Dispatch to the FunctionOpInterface provided utility method that prints the + // function operation. 
+ mlir::function_interface_impl::printFunctionOp( + p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(), + getArgAttrsAttrName(), getResAttrsAttrName()); +} + +//===----------------------------------------------------------------------===// +// GenericCallOp +//===----------------------------------------------------------------------===// + +void GenericCallOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands(arguments); + state.addAttribute("callee", + mlir::SymbolRefAttr::get(builder.getContext(), callee)); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. +CallInterfaceCallable GenericCallOp::getCallableForCallee() { + return (*this)->getAttrOfType("callee"); +} + +/// Set the callee for the generic call operation, this is required by the call +/// interface. +void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) { + (*this)->setAttr("callee", cast(callee)); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. +Operation::operand_range GenericCallOp::getArgOperands() { return getInputs(); } + +/// Get the argument operands to the called function as a mutable range, this is +/// required by the call interface. 
+MutableOperandRange GenericCallOp::getArgOperandsMutable() { + return getInputsMutable(); +} + +//===----------------------------------------------------------------------===// +// MulOp +//===----------------------------------------------------------------------===// + +void MulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands({lhs, rhs}); +} + +mlir::ParseResult MulOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + return parseBinaryOp(parser, result); +} + +void MulOp::print(mlir::OpAsmPrinter &p) { printBinaryOp(p, *this); } + +/// Infer the output shape of the MulOp, this is required by the shape inference +/// interface. +void MulOp::inferShapes() { getResult().setType(getLhs().getType()); } + +//===----------------------------------------------------------------------===// +// ReturnOp +//===----------------------------------------------------------------------===// + +llvm::LogicalResult ReturnOp::verify() { + // Parent can be FuncOp or GPUFuncOp; both implement FunctionOpInterface. + auto *parent = (*this)->getParentOp(); + auto function = dyn_cast(parent); + if (!function) + return emitOpError() << "must be enclosed in a function-like op"; + + /// ReturnOps can only have a single optional operand. + if (getNumOperands() > 1) + return emitOpError() << "expects at most 1 return operand"; + + // The operand number and types must match the function signature. + auto funcType = llvm::cast(function.getFunctionType()); + const auto &results = funcType.getResults(); + if (getNumOperands() != results.size()) + return emitOpError() << "does not return the same number of values (" + << getNumOperands() << ") as the enclosing function (" + << results.size() << ")"; + + // If the operation does not have an input, we are done. 
+ if (!hasOperand()) + return mlir::success(); + + auto inputType = *operand_type_begin(); + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. + if (inputType == resultType || + llvm::isa(inputType) || + llvm::isa(resultType)) + return mlir::success(); + + return emitError() << "type of return operand (" << inputType + << ") doesn't match function result type (" << resultType + << ")"; +} + +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +void TransposeOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value value) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands(value); +} + +void TransposeOp::inferShapes() { + auto arrayTy = llvm::cast(getOperand().getType()); + SmallVector dims(llvm::reverse(arrayTy.getShape())); + getResult().setType(RankedTensorType::get(dims, arrayTy.getElementType())); +} + +llvm::LogicalResult TransposeOp::verify() { + auto inputType = llvm::dyn_cast(getOperand().getType()); + auto resultType = llvm::dyn_cast(getType()); + if (!inputType || !resultType) + return mlir::success(); + + auto inputShape = inputType.getShape(); + if (!std::equal(inputShape.begin(), inputShape.end(), + resultType.getShape().rbegin())) { + return emitError() + << "expected result shape to be a transpose of the input"; + } + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// MatMulOp +//===----------------------------------------------------------------------===// + +void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + mlir::Value lhs, mlir::Value rhs) { + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands({lhs, rhs}); +} + +/// Infer the output shape of the MatMulOp, this is required by the shape 
+/// inference interface. +void MatMulOp::inferShapes() { + RankedTensorType lhsType = + llvm::dyn_cast(getLhs().getType()); + RankedTensorType rhsType = + llvm::dyn_cast(getRhs().getType()); + auto lhsShape = lhsType.getShape(); + auto rhsShape = rhsType.getShape(); + RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]}, + lhsType.getElementType()); + getResult().setType(res_type); +} + +llvm::LogicalResult MatMulOp::verify() { + auto lhsType = llvm::dyn_cast(getLhs().getType()); + auto rhsType = llvm::dyn_cast(getRhs().getType()); + auto resultType = llvm::dyn_cast(getType()); + + if (!lhsType || !rhsType || !resultType) + return mlir::success(); + + auto lhsShape = lhsType.getShape(); + auto rhsShape = rhsType.getShape(); + + if (lhsShape.size() != 2 || rhsShape.size() != 2) { + return emitOpError() << "expected 2D matrix"; + } + + if (lhsShape[1] != rhsShape[0]) { + return emitOpError() << "expected dimension to match" + << "the shape of lhs is [" << lhsShape[0] << ", " + << lhsShape[1] << "] " + << "the shape of rhs is [" << rhsShape[0] << ", " + << rhsShape[1] << "] " + << "but the dimension " << lhsShape[1] + << "!=" << rhsShape[0] << '\n'; + } + + return mlir::success(); +} + +//===----------------------------------------------------------------------===// +// LaunchGpuOp +//===----------------------------------------------------------------------===// + +void LaunchGpuOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + StringRef callee, ArrayRef arguments) { + // Generic call always returns an unranked Tensor initially. + state.addTypes(UnrankedTensorType::get(builder.getF32Type())); + state.addOperands(arguments); + state.addAttribute("callee", + mlir::SymbolRefAttr::get(builder.getContext(), callee)); + state.addAttribute("grid", builder.getI64ArrayAttr({1, 1, 1})); +} + +/// Return the callee of the generic call operation, this is required by the +/// call interface. 
+CallInterfaceCallable LaunchGpuOp::getCallableForCallee() { + return (*this)->getAttrOfType("callee"); +} + +/// Set the callee for the generic call operation, this is required by the call +/// interface. +void LaunchGpuOp::setCalleeFromCallable(CallInterfaceCallable callee) { + (*this)->setAttr("callee", cast(callee)); +} + +/// Get the argument operands to the called function, this is required by the +/// call interface. +Operation::operand_range LaunchGpuOp::getArgOperands() { return getInputs(); } + +/// Get the argument operands to the called function as a mutable range, this is +/// required by the call interface. +MutableOperandRange LaunchGpuOp::getArgOperandsMutable() { + return getInputsMutable(); +} + +//===----------------------------------------------------------------------===// +// GPUFuncOp +//===----------------------------------------------------------------------===// + +void GPUFuncOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, + llvm::StringRef name, mlir::FunctionType type, + llvm::ArrayRef attrs) { + // FunctionOpInterface provides a convenient `build` method that will populate + // the state of our GPUFuncOp, and create an entry block. + buildWithEntryBlock(builder, state, name, type, attrs, type.getInputs()); +} + +mlir::ParseResult GPUFuncOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + // Dispatch to the FunctionOpInterface provided utility method that parses the + // function operation. 
+ auto buildFuncType = + [](mlir::Builder &builder, llvm::ArrayRef argTypes, + llvm::ArrayRef results, + mlir::function_interface_impl::VariadicFlag, + std::string &) { return builder.getFunctionType(argTypes, results); }; + + return mlir::function_interface_impl::parseFunctionOp( + parser, result, /*allowVariadic=*/false, + getFunctionTypeAttrName(result.name), buildFuncType, + getArgAttrsAttrName(result.name), getResAttrsAttrName(result.name)); +} + +void GPUFuncOp::print(mlir::OpAsmPrinter &p) { + // Dispatch to the FunctionOpInterface provided utility method that prints the + // function operation. + mlir::function_interface_impl::printFunctionOp( + p, *this, /*isVariadic=*/false, getFunctionTypeAttrName(), + getArgAttrsAttrName(), getResAttrsAttrName()); +} + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "toy/Ops.cpp.inc" diff --git a/mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp b/mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp new file mode 100644 index 0000000..2baf7a0 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/EmitCudaTile.cpp @@ -0,0 +1,196 @@ +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Pass/Pass.h" + +#include "cuda_tile/Bytecode/Writer/BytecodeWriter.h" +#include "cuda_tile/Dialect/CudaTile/IR/Ops.h" +#include "toy/Dialect.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; +using namespace mlir; + +namespace { + +/// Read file contents as raw bytes. 
+static FailureOr> readFileBytes(StringRef path) { + auto bufOrErr = MemoryBuffer::getFile(path, /*IsText=*/false); + if (!bufOrErr) + return failure(); + auto &buf = *bufOrErr.get(); + std::vector out(buf.getBufferSize()); + memcpy(out.data(), buf.getBufferStart(), buf.getBufferSize()); + return out; +} + +/// Write raw bytes to a file. +static LogicalResult writeFileBytes(StringRef path, ArrayRef bytes) { + std::error_code ec; + raw_fd_ostream os(path, ec, sys::fs::OF_None); + if (ec) + return failure(); + os.write(bytes.data(), bytes.size()); + os.flush(); + return success(); +} + +/// Execute external tileiras to assemble tilebc into a binary. +static LogicalResult runTileIRAS(Operation *anchor, StringRef tileirasExe, + StringRef gpuName, StringRef inTilebc, + StringRef outBin) { + SmallVector args; + args.push_back(tileirasExe); + args.push_back("--gpu-name"); + args.push_back(gpuName); + args.push_back(inTilebc); + args.push_back("-o"); + args.push_back(outBin); + + std::string errMsg; + int rc = sys::ExecuteAndWait(tileirasExe, args, + /*env=*/std::nullopt, + /*redirects=*/{}, + /*secondsToWait=*/0, + /*memoryLimit=*/0, &errMsg); + if (rc != 0) { + return anchor->emitError() << "tileiras failed, rc=" << rc << "\n" + << errMsg; + } + return success(); +} + +std::error_code createTemporaryFile(SmallVectorImpl &inPath, + StringRef prefix, StringRef suffix) { + int inFD = -1; + if (std::error_code ec = + sys::fs::createTemporaryFile(prefix, suffix, inFD, inPath)) { + return ec; + } + + if (std::error_code ec = sys::fs::closeFile(inFD)) { + return ec; + } + return std::error_code(); +} + +struct EmbedCudaTileBinaryPass + : public PassWrapper> { + + std::string tileirasExe; + std::string gpuName; + + EmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName) + : tileirasExe(std::move(tileirasExe)), gpuName(std::move(gpuName)) {} + + void runOnOperation() override { + ModuleOp top = getOperation(); + MLIRContext *ctx = top.getContext(); + + 
SmallString<256> cudaBinPath; + + top.walk([&](Operation *op) { + // we assume the MLIR only have one cuda tile module. + if (op->getName().getStringRef() != "cuda_tile.module") + return; + + auto cudaMod = dyn_cast(op); + if (!cudaMod) + return; + + // ---- Step B: generate tilebc bytes in-process ---- + SmallVector tilebcBytes; + raw_svector_ostream tilebcOS(tilebcBytes); + + // Using writeBytecode API: writeBytecode(output, moduleOp, + // BytecodeVersion::kCurrentVersion) + if (failed(writeBytecode(tilebcOS, cudaMod, + cuda_tile::BytecodeVersion::kCurrentVersion))) { + op->emitError() << "writeBytecode(tilebc) failed"; + signalPassFailure(); + return; + } + + // ---- Step C: create temp files and invoke tileiras ---- + SmallString<256> inPath; + + if (std::error_code ec = + createTemporaryFile(inPath, "cuda_tile", "tilebc")) { + op->emitError() << "failed to create temp in tilebc: " << ec.message(); + signalPassFailure(); + return; + } + + if (std::error_code ec = + createTemporaryFile(cudaBinPath, "cuda_tile", "bin")) { + op->emitError() << "failed to create temp out bin: " << ec.message(); + signalPassFailure(); + return; + } + + if (failed(writeFileBytes(inPath, tilebcBytes))) { + op->emitError() << "failed to write temp tilebc"; + signalPassFailure(); + return; + } + + if (failed(runTileIRAS(op, tileirasExe, gpuName, inPath, cudaBinPath))) { + signalPassFailure(); + return; + } + }); + + top->walk([&](toy::LaunchGpuOp launchOp) { + // ---- Step D: read cuda binary bytes ---- + auto binBytesOrErr = readFileBytes(cudaBinPath); + if (failed(binBytesOrErr)) { + launchOp.emitError() << "failed to read cuda binary file"; + signalPassFailure(); + return; + } + auto binBytes = *binBytesOrErr; + + // ---- Step E: embed binary as LaunchGpuOp attributes ---- + llvm::SmallVector binU8Bytes; + binU8Bytes.reserve(binBytes.size()); + for (auto b : binBytes) + binU8Bytes.push_back(static_cast(b)); + + auto byteAttr = mlir::DenseIntElementsAttr::get( + 
mlir::RankedTensorType::get({static_cast(binU8Bytes.size())}, + mlir::IntegerType::get(ctx, 8)), + binU8Bytes); + + // launchOp->setAttr("cuda_binary", byteAttr); + launchOp->setAttr("cuda_binary_size", + mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 64), + binU8Bytes.size())); + launchOp->setAttr("cuda_binary_path", + mlir::StringAttr::get(ctx, cudaBinPath.str())); + launchOp->setAttr("cuda_arch", mlir::StringAttr::get(ctx, gpuName)); + }); + + // ---- Step F: Delete the cuda_tile.module ops ---- + llvm::SmallVector toErase; + top->walk([&](cuda_tile::ModuleOp op) { toErase.push_back(op); }); + + for (auto op : toErase) { + op->erase(); + } + }; +}; +} // namespace + +namespace mlir::toy { + +std::unique_ptr +createEmbedCudaTileBinaryPass(std::string tileirasExe, std::string gpuName) { + return std::make_unique(tileirasExe, gpuName); +}; + +}; // namespace mlir::toy diff --git a/mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp b/mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp new file mode 100644 index 0000000..3fc59c0 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToAffineLoops.cpp @@ -0,0 +1,459 @@ +//====- LowerToAffineLoops.cpp - Partial lowering from Toy to Affine+Std --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a partial lowering of Toy operations to a combination of +// affine loops, memref operations and standard operations. This lowering +// expects that all calls have been inlined, and all shapes have been resolved. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns +//===----------------------------------------------------------------------===// + +/// Convert the given RankedTensorType into the corresponding MemRefType. +static MemRefType convertTensorToMemRef(RankedTensorType type) { + return MemRefType::get(type.getShape(), type.getElementType()); +} + +/// Insert an allocation and deallocation for the given MemRefType. +static Value insertAllocAndDealloc(MemRefType type, Location loc, + PatternRewriter &rewriter) { + auto alloc = memref::AllocOp::create(rewriter, loc, type); + + // Make sure to allocate at the beginning of the block. + auto *parentBlock = alloc->getBlock(); + alloc->moveBefore(&parentBlock->front()); + + // Make sure to deallocate this alloc at the end of the block. This is fine + // as toy functions have no control flow. 
+ auto dealloc = memref::DeallocOp::create(rewriter, loc, alloc); + dealloc->moveBefore(&parentBlock->back()); + return alloc; +} + +/// This defines the function type used to process an iteration of a lowered +/// loop. It takes as input an OpBuilder and the range of loop induction +/// variables for the iteration. It returns a value to store at the current +/// index of the iteration. +using LoopIterationFn = + function_ref; + +static void lowerOpToLoops(Operation *op, PatternRewriter &rewriter, + LoopIterationFn processIteration) { + auto tensorType = llvm::cast((*op->result_type_begin())); + auto loc = op->getLoc(); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // Create a nest of affine loops, with one loop per dimension of the shape. + // The buildAffineLoopNest function takes a callback that is used to construct + // the body of the innermost loop given a builder, a location and a range of + // loop induction variables. + SmallVector lowerBounds(tensorType.getRank(), /*Value=*/0); + SmallVector steps(tensorType.getRank(), /*Value=*/1); + affine::buildAffineLoopNest( + rewriter, loc, lowerBounds, tensorType.getShape(), steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + // Call the processing function with the rewriter and the loop + // induction variables. This function will return the value to store at + // the current index. + Value valueToStore = processIteration(nestedBuilder, ivs); + affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc, + ivs); + }); + + // Replace this operation with the generated alloc. 
+ rewriter.replaceOp(op, alloc); +} + +namespace { +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + using OpAdaptor = typename OpConversionPattern::OpAdaptor; + + LogicalResult + matchAndRewrite(BinaryOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops(op, rewriter, [&](OpBuilder &builder, ValueRange loopIvs) { + // Generate loads for the element of 'lhs' and 'rhs' at the + // inner loop. + auto loadedLhs = + affine::AffineLoadOp::create(builder, loc, adaptor.getLhs(), loopIvs); + auto loadedRhs = + affine::AffineLoadOp::create(builder, loc, adaptor.getRhs(), loopIvs); + + // Create the binary operation performed on the loaded + // values. + return LoweredBinaryOp::create(builder, loc, loadedLhs, loadedRhs); + }); + return success(); + } +}; +using AddOpLowering = BinaryOpLowering; +using MulOpLowering = BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Constant operations +//===----------------------------------------------------------------------===// + +struct ConstantOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::ConstantOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + DenseElementsAttr constantValue = op.getValue(); + Location loc = op.getLoc(); + + // When lowering the constant operation, we allocate and assign the constant + // values to a corresponding memref allocation. 
+ auto tensorType = llvm::cast(op.getType()); + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + // We will be generating constant indices up-to the largest dimension. + // Create these constants up-front to avoid large amounts of redundant + // operations. + auto valueShape = memRefType.getShape(); + SmallVector constantIndices; + + if (!valueShape.empty()) { + for (auto i : llvm::seq(0, *llvm::max_element(valueShape))) + constantIndices.push_back( + arith::ConstantIndexOp::create(rewriter, loc, i)); + } else { + // This is the case of a tensor of rank 0. + constantIndices.push_back( + arith::ConstantIndexOp::create(rewriter, loc, 0)); + } + + // The constant operation represents a multi-dimensional constant, so we + // will need to generate a store for each of the elements. The following + // functor recursively walks the dimensions of the constant shape, + // generating a store when the recursion hits the base case. + SmallVector indices; + auto valueIt = constantValue.value_begin(); + std::function storeElements = [&](uint64_t dimension) { + // The last dimension is the base case of the recursion, at this point + // we store the element at the given index. + if (dimension == valueShape.size()) { + affine::AffineStoreOp::create( + rewriter, loc, arith::ConstantOp::create(rewriter, loc, *valueIt++), + alloc, llvm::ArrayRef(indices)); + return; + } + + // Otherwise, iterate over the current dimension and add the indices to + // the list. + for (uint64_t i = 0, e = valueShape[dimension]; i != e; ++i) { + indices.push_back(constantIndices[i]); + storeElements(dimension + 1); + indices.pop_back(); + } + }; + + // Start the element storing recursion from the first dimension. + storeElements(/*dimension=*/0); + + // Replace this operation with the generated alloc. 
+ rewriter.replaceOp(op, alloc); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Func operations +//===----------------------------------------------------------------------===// + +struct FuncOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::FuncOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // We only lower the main function as we expect that all other functions + // have been inlined. + if (op.getName() != "main") + return failure(); + + // Verify that the given main has no inputs and results. + if (op.getNumArguments() || op.getFunctionType().getNumResults()) { + return rewriter.notifyMatchFailure(op, [](Diagnostic &diag) { + diag << "expected 'main' to have 0 inputs and 0 results"; + }); + } + + // Create a new non-toy function, with the same region. + auto func = mlir::func::FuncOp::create(rewriter, op.getLoc(), op.getName(), + op.getFunctionType()); + rewriter.inlineRegionBefore(op.getRegion(), func.getBody(), func.end()); + rewriter.eraseOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Print operations +//===----------------------------------------------------------------------===// + +struct PrintOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::PrintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // We don't lower "toy.print" in this pass, but we need to update its + // operands. 
+ rewriter.modifyOpInPlace(op, + [&] { op->setOperands(adaptor.getOperands()); }); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::ReturnOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + // During this lowering, we expect that all function calls have been + // inlined. + if (op.hasOperand()) + return failure(); + + // We lower "toy.return" directly to "func.return". + rewriter.replaceOpWithNewOp(op); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine Conversion Patterns: Transpose operations +//===----------------------------------------------------------------------===// + +struct TransposeOpLowering : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::TransposeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + lowerOpToLoops(op, rewriter, [&](OpBuilder &builder, ValueRange loopIvs) { + Value input = adaptor.getInput(); + + // Transpose the elements by generating a load from the + // reverse indices. 
+ SmallVector reverseIvs(llvm::reverse(loopIvs)); + return affine::AffineLoadOp::create(builder, loc, input, reverseIvs); + }); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToAffine RewritePatterns: MatMul operations +//===----------------------------------------------------------------------===// + +struct MatMulOpLowering : public ConversionPattern { + MatMulOpLowering(MLIRContext *ctx) + : ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + + RankedTensorType lhsType = + llvm::dyn_cast(op->getOperand(0).getType()); + RankedTensorType rhsType = + llvm::dyn_cast(op->getOperand(1).getType()); + auto lhsShape = lhsType.getShape(); + auto rhsShape = rhsType.getShape(); + + auto tensorType = + llvm::dyn_cast((*op->result_type_begin())); + + auto elemType = llvm::dyn_cast(tensorType.getElementType()); + + // Insert an allocation and deallocation for the result of this operation. + auto memRefType = convertTensorToMemRef(tensorType); + auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); + + SmallVector lowerBounds(tensorType.getRank() + 1, /*Value=*/0); + SmallVector steps(tensorType.getRank() + 1, /*Value=*/1); + SmallVector upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]}; + + // add initialization of result tensor. + // Create a nest of affine loops to initialize the result tensor to 0. + affine::buildAffineLoopNest( + rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1}, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + // Create a constant float value of 0.0. + auto valueToStore = arith::ConstantFloatOp::create( + nestedBuilder, loc, elemType, + llvm::APFloat::getZero(elemType.getFloatSemantics())); + + // Store the constant value into the allocated memory. 
+ affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc, + ivs); + }); + + // Create a nest of affine loops for matrix multiplication. + affine::buildAffineLoopNest( + rewriter, loc, lowerBounds, upperBounds, steps, + [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { + // Extract loop induction variables. + Value m = ivs[0]; + Value k = ivs[1]; + Value n = ivs[2]; + + // Create an adaptor for the remapped operands of the MatMulOp. + toy::MatMulOpAdaptor matmulAdaptor(operands); + + // Load elements from the left-hand side and right-hand side matrices. + auto loadedLhs = affine::AffineLoadOp::create( + nestedBuilder, loc, matmulAdaptor.getLhs(), ValueRange{m, k}); + + auto loadedRhs = affine::AffineLoadOp::create( + nestedBuilder, loc, matmulAdaptor.getRhs(), ValueRange{k, n}); + // Load elements from the result tensor from initial process above. + auto loadedRes = affine::AffineLoadOp::create( + nestedBuilder, loc, alloc, ValueRange{m, n}); + + // Perform the multiplication and addition operations. + auto mulop = + arith::MulFOp::create(nestedBuilder, loc, loadedLhs, loadedRhs); + auto valueToStore = + arith::AddFOp::create(nestedBuilder, loc, loadedRes, mulop); + + // Store the result back into the allocated memory. + affine::AffineStoreOp::create(nestedBuilder, loc, valueToStore, alloc, + ValueRange{m, n}); + }); + + // Replace this operation with the generated alloc. + rewriter.replaceOp(op, alloc); + + return success(); + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// ToyToAffineLoweringPass +//===----------------------------------------------------------------------===// + +/// This is a partial lowering to affine loops of the toy operations that are +/// computationally intensive (like matmul for example...) while keeping the +/// rest of the code in the Toy dialect. 
+namespace { +struct ToyToAffineLoweringPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToAffineLoweringPass) + StringRef getArgument() const override { return "toy-to-affine"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; +} // namespace + +void ToyToAffineLoweringPass::runOnOperation() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. + ConversionTarget target(getContext()); + + // We define the specific operations, or dialects, that are legal targets for + // this lowering. In our case, we are lowering to a combination of the + // `Affine`, `Arith`, `Func`, and `MemRef` dialects. + target.addLegalDialect(); + + // We also define the Toy dialect as Illegal so that the conversion will fail + // if any of these operations are *not* converted. Given that we actually want + // a partial lowering, we explicitly mark the Toy operations that don't want + // to lower, `toy.print`, as `legal`. `toy.print` will still need its operands + // to be updated though (as we convert from TensorType to MemRefType), so we + // only treat it as `legal` if its operands are legal. + target.addIllegalDialect(); + target.addDynamicallyLegalOp([](toy::PrintOp op) { + return llvm::none_of(op->getOperandTypes(), + [](Type type) { return llvm::isa(type); }); + }); + + // Now that the conversion target has been defined, we just need to provide + // the set of patterns that will lower the Toy operations. + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` + // operations were not converted successfully. 
+ if (failed( + applyPartialConversion(getOperation(), target, std::move(patterns)))) + signalPassFailure(); +} + +/// Create a pass for lowering operations in the `Affine` and `Std` dialects, +/// for a subset of the Toy IR (e.g. matmul). +std::unique_ptr mlir::toy::createLowerToAffinePass() { + return std::make_unique(); +} diff --git a/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp b/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp new file mode 100644 index 0000000..58e59d9 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToCudaTile.cpp @@ -0,0 +1,537 @@ +#include "cuda_tile/Dialect/CudaTile/IR/Types.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "mlir/Transforms/DialectConversion.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/LogicalResult.h" + +#include "cuda_tile/Dialect/CudaTile/IR/Dialect.h" +#include "cuda_tile/Dialect/CudaTile/IR/Ops.h" + +#include +#include +#include +#include + +#define DEBUG_TYPE "toy-to-cuda-tile" + +void debugPrintShape(mlir::ArrayRef shape, + llvm::StringRef prefix = "") { + std::string shapeStr; + llvm::raw_string_ostream shapeOS(shapeStr); + shapeOS << "["; + llvm::interleaveComma(shape, shapeOS); + shapeOS << "]"; + shapeOS.flush(); + LDBG() << prefix << shapeStr; +} + +mlir::cuda_tile::MakeTensorViewOp +makeTensorViewForArg(mlir::OpBuilder &rewriter, mlir::Location loc, + mlir::Value 
arg, mlir::ArrayRef shape) { + auto resultType = rewriter.getI64ArrayAttr(shape); + LDBG() << "shape: " << resultType; + auto ptrElem = + llvm::dyn_cast(arg.getType()).getElementType(); + auto eleType = + llvm::dyn_cast(ptrElem).getPointeeType(); + mlir::cuda_tile::TensorViewType tensorViewType = + mlir::cuda_tile::TensorViewType::get(rewriter.getContext(), eleType, + shape, + /*strides=*/{shape.back(), 1}); + // LDBG() << "Creating TensorViewType: " << tensorViewType; + auto make_tensor_view = mlir::cuda_tile::MakeTensorViewOp::create( + rewriter, loc, tensorViewType, arg, + /*dynamicShape=*/mlir::ValueRange{}, + /*dynamicStrides=*/mlir::ValueRange{}); + return make_tensor_view; +} + +int64_t alignPower2(int x) { + int64_t power = 1; + while (power < x) { + power *= 2; + } + return power; +} + +static bool isFromFuncArg(mlir::Value v) { + if (auto barg = llvm::dyn_cast(v)) { + return true; + } + return false; +} + +static std::optional insertUnrealizedConversionCastOp( + mlir::Value opv, mlir::Value v, mlir::RankedTensorType logical, + mlir::PatternRewriter &rewriter, mlir::Location loc) { + auto tileTy = llvm::dyn_cast(v.getType()); + if (!tileTy) + return v; + + auto elemTy = tileTy.getElementType(); + auto ptrTy = llvm::dyn_cast(elemTy); + if (!ptrTy) + return v; + + if (!isFromFuncArg(opv)) { + auto logicalTy = llvm::cast(opv.getType()); + auto alignedI32 = llvm::to_vector<4>( + llvm::map_range(logicalTy.getShape(), [](int64_t dim) { + return static_cast(alignPower2(dim)); + })); + llvm::SmallVector alignedShape; + alignedShape.reserve(alignedI32.size()); + for (int32_t dim : alignedI32) + alignedShape.push_back(static_cast(dim)); + mlir::cuda_tile::TileType resultTileTy = + mlir::cuda_tile::TileType::get(alignedShape, ptrTy.getPointeeType()); + + // Here the TypeConverter will change the toy.add result type to + // tile>, but we actually need tile<...xf32> to do computation. so + // we need to insert a cast here. 
if we don't do this, the + // `UnrealizedConversionCastOp` will be automatically inserted later during + // conversion. like: %10 = "builtin.unrealized_conversion_cast"(%9) + // {__pure_type_conversion__} + // : (!cuda_tile.tile<2x4xf32>) -> + // !cuda_tile.tile> + // since the TypeConverter can not know which input is from function arg or + // not. so, here we do the cast manually to delete those cast Op since the + // `cuda_tile.add` can accept tile<...xf32> directly if args is not from the + // block arguments. + mlir::UnrealizedConversionCastOp castOp = + mlir::UnrealizedConversionCastOp::create(rewriter, loc, {resultTileTy}, + v); + return castOp.getResult(0); + } + + return std::nullopt; // no need to insert cast +} + +static mlir::cuda_tile::MakePartitionViewOp +makePartitionViewForArg(mlir::PatternRewriter &rewriter, mlir::Location loc, + mlir::Value v, mlir::RankedTensorType logical) { + // 1) make_tensor_view from tile> + auto tensorView = makeTensorViewForArg(rewriter, loc, v, logical.getShape()); + + // 2) 创建 partition_view,tile 形状使用 2 的幂对齐 + auto alignedI32 = + llvm::to_vector<4>(llvm::map_range(logical.getShape(), [](int64_t dim) { + return static_cast(alignPower2(dim)); + })); + auto partViewTy = mlir::cuda_tile::PartitionViewType::get( + rewriter.getContext(), rewriter.getDenseI32ArrayAttr(alignedI32), + llvm::dyn_cast( + tensorView->getResult(0).getType()), + /*partitions=*/{0, 1}, {}); + auto partView = mlir::cuda_tile::MakePartitionViewOp::create( + rewriter, loc, partViewTy, tensorView); + return partView; +} + +static mlir::Value ensureTileValue(mlir::Value opv, mlir::Value v, + mlir::RankedTensorType logical, + mlir::PatternRewriter &rewriter) { + auto loc = v.getLoc(); + auto maybeCastV = + insertUnrealizedConversionCastOp(opv, v, logical, rewriter, loc); + if (maybeCastV.has_value()) { + return maybeCastV.value(); + } + + auto alignedI32 = + llvm::to_vector<4>(llvm::map_range(logical.getShape(), [](int64_t dim) { + return 
static_cast(alignPower2(dim)); + })); + + auto partView = makePartitionViewForArg(rewriter, loc, v, logical); + + // 3) 准备索引常量和 load + auto i32TileTy = mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()); + auto zeroAttr = + mlir::DenseIntElementsAttr::get(i32TileTy, llvm::ArrayRef{0}); + auto zeroIdx = + mlir::cuda_tile::ConstantOp::create(rewriter, loc, i32TileTy, zeroAttr); + + auto memOrd = mlir::cuda_tile::MemoryOrderingSemanticsAttr::get( + rewriter.getContext(), mlir::cuda_tile::MemoryOrderingSemantics::WEAK); + auto tokenTy = mlir::cuda_tile::TokenType::get(rewriter.getContext()); + + // auto memory_ordering_attr = + // mlir::cuda_tile::MemoryOrderingSemanticsAttr::get( + // rewriter.getContext(), + // mlir::cuda_tile::MemoryOrderingSemantics::WEAK); + + auto tensorViewTy = llvm::cast( + partView.getTensorView().getType()); + LDBG() << "TensorViewType for LoadViewTkoOp: " << tensorViewTy; + llvm::SmallVector alignedLoadShape(alignedI32.begin(), + alignedI32.end()); + debugPrintShape(alignedLoadShape); + + auto resTileTy = mlir::cuda_tile::TileType::get( + {alignedLoadShape.begin(), alignedLoadShape.end()}, + tensorViewTy.getElementType()); + auto load = mlir::cuda_tile::LoadViewTkoOp::create( + rewriter, loc, {resTileTy, tokenTy}, memOrd, {}, partView, + mlir::ValueRange{zeroIdx, zeroIdx}, {}, {}); + + return load.getResult(0); +} + +static mlir::Value ensureStoreValue(mlir::Value opv, mlir::Value v, + mlir::RankedTensorType logical, + mlir::PatternRewriter &rewriter) { + auto loc = v.getLoc(); + auto castOp = + insertUnrealizedConversionCastOp(opv, v, logical, rewriter, loc); + if (castOp.has_value()) { + return castOp.value(); + } + return v; +} + +//===----------------------------------------------------------------------===// +// 1) TypeConverter: tensor<...xf32> -> tile> (plus we will create +// views) +//===----------------------------------------------------------------------===// +struct ToyToCudaTileTypeConverter : public mlir::TypeConverter 
{ + ToyToCudaTileTypeConverter(mlir::MLIRContext *ctx) { + addConversion([](mlir::Type t) { return t; }); // identity for others + + addConversion([&](mlir::RankedTensorType t) -> mlir::Type { + // Example: only handle f32 ranked tensor for now. + auto elemTy = llvm::dyn_cast(t.getElementType()); + if (!elemTy || elemTy.getWidth() != 32) + return {}; + + auto ptrElem = mlir::cuda_tile::PointerType::get(elemTy); + auto newType = mlir::cuda_tile::TileType::get({}, ptrElem); + + // tile> : the exact spelling depends on your cuda_tile dialect + // types. + return newType; + }); + + // Important: if you have tensor results too, you need a materialization + // strategy. e.g. create temporary buffers and store into them, or return + // ptr to output. + } +}; + +//===----------------------------------------------------------------------===// +// 2) Pattern: toy.gpu_func -> create cuda_tile.module entry +//===----------------------------------------------------------------------===// +struct LowerToyGPUFuncToCudaTileEntry + : public mlir::OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(mlir::toy::GPUFuncOp op, + mlir::toy::GPUFuncOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + // Find / create cuda_tile.module container (you can also create once in + // pass) + auto moduleOp = op->getParentOfType(); + mlir::cuda_tile::ModuleOp cudaMod; + for (auto m : moduleOp.getOps()) { + cudaMod = m; + break; + } + + if (!cudaMod) { + rewriter.setInsertionPointToEnd(moduleOp.getBody()); + cudaMod = mlir::cuda_tile::ModuleOp::create(rewriter, op.getLoc(), + "cuda_tile_module"); + } + LDBG() << "Found / Created CudaTile Module: \n" << cudaMod; + + llvm::SmallVector entryArgTys; + llvm::SmallVector, 4> entryArgShapes; + for (auto t : op.getFunctionType().getInputs()) { + auto ct = getTypeConverter()->convertType(t); + if (!ct) + return rewriter.notifyMatchFailure(op, "cannot convert arg 
type"); + LDBG() << "Converted arg type: " << ct; + entryArgTys.push_back(ct); + auto rt = llvm::dyn_cast(t); + entryArgShapes.push_back(rt.getShape()); + } + + for (auto t : op.getFunctionType().getResults()) { + auto ct = getTypeConverter()->convertType(t); + if (!ct) + return rewriter.notifyMatchFailure(op, "cannot convert result type"); + LDBG() << "Converted result type: " << ct; + // Optionally, add as extra arg instead of return. + entryArgTys.push_back(ct); + auto rt = llvm::dyn_cast(t); + entryArgShapes.push_back(rt.getShape()); + } + auto newFnType = rewriter.getFunctionType(entryArgTys, {}); + + mlir::Block &bodyBlock = cudaMod.getBodyRegion().front(); + mlir::OpBuilder::InsertionGuard guard(rewriter); + + rewriter.setInsertionPointToStart(&bodyBlock); + + auto entry = mlir::cuda_tile::EntryOp::create( + rewriter, op.getLoc(), op.getSymName(), newFnType, + /*arg_attrs=*/{}, /*res_attrs=*/{}, {}); + + LDBG() << "CudaTile Module: \n" << cudaMod; + + auto *bb = entry.addEntryBlock(); + + rewriter.setInsertionPointToStart(bb); + // 1. create a get_tile_block_id op + auto tileBlockId = mlir::cuda_tile::GetTileBlockIdOp::create( + rewriter, op->getLoc(), + {mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()), + mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()), + mlir::cuda_tile::TileType::get({}, rewriter.getI32Type())}); + + llvm::SmallVector tensorViews; + + // for (auto [idx, arg] : llvm::enumerate(bb->getArguments())) { + // // 2. 
create a make_tensor_view op + // auto resultType = rewriter.getI64ArrayAttr(entryArgShapes[idx]); + // LDBG() << "Argument " << idx << " : " << arg << ", shape: " << + // resultType; auto ptrElem = + // llvm::dyn_cast(arg.getType()) + // .getElementType(); + // auto eleType = llvm::dyn_cast(ptrElem) + // .getPointeeType(); + // mlir::cuda_tile::TensorViewType tensorViewType = + // mlir::cuda_tile::TensorViewType::get( + // rewriter.getContext(), eleType, entryArgShapes[idx], + // /*strides=*/{entryArgShapes[idx].back(), 1}); + // // LDBG() << "Creating TensorViewType: " << tensorViewType; + // auto make_tensor_view = mlir::cuda_tile::MakeTensorViewOp::create( + // rewriter, op->getLoc(), tensorViewType, arg, + // /*dynamicShape=*/mlir::ValueRange{}, + // /*dynamicStrides=*/mlir::ValueRange{}); + // // LDBG() << "Created MakeTensorViewOp: \n" << make_tensor_view ; + // tensorViews.push_back(make_tensor_view.getResult()); + // } + for (auto [idx, arg] : llvm::enumerate(bb->getArguments())) { + tensorViews.push_back(arg); + } + + auto *srcBlock = &op.getBody().front(); + llvm::SmallVector argValues; + argValues.reserve(srcBlock->getNumArguments()); + for (unsigned i = 0; i < srcBlock->getNumArguments(); ++i) { + argValues.push_back(tensorViews[i]); + } + + auto *srcTerminator = srcBlock->getTerminator(); + rewriter.mergeBlocks(srcBlock, bb, argValues); + + auto retOp = mlir::cuda_tile::ReturnOp::create(rewriter, op.getLoc()); + + LDBG() << "Created CudaTile Entry Op: \n" << entry; + + // Erase old op. 
+ rewriter.eraseOp(op); + return mlir::success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToCudaTile Conversion Patterns: Binary operations +//===----------------------------------------------------------------------===// + +template +struct BinaryOpLowering : public mlir::OpConversionPattern { + using mlir::OpConversionPattern::OpConversionPattern; + using OpAdaptor = typename mlir::OpConversionPattern::OpAdaptor; + + llvm::LogicalResult + matchAndRewrite(BinaryOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const final { + auto loc = op->getLoc(); + auto logical = + llvm::dyn_cast(op.getResult().getType()); + if (!logical) { + return rewriter.notifyMatchFailure(op, "result is not RankedTensorType"); + } + auto lhsLoaded = + ensureTileValue(op.getLhs(), adaptor.getLhs(), logical, rewriter); + auto rhsLoaded = + ensureTileValue(op.getRhs(), adaptor.getRhs(), logical, rewriter); + + LDBG() << "After ensureTileValue LHS: " << lhsLoaded; + LDBG() << "After ensureTileValue RHS: " << rhsLoaded; + + auto tileTy = lhsLoaded.getType(); + auto binOp = LoweredBinaryOp::create(rewriter, loc, tileTy, lhsLoaded, + rhsLoaded, {}); + rewriter.replaceOp(op, binOp.getResult()); + return llvm::success(); + } +}; +using AddOpLowering = + BinaryOpLowering; +using MulOpLowering = + BinaryOpLowering; + +//===----------------------------------------------------------------------===// +// ToyToCudaTile Conversion Patterns: Return operations +//===----------------------------------------------------------------------===// + +struct ReturnLowering : public mlir::OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(mlir::toy::ReturnOp op, mlir::toy::ReturnOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto outputPtr = op->getBlock()->getArguments().back(); + auto logical = + 
llvm::dyn_cast(op->getOperand(0).getType()); + if (!logical) { + return rewriter.notifyMatchFailure(op, "result is not RankedTensorType"); + } + auto retValLoaded = ensureStoreValue( + op.getOperand(0), adaptor.getOperands().front(), logical, rewriter); + LDBG() << "After ensureStoreValue RET: " << retValLoaded; + + auto partView = makePartitionViewForArg(rewriter, loc, outputPtr, logical); + + auto tkTy = mlir::cuda_tile::TokenType::get(rewriter.getContext()); + auto memoryOrd = mlir::cuda_tile::MemoryOrderingSemanticsAttr::get( + rewriter.getContext(), mlir::cuda_tile::MemoryOrderingSemantics::WEAK); + + auto i32TileTy = mlir::cuda_tile::TileType::get({}, rewriter.getI32Type()); + auto zeroAttr = + mlir::DenseIntElementsAttr::get(i32TileTy, llvm::ArrayRef{0}); + auto zeroIdx = + mlir::cuda_tile::ConstantOp::create(rewriter, loc, i32TileTy, zeroAttr); + + auto storeOp = mlir::cuda_tile::StoreViewTkoOp::create( + rewriter, loc, {tkTy}, memoryOrd, {}, retValLoaded, partView, + mlir::ValueRange{zeroIdx, zeroIdx}, {}, {}); + + rewriter.eraseOp(op); + return mlir::success(); + } +}; + +//===----------------------------------------------------------------------===// +// ToyToCudaTileLoweringPass +//===----------------------------------------------------------------------===// + +namespace { +struct ToyToCudaTileLoweringPass + : public mlir::PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToCudaTileLoweringPass) + + llvm::StringRef getArgument() const override { return "toy-to-cuda-tile"; } + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() final; +}; +}; // namespace + +mlir::cuda_tile::ModuleOp createCudaModuleOp(mlir::OpBuilder &builder, + mlir::ModuleOp &moduleOp) { + mlir::OpBuilder::InsertionGuard guard(builder); + + builder.setInsertionPoint(moduleOp.getBody(), moduleOp.getBody()->end()); + auto cudaTileModuleOp = mlir::cuda_tile::ModuleOp::create( + builder, 
moduleOp.getLoc(), "cuda_tile_module"); + + LDBG() << "Created CudaTile Module: \n" << cudaTileModuleOp; + return cudaTileModuleOp; +} + +void ToyToCudaTileLoweringPass::runOnOperation() { + auto moduleOp = getOperation(); + auto *ctx = moduleOp.getContext(); + mlir::ConversionTarget target(*ctx); + target.addLegalDialect(); + target.addLegalOp(); + + // Keep host-side toy.func/main legal (or lower it later). + target.addLegalOp(); + + target.addIllegalOp(); + + moduleOp.walk([&](mlir::toy::GPUFuncOp gfun) { + ToyToCudaTileTypeConverter typeConverter(ctx); + mlir::RewritePatternSet patterns(ctx); + + patterns.add(typeConverter, ctx); + + if (mlir::failed( + mlir::applyFullConversion(gfun, target, std::move(patterns)))) + signalPassFailure(); + }); + + // ------------------------------- + // auto moduleOp = getOperation(); + // auto *ctx = moduleOp.getContext(); + // // The first thing to define is the conversion target. This will define the + // // final target for this lowering. + // mlir::ConversionTarget target(*ctx); + + // target.addLegalDialect(); + // target.addLegalOp(); + + // // Keep host-side toy.func/main legal (or lower it later). 
+ // target.addLegalOp(); + + // target + // .addIllegalOp(); + + // ToyToCudaTileTypeConverter typeConv(&*ctx); + + // mlir::RewritePatternSet patterns(&*ctx); + // patterns.add(typeConv, &*ctx); + + // // TODO: add patterns for toy.transpose/toy.matmul/toy.add/toy.mul + // patterns.add(typeConv, ctx); + + // if (mlir::failed(mlir::applyPartialConversion(moduleOp, target, + // std::move(patterns)))) { + // signalPassFailure(); + // } + // ------------------------------- +} + +namespace mlir::toy { + +std::unique_ptr createCudaTileLoweringPass() { + return std::make_unique(); +}; + +}; // namespace mlir::toy diff --git a/mlir/cuda-tile/Toy/mlir/LowerToGpu.cpp b/mlir/cuda-tile/Toy/mlir/LowerToGpu.cpp new file mode 100644 index 0000000..cf2bb4b --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToGpu.cpp @@ -0,0 +1,287 @@ +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/SymbolTable.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" + +#include +#include + +#define DEBUG_TYPE "toy-gpu-outline" + +namespace { + +static bool isGpuOperation(mlir::Operation *op, + const llvm::SmallSet &gpuOps) { + llvm::StringRef opName = op->getName().getStringRef().split('.').second; + return gpuOps.contains(opName); +} + +static llvm::SmallVector parseGrid(llvm::StringRef gridStr) { + llvm::SmallVector dims; + llvm::SmallVector pieces; + gridStr.split(pieces, ','); + for 
(llvm::StringRef piece : pieces) { + int64_t value = 0; + if (!piece.empty() && llvm::to_integer(piece.trim(), value)) + dims.push_back(value); + } + if (dims.size() != 3) + dims = {1, 1, 1}; + return dims; +} + +struct GpuOutlinePass + : public mlir::PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GpuOutlinePass) + + std::string grid{"1,1,1"}; + + llvm::StringRef getArgument() const override { return "toy-gpu-outline"; } + + void initializeOptions(std::string grid) { this->grid = grid; } + + void runOnOperation() override { + auto func = getOperation(); + if (func.getName() != "main") + return; + + llvm::SmallSet gpuOperations = {"matmul", "add", "mul", + "transpose"}; + + // // Collect GPU-eligible ops in block order for deterministic cloning. + // llvm::SmallDenseSet gpuOpSet; + // llvm::SmallVector gpuOps; + + // for (mlir::Operation &op : func.front()) { + // if (isGpuOperation(&op, gpuOperations)) { + // gpuOpSet.insert(&op); + // gpuOps.push_back(&op); + // } + // } + + // if (gpuOps.empty()) + // return; + + llvm::SmallVector gridDims = parseGrid(grid); + + llvm::SmallVector> gpuSubgraphs; + + // Find a gpu subgraph like + // [[gpuOps, ...], [gpuOps, ...], ...] 
+ // original sequence: + // [..., non-gpu-op, [gpu-op, gpu-op], non-gpu-op, [gpu-op, ...]] + func.walk([&](mlir::Operation *op) { + if (isGpuOperation(op, gpuOperations)) { + if (gpuSubgraphs.empty()) { + gpuSubgraphs.push_back({op}); + } else { + gpuSubgraphs.back().push_back(op); + } + } else { + if (gpuSubgraphs.empty()) { + gpuSubgraphs.push_back({}); + } else if (!gpuSubgraphs.back().empty()) { + gpuSubgraphs.push_back({}); + } + } + }); + + if (gpuSubgraphs.empty()) + return; + + bool allEmpty = llvm::all_of( + gpuSubgraphs, [](const llvm::SmallVector &sg) { + return sg.empty(); + }); + + if (allEmpty) + return; + + if (gpuSubgraphs.back().empty()) { + gpuSubgraphs.pop_back(); + } + + for (const auto &gpuSubgraph : gpuSubgraphs) { + LDBG() << "----GPU subgraph----"; + for (const auto &op : gpuSubgraph) { + LDBG() << *op; + } + LDBG() << "--------------------"; + } + + llvm::SmallVector outlinedFuncNames; + llvm::SmallVector insertPoints; + + // the logic to outline each gpu subgraph + // 1. find operands or input for the subgraph (exclude the input inside + // subgraph). + // 2. find results or output for the subgraph (exclude the output inside + // subgraph). + // 3. create a new function with operands as input and results as output. + // 4. insert a LaunchGpuOp to call the outlined function at the insert point + + for (const auto &[index, gpuSubgraph] : llvm::enumerate(gpuSubgraphs)) { + if (!gpuSubgraph.empty()) { + LDBG() << "----GPU subgraph----"; + for (const auto &op : gpuSubgraph) { + LDBG() << *op; + } + + // Identify its operands. 
+ llvm::SmallVector Operands; + llvm::SmallPtrSet OperandSet; + for (mlir::Operation *op : gpuSubgraph) { + for (mlir::Value operand : op->getOperands()) { + auto *def = operand.getDefiningOp(); + if (!def || !isGpuOperation(def, gpuOperations)) { + if (OperandSet.insert(operand).second) + Operands.push_back(operand); + } + } + } + + LDBG() << "Operands:"; + for (mlir::Value &operand : Operands) { + LDBG() << " " << operand; + } + + llvm::SmallVector Results; + llvm::SmallPtrSet ResultSet; + + for (mlir::Operation *op : gpuSubgraph) { + for (mlir::Value result : op->getResults()) { + bool escapes = + llvm::any_of(result.getUsers(), [&](mlir::Operation *user) { + return !isGpuOperation(user, gpuOperations); + }); + if (escapes && ResultSet.insert(result).second) + Results.push_back(result); + } + } + + LDBG() << "Results:"; + for (mlir::Value &result : Results) { + LDBG() << " " << result; + } + + if (Results.size() != 1) { + llvm::errs() + << "Currently only support single result GPU kernel " + << "Since the toy return op only supports single return value " + << "Found " << Results.size() << " results"; + return signalPassFailure(); + } + + // buid the kernel for each subgraph + llvm::SmallVector argTypes; + argTypes.reserve(Operands.size()); + for (mlir::Value v : Operands) + argTypes.push_back(v.getType()); + + llvm::SmallVector resultTypes; + resultTypes.reserve(Results.size()); + for (mlir::Value v : Results) + resultTypes.push_back(v.getType()); + + mlir::ModuleOp module = func->getParentOfType(); + mlir::SymbolTable symbolTable(module); + std::string outline_func_name = + "outlined_gpu_kernel_" + std::to_string(index); + + unsigned suffix = 0; + while (symbolTable.lookup(outline_func_name)) + outline_func_name = + outline_func_name + "_" + std::to_string(++suffix); + + insertPoints.push_back(gpuSubgraph.front()); + + { + mlir::OpBuilder moduleBuilder(module.getContext()); + mlir::OpBuilder::InsertionGuard guard(moduleBuilder); + 
moduleBuilder.setInsertionPointToEnd(module.getBody()); + auto funcType = moduleBuilder.getFunctionType(argTypes, resultTypes); + auto gpuFunc = mlir::toy::GPUFuncOp::create( + moduleBuilder, func.getLoc(), outline_func_name, funcType); + + mlir::Block &kernelEntry = gpuFunc.getBody().front(); + mlir::OpBuilder kernelBuilder = + mlir::OpBuilder::atBlockEnd(&kernelEntry); + + mlir::IRMapping mapping; + for (auto [blockArg, captured] : + llvm::zip(kernelEntry.getArguments(), Operands)) + mapping.map(captured, blockArg); + + for (mlir::Operation *op : gpuSubgraph) { + kernelBuilder.clone(*op, mapping); + } + llvm::SmallVector mappedResults; + mappedResults.reserve(Results.size()); + for (mlir::Value res : Results) + mappedResults.push_back(mapping.lookup(res)); + mlir::toy::ReturnOp::create(kernelBuilder, func.getLoc(), + mappedResults); + + LDBG() << "Created GPU kernel: " << gpuFunc; + } + + outlinedFuncNames.push_back(outline_func_name); + + { + mlir::OpBuilder hostBuilder(func.getContext()); + mlir::OpBuilder::InsertionGuard guard(hostBuilder); + // Insert the host launch in place of the first outlined op. 
+ hostBuilder.setInsertionPoint(gpuSubgraph.back()->getNextNode()); + + auto calleeAttr = mlir::SymbolRefAttr::get( + func.getContext(), llvm::StringRef(outline_func_name)); + + auto gridAttr = hostBuilder.getDenseI64ArrayAttr(gridDims); + + auto launch = mlir::toy::LaunchGpuOp::create( + hostBuilder, func.getLoc(), resultTypes, Operands, + {{"callee", calleeAttr}, {"grid", gridAttr}}); + + for (auto [idx, res] : llvm::enumerate(Results)) + res.replaceAllUsesWith(launch.getResult(idx)); + + for (mlir::Operation *op : llvm::reverse(gpuSubgraph)) + op->erase(); + LDBG() << "Inserted LaunchGpuOp: " << launch; + } + LDBG() << "--------------------"; + } + } + }; +}; +}; // namespace + +namespace mlir::toy { + +std::unique_ptr createGpuOutlinePass(std::string grid) { + auto pass = std::make_unique(); + pass->initializeOptions(grid); // You can change the grid dimensions here + return pass; +}; + +}; // namespace mlir::toy diff --git a/mlir/cuda-tile/Toy/mlir/LowerToLLVM.cpp b/mlir/cuda-tile/Toy/mlir/LowerToLLVM.cpp new file mode 100644 index 0000000..ad6c5bb --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/LowerToLLVM.cpp @@ -0,0 +1,248 @@ +//====- LowerToLLVM.cpp - Lowering from Toy+Affine+Std to LLVM ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements full lowering of Toy operations to LLVM MLIR dialect. +// 'toy.print' is lowered to a loop nest that calls `printf` on each element of +// the input array. The file also sets up the ToyToLLVMLoweringPass. 
This pass +// lowers the combination of Arithmetic + Affine + SCF + Func dialects to the +// LLVM one: +// +// Affine -- +// | +// v +// Arithmetic + Func --> LLVM (Dialect) +// ^ +// | +// 'toy.print' --> Loop (SCF) -- +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" + +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/LLVMCommon/ConversionTarget.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/Casting.h" +#include +#include + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// ToyToLLVM Conversion Patterns +//===----------------------------------------------------------------------===// + +namespace { +/// Lowers `toy.print` to a loop nest calling `printf` on each of the individual +/// elements of the array. 
+class PrintOpLowering : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(toy::PrintOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto *context = rewriter.getContext(); + auto memRefType = llvm::cast((*op->operand_type_begin())); + auto memRefShape = memRefType.getShape(); + auto loc = op->getLoc(); + + ModuleOp parentModule = op->getParentOfType(); + + // Get a symbol reference to the printf function, inserting it if necessary. + auto printfRef = getOrInsertPrintf(rewriter, parentModule); + Value formatSpecifierCst = getOrCreateGlobalString( + loc, rewriter, "frmt_spec", StringRef("%f \0", 4), parentModule); + Value newLineCst = getOrCreateGlobalString( + loc, rewriter, "nl", StringRef("\n\0", 2), parentModule); + + // Create a loop for each of the dimensions within the shape. + SmallVector loopIvs; + for (unsigned i = 0, e = memRefShape.size(); i != e; ++i) { + auto lowerBound = arith::ConstantIndexOp::create(rewriter, loc, 0); + auto upperBound = + arith::ConstantIndexOp::create(rewriter, loc, memRefShape[i]); + auto step = arith::ConstantIndexOp::create(rewriter, loc, 1); + auto loop = + scf::ForOp::create(rewriter, loc, lowerBound, upperBound, step); + for (Operation &nested : make_early_inc_range(*loop.getBody())) + rewriter.eraseOp(&nested); + loopIvs.push_back(loop.getInductionVar()); + + // Terminate the loop body. + rewriter.setInsertionPointToEnd(loop.getBody()); + + // Insert a newline after each of the inner dimensions of the shape. + if (i != e - 1) + LLVM::CallOp::create(rewriter, loc, getPrintfType(context), printfRef, + newLineCst); + scf::YieldOp::create(rewriter, loc); + rewriter.setInsertionPointToStart(loop.getBody()); + } + + // Generate a call to printf for the current element of the loop. 
+ auto elementLoad = + memref::LoadOp::create(rewriter, loc, op.getInput(), loopIvs); + + // Varargs promotion: float -> double + Value arg = elementLoad; + Type t = elementLoad.getType(); + if (t.isF32()) { + arg = arith::ExtFOp::create(rewriter, loc, rewriter.getF64Type(), arg); + } else if (!t.isF64()) { + return rewriter.notifyMatchFailure(op, "toy.print only supports f32/f64"); + } + + LLVM::CallOp::create(rewriter, loc, getPrintfType(context), printfRef, + ArrayRef({formatSpecifierCst, arg})); + + // Notify the rewriter that this operation has been removed. + rewriter.eraseOp(op); + return success(); + } + +private: + /// Create a function declaration for printf, the signature is: + /// * `i32 (i8*, ...)` + static LLVM::LLVMFunctionType getPrintfType(MLIRContext *context) { + auto llvmI32Ty = IntegerType::get(context, 32); + auto llvmPtrTy = LLVM::LLVMPointerType::get(context); + auto llvmFnType = LLVM::LLVMFunctionType::get(llvmI32Ty, llvmPtrTy, + /*isVarArg=*/true); + return llvmFnType; + } + + /// Return a symbol reference to the printf function, inserting it into the + /// module if necessary. + static FlatSymbolRefAttr getOrInsertPrintf(PatternRewriter &rewriter, + ModuleOp module) { + auto *context = module.getContext(); + if (module.lookupSymbol("printf")) + return SymbolRefAttr::get(context, "printf"); + + // Insert the printf function into the body of the parent module. + PatternRewriter::InsertionGuard insertGuard(rewriter); + rewriter.setInsertionPointToStart(module.getBody()); + LLVM::LLVMFuncOp::create(rewriter, module.getLoc(), "printf", + getPrintfType(context)); + return SymbolRefAttr::get(context, "printf"); + } + + /// Return a value representing an access into a global string with the given + /// name, creating the string if necessary. + static Value getOrCreateGlobalString(Location loc, OpBuilder &builder, + StringRef name, StringRef value, + ModuleOp module) { + // Create the global at the entry of the module. 
+ LLVM::GlobalOp global; + if (!(global = module.lookupSymbol(name))) { + OpBuilder::InsertionGuard insertGuard(builder); + builder.setInsertionPointToStart(module.getBody()); + auto type = LLVM::LLVMArrayType::get( + IntegerType::get(builder.getContext(), 8), value.size()); + global = LLVM::GlobalOp::create(builder, loc, type, /*isConstant=*/true, + LLVM::Linkage::Internal, name, + builder.getStringAttr(value), + /*alignment=*/0); + } + + // Get the pointer to the first character in the global string. + Value globalPtr = LLVM::AddressOfOp::create(builder, loc, global); + Value cst0 = LLVM::ConstantOp::create(builder, loc, builder.getI64Type(), + builder.getIndexAttr(0)); + return LLVM::GEPOp::create( + builder, loc, LLVM::LLVMPointerType::get(builder.getContext()), + global.getType(), globalPtr, ArrayRef({cst0, cst0})); + } +}; +} // namespace + +//===----------------------------------------------------------------------===// +// ToyToLLVMLoweringPass +//===----------------------------------------------------------------------===// + +namespace { +struct ToyToLLVMLoweringPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ToyToLLVMLoweringPass) + StringRef getArgument() const override { return "toy-to-llvm"; } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() final; +}; +} // namespace + +void ToyToLLVMLoweringPass::runOnOperation() { + // The first thing to define is the conversion target. This will define the + // final target for this lowering. For this lowering, we are only targeting + // the LLVM dialect. + LLVMConversionTarget target(getContext()); + target.addLegalOp(); + + // During this lowering, we will also be lowering the MemRef types, that are + // currently being operated on, to a representation in LLVM. To perform this + // conversion we use a TypeConverter as part of the lowering. This converter + // details how one type maps to another. 
This is necessary now that we will be + // doing more complicated lowerings, involving loop region arguments. + LLVMTypeConverter typeConverter(&getContext()); + + // Now that the conversion target has been defined, we need to provide the + // patterns used for lowering. At this point of the compilation process, we + // have a combination of `toy`, `affine`, and `std` operations. Luckily, there + // are already exists a set of patterns to transform `affine` and `std` + // dialects. These patterns lowering in multiple stages, relying on transitive + // lowerings. Transitive lowering, or A->B->C lowering, is when multiple + // patterns must be applied to fully transform an illegal operation into a + // set of legal ones. + RewritePatternSet patterns(&getContext()); + populateAffineToStdConversionPatterns(patterns); + populateSCFToControlFlowConversionPatterns(patterns); + mlir::arith::populateArithToLLVMConversionPatterns(typeConverter, patterns); + populateFinalizeMemRefToLLVMConversionPatterns(typeConverter, patterns); + cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns); + populateFuncToLLVMConversionPatterns(typeConverter, patterns); + + // The only remaining operation to lower from the `toy` dialect, is the + // PrintOp. + patterns.add(&getContext()); + + // We want to completely lower to LLVM, so we use a `FullConversion`. This + // ensures that only legal operations will remain after the conversion. + auto module = getOperation(); + if (failed(applyFullConversion(module, target, std::move(patterns)))) + signalPassFailure(); +} + +/// Create a pass for lowering operations the remaining `Toy` operations, as +/// well as `Affine` and `Std`, to the LLVM dialect for codegen. 
+std::unique_ptr mlir::toy::createLowerToLLVMPass() { + return std::make_unique(); +} diff --git a/mlir/cuda-tile/Toy/mlir/MLIRGen.cpp b/mlir/cuda-tile/Toy/mlir/MLIRGen.cpp new file mode 100644 index 0000000..bc1a972 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/MLIRGen.cpp @@ -0,0 +1,468 @@ +//===- MLIRGen.cpp - MLIR Generation from a Toy AST -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple IR generation targeting MLIR from a Module AST +// for the Toy language. +// +//===----------------------------------------------------------------------===// + +#include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "toy/AST.h" +#include "toy/Dialect.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include +#include +#include +#include +#include +#include + +using namespace mlir::toy; +using namespace toy; + +using llvm::ArrayRef; +using llvm::cast; +using llvm::dyn_cast; +using llvm::isa; +using llvm::ScopedHashTableScope; +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +/// Implementation of a simple MLIR emission from the Toy AST. 
+/// +/// This will emit operations that are specific to the Toy language, preserving +/// the semantics of the language and (hopefully) allow to perform accurate +/// analysis and transformation based on these high level semantics. +class MLIRGenImpl { +public: + MLIRGenImpl(mlir::MLIRContext &context) : builder(&context) {} + + /// Public API: convert the AST for a Toy module (source file) to an MLIR + /// Module operation. + mlir::ModuleOp mlirGen(ModuleAST &moduleAST) { + // We create an empty MLIR module and codegen functions one at a time and + // add them to the module. + theModule = mlir::ModuleOp::create(builder.getUnknownLoc()); + + for (FunctionAST &f : moduleAST) + mlirGen(f); + + // Verify the module after we have finished constructing it, this will check + // the structural properties of the IR and invoke any specific verifiers we + // have on the Toy operations. + if (failed(mlir::verify(theModule))) { + theModule.emitError("module verification error"); + return nullptr; + } + + return theModule; + } + +private: + /// A "module" matches a Toy source file: containing a list of functions. + mlir::ModuleOp theModule; + + /// The builder is a helper class to create IR inside a function. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder; + + /// The symbol table maps a variable name to a value in the current scope. + /// Entering a function creates a new scope, and the function arguments are + /// added to the mapping. When the processing of a function is terminated, the + /// scope is destroyed and the mappings created in this scope are dropped. + llvm::ScopedHashTable symbolTable; + + /// Helper conversion for a Toy AST location to an MLIR location. 
+ mlir::Location loc(const Location &loc) { + return mlir::FileLineColLoc::get(builder.getStringAttr(*loc.file), loc.line, + loc.col); + } + + /// Declare a variable in the current scope, return success if the variable + /// wasn't declared yet. + llvm::LogicalResult declare(llvm::StringRef var, mlir::Value value) { + if (symbolTable.count(var)) + return mlir::failure(); + symbolTable.insert(var, value); + return mlir::success(); + } + + /// Create the prototype for an MLIR function with as many arguments as the + /// provided Toy AST prototype. + mlir::toy::FuncOp mlirGen(PrototypeAST &proto) { + auto location = loc(proto.loc()); + + // This is a generic function, the return type will be inferred later. + // Arguments type are uniformly unranked tensors. + llvm::SmallVector argTypes(proto.getArgs().size(), + getType(VarType{})); + auto funcType = builder.getFunctionType(argTypes, /*results=*/{}); + return mlir::toy::FuncOp::create(builder, location, proto.getName(), + funcType); + } + + /// Emit a new function and add it to the MLIR module. + mlir::toy::FuncOp mlirGen(FunctionAST &funcAST) { + // Create a scope in the symbol table to hold variable declarations. + ScopedHashTableScope varScope(symbolTable); + + // Create an MLIR function for the given prototype. + builder.setInsertionPointToEnd(theModule.getBody()); + mlir::toy::FuncOp function = mlirGen(*funcAST.getProto()); + if (!function) + return nullptr; + + // Let's start the body of the function now! + mlir::Block &entryBlock = function.front(); + auto protoArgs = funcAST.getProto()->getArgs(); + + // Declare all the function arguments in the symbol table. 
+ for (const auto nameValue : + llvm::zip(protoArgs, entryBlock.getArguments())) { + if (failed(declare(std::get<0>(nameValue)->getName(), + std::get<1>(nameValue)))) + return nullptr; + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. + builder.setInsertionPointToStart(&entryBlock); + + // Emit the body of the function. + if (mlir::failed(mlirGen(*funcAST.getBody()))) { + function.erase(); + return nullptr; + } + + // Implicitly return void if no return statement was emitted. + // FIXME: we may fix the parser instead to always return the last expression + // (this would possibly help the REPL case later) + ReturnOp returnOp; + if (!entryBlock.empty()) + returnOp = dyn_cast(entryBlock.back()); + if (!returnOp) { + ReturnOp::create(builder, loc(funcAST.getProto()->loc())); + } else if (returnOp.hasOperand()) { + // Otherwise, if this return operation has an operand then add a result to + // the function. + function.setType(builder.getFunctionType( + function.getFunctionType().getInputs(), getType(VarType{}))); + } + + // If this function isn't main, then set the visibility to private. + if (funcAST.getProto()->getName() != "main") + function.setPrivate(); + + return function; + } + + /// Emit a binary operation + mlir::Value mlirGen(BinaryExprAST &binop) { + // First emit the operations for each side of the operation before emitting + // the operation itself. For example if the expression is `a + foo(a)` + // 1) First it will visiting the LHS, which will return a reference to the + // value holding `a`. This value should have been emitted at declaration + // time and registered in the symbol table, so nothing would be + // codegen'd. If the value is not in the symbol table, an error has been + // emitted and nullptr is returned. + // 2) Then the RHS is visited (recursively) and a call to `foo` is emitted + // and the result value is returned. 
If an error occurs we get a nullptr + // and propagate. + // + mlir::Value lhs = mlirGen(*binop.getLHS()); + if (!lhs) + return nullptr; + mlir::Value rhs = mlirGen(*binop.getRHS()); + if (!rhs) + return nullptr; + auto location = loc(binop.loc()); + + // Derive the operation name from the binary operator. At the moment we only + // support '+' and '*'. + switch (binop.getOp()) { + case '+': + return AddOp::create(builder, location, lhs, rhs); + case '*': + return MulOp::create(builder, location, lhs, rhs); + } + + emitError(location, "invalid binary operator '") << binop.getOp() << "'"; + return nullptr; + } + + /// This is a reference to a variable in an expression. The variable is + /// expected to have been declared and so should have a value in the symbol + /// table, otherwise emit an error and return nullptr. + mlir::Value mlirGen(VariableExprAST &expr) { + if (auto variable = symbolTable.lookup(expr.getName())) + return variable; + + emitError(loc(expr.loc()), "error: unknown variable '") + << expr.getName() << "'"; + return nullptr; + } + + /// Emit a return operation. This will return failure if any generation fails. + llvm::LogicalResult mlirGen(ReturnExprAST &ret) { + auto location = loc(ret.loc()); + + // 'return' takes an optional expression, handle that case here. + mlir::Value expr = nullptr; + if (ret.getExpr().has_value()) { + if (!(expr = mlirGen(**ret.getExpr()))) + return mlir::failure(); + } + + // Otherwise, this return operation has zero operands. + ReturnOp::create(builder, location, + expr ? ArrayRef(expr) : ArrayRef()); + return mlir::success(); + } + + /// Emit a literal/constant array. It will be emitted as a flattened array of + /// data in an Attribute attached to a `toy.constant` operation. + /// See documentation on [Attributes](LangRef.md#attributes) for more details. + /// Here is an excerpt: + /// + /// Attributes are the mechanism for specifying constant data in MLIR in + /// places where a variable is never allowed [...]. 
They consist of a name + /// and a concrete attribute value. The set of expected attributes, their + /// structure, and their interpretation are all contextually dependent on + /// what they are attached to. + /// + /// Example, the source level statement: + /// var a<2, 3> = [[1, 2, 3], [4, 5, 6]]; + /// will be converted to: + /// %0 = "toy.constant"() {value: dense, + /// [[1.000000e+00, 2.000000e+00, 3.000000e+00], + /// [4.000000e+00, 5.000000e+00, 6.000000e+00]]>} : () -> tensor<2x3xf32> + /// + mlir::Value mlirGen(LiteralExprAST &lit) { + auto type = getType(lit.getDims()); + + // The attribute is a vector with a floating point value per element + // (number) in the array, see `collectData()` below for more details. + std::vector data; + data.reserve(llvm::product_of(lit.getDims())); + collectData(lit, data); + + // The type of this attribute is tensor of 64-bit floating-point with the + // shape of the literal. + mlir::Type elementType = builder.getF32Type(); + auto dataType = mlir::RankedTensorType::get(lit.getDims(), elementType); + + // This is the actual attribute that holds the list of values for this + // tensor literal. + auto dataAttribute = + mlir::DenseElementsAttr::get(dataType, llvm::ArrayRef(data)); + + // Build the MLIR op `toy.constant`. This invokes the `ConstantOp::build` + // method. + return ConstantOp::create(builder, loc(lit.loc()), type, dataAttribute); + } + + /// Recursive helper function to accumulate the data that compose an array + /// literal. It flattens the nested structure in the supplied vector. For + /// example with this array: + /// [[1, 2], [3, 4]] + /// we will generate: + /// [ 1, 2, 3, 4 ] + /// Individual numbers are represented as floats. + /// Attributes are the way MLIR attaches constant to operations. 
+ void collectData(ExprAST &expr, std::vector &data) { + if (auto *lit = dyn_cast(&expr)) { + for (auto &value : lit->getValues()) + collectData(*value, data); + return; + } + + assert(isa(expr) && "expected literal or number expr"); + data.push_back(cast(expr).getValue()); + } + + /// Emit a call expression. It emits specific operations for the `transpose` + /// builtin. Other identifiers are assumed to be user-defined functions. + mlir::Value mlirGen(CallExprAST &call) { + llvm::StringRef callee = call.getCallee(); + auto location = loc(call.loc()); + + // Codegen the operands first. + SmallVector operands; + for (auto &expr : call.getArgs()) { + auto arg = mlirGen(*expr); + if (!arg) + return nullptr; + operands.push_back(arg); + } + + // Builtin calls have their custom operation, meaning this is a + // straightforward emission. + if (callee == "transpose") { + if (call.getArgs().size() != 1) { + emitError(location, "MLIR codegen encountered an error: toy.transpose " + "does not accept multiple arguments"); + return nullptr; + } + return TransposeOp::create(builder, location, operands[0]); + } + + if (callee == "matmul") { + if (call.getArgs().size() != 2) { + emitError(location, "MLIR codegen encountered an error: toy.matmul " + "expected 2 arguments"); + return nullptr; + } + return MatMulOp::create(builder, location, operands[0], operands[1]); + } + + // Otherwise this is a call to a user-defined function. Calls to + // user-defined functions are mapped to a custom call that takes the callee + // name as an attribute. + return GenericCallOp::create(builder, location, callee, operands); + } + + /// Emit a print expression. It emits specific operations for two builtins: + /// transpose(x) and print(x). 
+ llvm::LogicalResult mlirGen(PrintExprAST &call) { + auto arg = mlirGen(*call.getArg()); + if (!arg) + return mlir::failure(); + + PrintOp::create(builder, loc(call.loc()), arg); + return mlir::success(); + } + + /// Emit a constant for a single number (FIXME: semantic? broadcast?) + mlir::Value mlirGen(NumberExprAST &num) { + return ConstantOp::create(builder, loc(num.loc()), num.getValue()); + } + + /// Dispatch codegen for the right expression subclass using RTTI. + mlir::Value mlirGen(ExprAST &expr) { + switch (expr.getKind()) { + case toy::ExprAST::Expr_BinOp: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Var: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Literal: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Call: + return mlirGen(cast(expr)); + case toy::ExprAST::Expr_Num: + return mlirGen(cast(expr)); + default: + emitError(loc(expr.loc())) + << "MLIR codegen encountered an unhandled expr kind '" + << Twine(expr.getKind()) << "'"; + return nullptr; + } + } + + /// Handle a variable declaration, we'll codegen the expression that forms the + /// initializer and record the value in the symbol table before returning it. + /// Future expressions will be able to reference this variable through symbol + /// table lookup. + mlir::Value mlirGen(VarDeclExprAST &vardecl) { + auto *init = vardecl.getInitVal(); + if (!init) { + emitError(loc(vardecl.loc()), + "missing initializer in variable declaration"); + return nullptr; + } + + mlir::Value value = mlirGen(*init); + if (!value) + return nullptr; + + // We have the initializer value, but in case the variable was declared + // with specific shape, we emit a "reshape" operation. It will get + // optimized out later as needed. + if (!vardecl.getType().shape.empty()) { + value = ReshapeOp::create(builder, loc(vardecl.loc()), + getType(vardecl.getType()), value); + } + + // Register the value in the symbol table. 
+ if (failed(declare(vardecl.getName(), value))) + return nullptr; + return value; + } + + /// Codegen a list of expression, return failure if one of them hit an error. + llvm::LogicalResult mlirGen(ExprASTList &blockAST) { + ScopedHashTableScope varScope(symbolTable); + for (auto &expr : blockAST) { + // Specific handling for variable declarations, return statement, and + // print. These can only appear in block list and not in nested + // expressions. + if (auto *vardecl = dyn_cast(expr.get())) { + if (!mlirGen(*vardecl)) + return mlir::failure(); + continue; + } + if (auto *ret = dyn_cast(expr.get())) + return mlirGen(*ret); + if (auto *print = dyn_cast(expr.get())) { + if (mlir::failed(mlirGen(*print))) + return mlir::success(); + continue; + } + + // Generic expression dispatch codegen. + if (!mlirGen(*expr)) + return mlir::failure(); + } + return mlir::success(); + } + + /// Build a tensor type from a list of shape dimensions. + mlir::Type getType(ArrayRef shape) { + // If the shape is empty, then this type is unranked. + if (shape.empty()) + return mlir::UnrankedTensorType::get(builder.getF32Type()); + + // Otherwise, we use the given shape. + return mlir::RankedTensorType::get(shape, builder.getF32Type()); + } + + /// Build an MLIR type from a Toy AST variable type (forward to the generic + /// getType above). + mlir::Type getType(const VarType &type) { return getType(type.shape); } +}; + +} // namespace + +namespace toy { + +// The public API for codegen. 
+mlir::OwningOpRef mlirGen(mlir::MLIRContext &context, + ModuleAST &moduleAST) { + return MLIRGenImpl(context).mlirGen(moduleAST); +} + +} // namespace toy diff --git a/mlir/cuda-tile/Toy/mlir/ShapeInferencePass.cpp b/mlir/cuda-tile/Toy/mlir/ShapeInferencePass.cpp new file mode 100644 index 0000000..a552e1f --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/ShapeInferencePass.cpp @@ -0,0 +1,123 @@ +//===- ShapeInferencePass.cpp - Shape Inference ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Function level pass performing interprocedural +// propagation of array shapes through function specialization. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" +#include "toy/Dialect.h" +#include "toy/Passes.h" +#include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/raw_ostream.h" +#include + +#define DEBUG_TYPE "shape-inference" + +using namespace mlir; +using namespace toy; + +/// Include the auto-generated definitions for the shape inference interfaces. +#include "toy/ShapeInferenceOpInterfaces.cpp.inc" + +namespace { +/// The ShapeInferencePass is a pass that performs intra-procedural +/// shape inference. +/// +/// Algorithm: +/// +/// 1) Build a worklist containing all the operations that return a +/// dynamically shaped tensor: these are the operations that need shape +/// inference. 
+/// 2) Iterate on the worklist: +/// a) find an operation to process: the next ready operation in the +/// worklist has all of its arguments non-generic, +/// b) if no operation is found, break out of the loop, +/// c) remove the operation from the worklist, +/// d) infer the shape of its output from the argument types. +/// 3) If the worklist is empty, the algorithm succeeded. +/// +struct ShapeInferencePass + : public mlir::PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ShapeInferencePass) + StringRef getArgument() const override { return "toy-shape-inference"; } + + void runOnOperation() override { + auto f = getOperation(); + + // Populate the worklist with the operations that need shape inference: + // these are operations that return a dynamic shape. + llvm::SmallPtrSet opWorklist; + f.walk([&](mlir::Operation *op) { + if (returnsDynamicShape(op)) + opWorklist.insert(op); + }); + + // Iterate on the operations in the worklist until all operations have been + // inferred or no change happened (fix point). + while (!opWorklist.empty()) { + // Find the next operation ready for inference, that is an operation + // with all operands already resolved (non-generic). + auto nextop = llvm::find_if(opWorklist, allOperandsInferred); + if (nextop == opWorklist.end()) + break; + + Operation *op = *nextop; + opWorklist.erase(op); + + // Ask the operation to infer its output shapes. + LDBG() << "Inferring shape for: " << *op; + if (auto shapeOp = dyn_cast(op)) { + shapeOp.inferShapes(); + } else { + op->emitError("unable to infer shape of operation without shape " + "inference interface"); + return signalPassFailure(); + } + } + + // If the operation worklist isn't empty, this indicates a failure. 
+ if (!opWorklist.empty()) { + f.emitError("Shape inference failed, ") + << opWorklist.size() << " operations couldn't be inferred\n"; + signalPassFailure(); + } + } + + /// A utility method that returns if the given operation has all of its + /// operands inferred. + static bool allOperandsInferred(Operation *op) { + return llvm::all_of(op->getOperandTypes(), [](Type operandType) { + return llvm::isa(operandType); + }); + } + + /// A utility method that returns if the given operation has a dynamically + /// shaped result. + static bool returnsDynamicShape(Operation *op) { + return llvm::any_of(op->getResultTypes(), [](Type resultType) { + return !llvm::isa(resultType); + }); + } +}; +} // namespace + +/// Create a Shape Inference pass. +std::unique_ptr mlir::toy::createShapeInferencePass() { + return std::make_unique(); +} diff --git a/mlir/cuda-tile/Toy/mlir/ToyCombine.cpp b/mlir/cuda-tile/Toy/mlir/ToyCombine.cpp new file mode 100644 index 0000000..f8397c2 --- /dev/null +++ b/mlir/cuda-tile/Toy/mlir/ToyCombine.cpp @@ -0,0 +1,68 @@ +//===- ToyCombine.cpp - Toy High Level Optimizer --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a set of simple combiners for optimizing operations in +// the Toy dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "toy/Dialect.h" +using namespace mlir; +using namespace toy; + +namespace { +/// Include the patterns defined in the Declarative Rewrite framework. +#include "ToyCombine.inc" +} // namespace + +/// This is an example of a c++ rewrite pattern for the TransposeOp. 
It +/// optimizes the following scenario: transpose(transpose(x)) -> x +struct SimplifyRedundantTranspose : public mlir::OpRewritePattern { + /// We register this pattern to match every toy.transpose in the IR. + /// The "benefit" is used by the framework to order the patterns and process + /// them in order of profitability. + SimplifyRedundantTranspose(mlir::MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + /// This method attempts to match a pattern and rewrite it. The rewriter + /// argument is the orchestrator of the sequence of rewrites. The pattern is + /// expected to interact with it to perform any changes to the IR from here. + llvm::LogicalResult + matchAndRewrite(TransposeOp op, + mlir::PatternRewriter &rewriter) const override { + // Look through the input of the current transpose. + mlir::Value transposeInput = op.getOperand(); + TransposeOp transposeInputOp = transposeInput.getDefiningOp(); + + // Input defined by another transpose? If not, no match. + if (!transposeInputOp) + return failure(); + + // Otherwise, we have a redundant transpose. Use the rewriter. + rewriter.replaceOp(op, {transposeInputOp.getOperand()}); + return success(); + } +}; + +/// Register our patterns as "canonicalization" patterns on the TransposeOp so +/// that they can be picked up by the Canonicalization framework. +void TransposeOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add(context); +} + +/// Register our patterns as "canonicalization" patterns on the ReshapeOp so +/// that they can be picked up by the Canonicalization framework. 
void ReshapeOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                            MLIRContext *context) {
  // NOTE(review): pattern list reconstructed to match the three DRR patterns
  // defined in ToyCombine.td below — confirm against ToyCombine.inc.
  results.add<ReshapeReshapeOptPattern, RedundantReshapeOptPattern,
              FoldConstantReshapeOptPattern>(context);
}

//===- ToyCombine.td - Pattern Match Optimizations for Toy -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Defines language-specific pattern match optimizations for Toy using
// Declarative Rewrite Rules (DRR) specified using TableGen records.
//
//===----------------------------------------------------------------------===//

#ifndef TOY_COMBINE
#define TOY_COMBINE

include "mlir/IR/PatternBase.td"
include "toy/Ops.td"

/// Note: The DRR definition used for defining patterns is shown below:
///
/// class Pattern<
///    dag sourcePattern, list<dag> resultPatterns,
///    list<dag> additionalConstraints = [],
///    dag benefitsAdded = (addBenefit 0)
/// >;

//===----------------------------------------------------------------------===//
// Basic Pattern-Match and Rewrite
//===----------------------------------------------------------------------===//

// Reshape(Reshape(x)) = Reshape(x)
def ReshapeReshapeOptPattern : Pat<(ReshapeOp(ReshapeOp $arg)),
                                   (ReshapeOp $arg)>;

//===----------------------------------------------------------------------===//
// Pattern-Match and Rewrite using Native Code Call
//===----------------------------------------------------------------------===//

// Native Code Calls may be used for more complex transformations using inline
// C++ and C++ helper functions.

// Reshape(Constant(x)) = x', where x' is the constant folded to the result
// shape of the reshape.
def ReshapeConstant :
  NativeCodeCall<"$0.reshape(::llvm::cast<mlir::ShapedType>($1.getType()))">;
def FoldConstantReshapeOptPattern : Pat<
  (ReshapeOp:$res (ConstantOp $arg)),
  (ConstantOp (ReshapeConstant $arg, $res))>;

//===----------------------------------------------------------------------===//
// Pattern-Match and Rewrite with Constraints
//===----------------------------------------------------------------------===//

// DRR allows for constraint checking when the transformation is conditional
// on operand properties.

// Reshape(x) = x, where input and output shapes are identical
def TypesAreIdentical : Constraint<CPred<"$0.getType() == $1.getType()">>;
def RedundantReshapeOptPattern : Pat<
  (ReshapeOp:$res $arg), (replaceWithValue $arg),
  [(TypesAreIdentical $res, $arg)]>;

#endif // TOY_COMBINE

//===- AST.cpp - Helper for printing out the Toy AST ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AST dump for the Toy language.
+// +//===----------------------------------------------------------------------===// + +#include "toy/AST.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace toy; + +namespace { + +// RAII helper to manage increasing/decreasing the indentation as we traverse +// the AST +struct Indent { + Indent(int &level) : level(level) { ++level; } + ~Indent() { --level; } + int &level; +}; + +/// Helper class that implement the AST tree traversal and print the nodes along +/// the way. The only data member is the current indentation level. +class ASTDumper { +public: + void dump(ModuleAST *node); + +private: + void dump(const VarType &type); + void dump(VarDeclExprAST *varDecl); + void dump(ExprAST *expr); + void dump(ExprASTList *exprList); + void dump(NumberExprAST *num); + void dump(LiteralExprAST *node); + void dump(VariableExprAST *node); + void dump(ReturnExprAST *node); + void dump(BinaryExprAST *node); + void dump(CallExprAST *node); + void dump(PrintExprAST *node); + void dump(PrototypeAST *node); + void dump(FunctionAST *node); + + // Actually print spaces matching the current indentation level + void indent() { + for (int i = 0; i < curIndent; i++) + llvm::errs() << " "; + } + int curIndent = 0; +}; + +} // namespace + +/// Return a formatted string for the location of any node +template +static std::string loc(T *node) { + const auto &loc = node->loc(); + return (llvm::Twine("@") + *loc.file + ":" + llvm::Twine(loc.line) + ":" + + llvm::Twine(loc.col)) + .str(); +} + +// Helper Macro to bump the indentation level and print the leading spaces for +// the current indentations +#define INDENT() \ + Indent level_(curIndent); \ + indent(); + +/// Dispatch to a generic expressions to the appropriate subclass using RTTI +void ASTDumper::dump(ExprAST *expr) { + llvm::TypeSwitch(expr) + .Case( + [&](auto *node) { 
this->dump(node); }) + .Default([&](ExprAST *) { + // No match, fallback to a generic message + INDENT(); + llvm::errs() << "getKind() << ">\n"; + }); +} + +/// A variable declaration is printing the variable name, the type, and then +/// recurse in the initializer value. +void ASTDumper::dump(VarDeclExprAST *varDecl) { + INDENT(); + llvm::errs() << "VarDecl " << varDecl->getName(); + dump(varDecl->getType()); + llvm::errs() << " " << loc(varDecl) << "\n"; + dump(varDecl->getInitVal()); +} + +/// A "block", or a list of expression +void ASTDumper::dump(ExprASTList *exprList) { + INDENT(); + llvm::errs() << "Block {\n"; + for (auto &expr : *exprList) + dump(expr.get()); + indent(); + llvm::errs() << "} // Block\n"; +} + +/// A literal number, just print the value. +void ASTDumper::dump(NumberExprAST *num) { + INDENT(); + llvm::errs() << num->getValue() << " " << loc(num) << "\n"; +} + +/// Helper to print recursively a literal. This handles nested array like: +/// [ [ 1, 2 ], [ 3, 4 ] ] +/// We print out such array with the dimensions spelled out at every level: +/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ] +static void printLitHelper(ExprAST *litOrNum) { + // Inside a literal expression we can have either a number or another literal + if (auto *num = llvm::dyn_cast(litOrNum)) { + llvm::errs() << num->getValue(); + return; + } + auto *literal = llvm::cast(litOrNum); + + // Print the dimension for this literal first + llvm::errs() << "<"; + llvm::interleaveComma(literal->getDims(), llvm::errs()); + llvm::errs() << ">"; + + // Now print the content, recursing on every element of the list + llvm::errs() << "[ "; + llvm::interleaveComma(literal->getValues(), llvm::errs(), + [&](auto &elt) { printLitHelper(elt.get()); }); + llvm::errs() << "]"; +} + +/// Print a literal, see the recursive helper above for the implementation. 
+void ASTDumper::dump(LiteralExprAST *node) { + INDENT(); + llvm::errs() << "Literal: "; + printLitHelper(node); + llvm::errs() << " " << loc(node) << "\n"; +} + +/// Print a variable reference (just a name). +void ASTDumper::dump(VariableExprAST *node) { + INDENT(); + llvm::errs() << "var: " << node->getName() << " " << loc(node) << "\n"; +} + +/// Return statement print the return and its (optional) argument. +void ASTDumper::dump(ReturnExprAST *node) { + INDENT(); + llvm::errs() << "Return\n"; + if (node->getExpr().has_value()) + return dump(*node->getExpr()); + { + INDENT(); + llvm::errs() << "(void)\n"; + } +} + +/// Print a binary operation, first the operator, then recurse into LHS and RHS. +void ASTDumper::dump(BinaryExprAST *node) { + INDENT(); + llvm::errs() << "BinOp: " << node->getOp() << " " << loc(node) << "\n"; + dump(node->getLHS()); + dump(node->getRHS()); +} + +/// Print a call expression, first the callee name and the list of args by +/// recursing into each individual argument. +void ASTDumper::dump(CallExprAST *node) { + INDENT(); + llvm::errs() << "Call '" << node->getCallee() << "' [ " << loc(node) << "\n"; + for (auto &arg : node->getArgs()) + dump(arg.get()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print a builtin print call, first the builtin name and then the argument. +void ASTDumper::dump(PrintExprAST *node) { + INDENT(); + llvm::errs() << "Print [ " << loc(node) << "\n"; + dump(node->getArg()); + indent(); + llvm::errs() << "]\n"; +} + +/// Print type: only the shape is printed in between '<' and '>' +void ASTDumper::dump(const VarType &type) { + llvm::errs() << "<"; + llvm::interleaveComma(type.shape, llvm::errs()); + llvm::errs() << ">"; +} + +/// Print a function prototype, first the function name, and then the list of +/// parameters names. 
+void ASTDumper::dump(PrototypeAST *node) { + INDENT(); + llvm::errs() << "Proto '" << node->getName() << "' " << loc(node) << "\n"; + indent(); + llvm::errs() << "Params: ["; + llvm::interleaveComma(node->getArgs(), llvm::errs(), + [](auto &arg) { llvm::errs() << arg->getName(); }); + llvm::errs() << "]\n"; +} + +/// Print a function, first the prototype and then the body. +void ASTDumper::dump(FunctionAST *node) { + INDENT(); + llvm::errs() << "Function \n"; + dump(node->getProto()); + dump(node->getBody()); +} + +/// Print a module, actually loop over the functions and print them in sequence. +void ASTDumper::dump(ModuleAST *node) { + INDENT(); + llvm::errs() << "Module:\n"; + for (auto &f : *node) + dump(&f); +} + +namespace toy { + +// Public API +void dump(ModuleAST &module) { ASTDumper().dump(&module); } + +} // namespace toy diff --git a/mlir/cuda-tile/Toy/toyc.cpp b/mlir/cuda-tile/Toy/toyc.cpp new file mode 100644 index 0000000..be27585 --- /dev/null +++ b/mlir/cuda-tile/Toy/toyc.cpp @@ -0,0 +1,441 @@ +//===- toyc.cpp - The Toy Compiler ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the entry point for the Toy compiler. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.h" +#include "toy/AST.h" +#include "toy/Dialect.h" +#include "toy/Lexer.h" +#include "toy/MLIRGen.h" +#include "toy/Parser.h" +#include "toy/Passes.h" + +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/LLVMIR/Transforms/Passes.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Verifier.h" +#include "mlir/InitAllDialects.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/Passes.h" + +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include + +using namespace toy; +namespace cl = llvm::cl; + +static cl::opt inputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +namespace { +enum InputType { Toy, MLIR }; +} // namespace +static cl::opt inputType( + "x", cl::init(Toy), cl::desc("Decided the kind of output desired"), + cl::values(clEnumValN(Toy, "toy", "load the input file as a Toy source.")), + cl::values(clEnumValN(MLIR, "mlir", + "load the input file as 
an MLIR file"))); + +namespace { +enum Action { + None, + DumpAST, + DumpMLIR, + DumpMLIRAffine, + DumpMLIRLLVM, + DumpLLVMIR, + RunJIT, + DumpGpuIR, + DumpCudaTileIR, + DumpGpuAffine, + DumpGPULLVMIR, + RunNVGPUJIT +}; +} // namespace +static cl::opt emitAction( + "emit", cl::desc("Select the kind of output desired"), + cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")), + cl::values(clEnumValN(DumpMLIR, "mlir", "output the MLIR dump")), + cl::values(clEnumValN(DumpMLIRAffine, "mlir-affine", + "output the MLIR dump after affine lowering")), + cl::values(clEnumValN(DumpMLIRLLVM, "mlir-llvm", + "output the MLIR dump after llvm lowering")), + cl::values(clEnumValN(DumpLLVMIR, "llvm", "output the LLVM IR dump")), + cl::values( + clEnumValN(RunJIT, "jit", + "JIT the code and run it by invoking the main function")), + cl::values(clEnumValN(DumpGpuIR, "gpu-ir", + "output the GPU dialect MLIR dump")), + cl::values(clEnumValN(DumpCudaTileIR, "cuda-tile-ir", + "output the Cuda Tile dialect MLIR dump")), + cl::values(clEnumValN(DumpGpuAffine, "gpu-affine", + "output the GPU dialect MLIR dump after affine " + "lowering")), + cl::values(clEnumValN(DumpGPULLVMIR, "gpu-llvm", + "output the GPU LLVM dialect MLIR dump")), + cl::values(clEnumValN(RunNVGPUJIT, "nv-gpu-jit", + "JIT the code for NVGPU and run it by invoking the " + "main function"))); + +static cl::opt assignGrid("grid", cl::init("1,1,1"), + cl::desc("Assign the grid dimensions")); + +static cl::opt enableOpt("opt", cl::desc("Enable optimizations")); + +/// Returns a Toy AST resulting from parsing the file or a nullptr on error. 
+static std::unique_ptr +parseInputFile(llvm::StringRef filename) { + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(filename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return nullptr; + } + auto buffer = fileOrErr.get()->getBuffer(); + LexerBuffer lexer(buffer.begin(), buffer.end(), std::string(filename)); + Parser parser(lexer); + return parser.parseModule(); +} + +static int loadMLIR(mlir::MLIRContext &context, + mlir::OwningOpRef &module) { + // Handle '.toy' input to the compiler. + if (inputType != InputType::MLIR && + !llvm::StringRef(inputFilename).ends_with(".mlir")) { + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 6; + module = mlirGen(context, *moduleAST); + return !module ? 1 : 0; + } + + // Otherwise, the input is '.mlir'. + llvm::ErrorOr> fileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(inputFilename); + if (std::error_code ec = fileOrErr.getError()) { + llvm::errs() << "Could not open input file: " << ec.message() << "\n"; + return -1; + } + + // Parse the input mlir. + llvm::SourceMgr sourceMgr; + sourceMgr.AddNewSourceBuffer(std::move(*fileOrErr), llvm::SMLoc()); + module = mlir::parseSourceFile(sourceMgr, &context); + if (!module) { + llvm::errs() << "Error can't load file " << inputFilename << "\n"; + return 3; + } + return 0; +} + +static int loadAndProcessMLIR(mlir::MLIRContext &context, + mlir::OwningOpRef &module) { + if (int error = loadMLIR(context, module)) + return error; + + mlir::PassManager pm(module.get()->getName()); + // Apply any generic pass manager command line options and run the pipeline. + if (mlir::failed(mlir::applyPassManagerCLOptions(pm))) + return 4; + + // Check to see what granularity of MLIR we are compiling to. 
+ bool isLoweringToAffine = emitAction >= Action::DumpMLIRAffine; + bool isLoweringToLLVM = emitAction >= Action::DumpMLIRLLVM; + + if (enableOpt || isLoweringToAffine) { + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + } + + if (isLoweringToAffine) { + // Partially lower the toy dialect. + pm.addPass(mlir::toy::createLowerToAffinePass()); + + // Add a few cleanups post lowering. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Add optimizations if enabled. + if (enableOpt) { + optPM.addPass(mlir::affine::createLoopFusionPass()); + optPM.addPass(mlir::affine::createAffineScalarReplacementPass()); + } + } + + if (isLoweringToLLVM) { + // Finish lowering the toy IR to the LLVM dialect. + pm.addPass(mlir::toy::createLowerToLLVMPass()); + // This is necessary to have line tables emitted and basic + // debugger working. In the future we will add proper debug information + // emission directly from our frontend. + pm.addPass(mlir::LLVM::createDIScopeForLLVMFuncOpPass()); + } + + if (mlir::failed(pm.run(*module))) + return 4; + return 0; +} + +static int dumpAST() { + if (inputType == InputType::MLIR) { + llvm::errs() << "Can't dump a Toy AST when the input is MLIR\n"; + return 5; + } + + auto moduleAST = parseInputFile(inputFilename); + if (!moduleAST) + return 1; + + dump(*moduleAST); + return 0; +} + +static int dumpLLVMIR(mlir::ModuleOp module) { + // Register the translation to LLVM IR with the MLIR context. 
+ mlir::registerBuiltinDialectTranslation(*module->getContext()); + mlir::registerLLVMDialectTranslation(*module->getContext()); + + // Convert the module to LLVM IR in a new LLVM IR context. + llvm::LLVMContext llvmContext; + auto llvmModule = mlir::translateModuleToLLVMIR(module, llvmContext); + if (!llvmModule) { + llvm::errs() << "Failed to emit LLVM IR\n"; + return -1; + } + + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // Configure the LLVM Module + auto tmBuilderOrError = llvm::orc::JITTargetMachineBuilder::detectHost(); + if (!tmBuilderOrError) { + llvm::errs() << "Could not create JITTargetMachineBuilder\n"; + return -1; + } + + auto tmOrError = tmBuilderOrError->createTargetMachine(); + if (!tmOrError) { + llvm::errs() << "Could not create TargetMachine\n"; + return -1; + } + mlir::ExecutionEngine::setupTargetTripleAndDataLayout(llvmModule.get(), + tmOrError.get().get()); + + /// Optionally run an optimization pipeline over the llvm module. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + if (auto err = optPipeline(llvmModule.get())) { + llvm::errs() << "Failed to optimize LLVM IR " << err << "\n"; + return -1; + } + llvm::errs() << *llvmModule << "\n"; + return 0; +} + +static int runJit(mlir::ModuleOp module) { + // Initialize LLVM targets. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + // Register the translation from MLIR to LLVM IR, which must happen before we + // can JIT-compile. + mlir::registerBuiltinDialectTranslation(*module->getContext()); + mlir::registerLLVMDialectTranslation(*module->getContext()); + + // An optimization pipeline to use within the execution engine. + auto optPipeline = mlir::makeOptimizingTransformer( + /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0, + /*targetMachine=*/nullptr); + + // Create an MLIR execution engine. 
The execution engine eagerly JIT-compiles + // the module. + mlir::ExecutionEngineOptions engineOptions; + engineOptions.transformer = optPipeline; + auto maybeEngine = mlir::ExecutionEngine::create(module, engineOptions); + assert(maybeEngine && "failed to construct an execution engine"); + auto &engine = maybeEngine.get(); + + // Invoke the JIT-compiled function. + auto invocationResult = engine->invokePacked("main"); + if (invocationResult) { + llvm::errs() << "JIT invocation failed\n"; + return -1; + } + + return 0; +} + +static int loadAndProcessMLIRGPU(mlir::MLIRContext &context, + mlir::OwningOpRef &module) { + llvm::SmallSet gpuOperations = {"matmul", "add", "mul", + "transpose"}; + if (int error = loadMLIR(context, module)) + return error; + + mlir::PassManager pm(module.get()->getName()); + // Apply any generic pass manager command line options and run the pipeline. + if (mlir::failed(mlir::applyPassManagerCLOptions(pm))) + return 4; + + // Inline all functions into main and then delete them. + pm.addPass(mlir::createInlinerPass()); + + // Now that there is only one function, we can infer the shapes of each of + // the operations. + mlir::OpPassManager &optPM = pm.nest(); + optPM.addPass(mlir::toy::createShapeInferencePass()); + optPM.addPass(mlir::createCanonicalizerPass()); + optPM.addPass(mlir::createCSEPass()); + + // Now process the toy mlir with gpu outline pass. + optPM.addPass(mlir::toy::createGpuOutlinePass(assignGrid)); + // mlir::OpPassManager &gpuOptPM = pm.nest(); + pm.addPass(mlir::toy::createCudaTileLoweringPass()); + pm.addPass(mlir::createCSEPass()); + + // pm.addPass(mlir::toy::createLowerGpuHostToLLVMPass()); + bool isLoweringToAffine = emitAction >= Action::DumpGpuAffine; + if (isLoweringToAffine) { + pm.addPass(mlir::toy::createEmbedCudaTileBinaryPass( + "/usr/local/cuda/bin/tileiras", "sm_120")); + + // mlir::OpPassManager &gpuOptPM = pm.nest(); + // // Partially lower the toy dialect. 
+ // pm.addPass(mlir::toy::createLowerToAffinePass()); + + // // Add a few cleanups post lowering. + // mlir::OpPassManager &optPM = pm.nest(); + // optPM.addPass(mlir::createCanonicalizerPass()); + // optPM.addPass(mlir::createCSEPass()); + + // // Add optimizations if enabled. + // if (enableOpt) { + // optPM.addPass(mlir::affine::createLoopFusionPass()); + // optPM.addPass(mlir::affine::createAffineScalarReplacementPass()); + // } + } + + if (mlir::failed(pm.run(*module))) + return 4; + return 0; +} + +static int dumpGpuLLVMIR(mlir::ModuleOp module) { + // Simply dump the MLIR module at this stage. + module.dump(); + return 0; +} + +static int runGpuJit(mlir::ModuleOp module) { return 0; } + +int main(int argc, char **argv) { + // Register any command line options. + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + mlir::registerPassManagerCLOptions(); + + cl::ParseCommandLineOptions(argc, argv, "toy compiler\n"); + + if (emitAction == Action::DumpAST) + return dumpAST(); + + // If we aren't dumping the AST, then we are compiling with/to MLIR. + mlir::DialectRegistry registry; + mlir::func::registerAllExtensions(registry); + mlir::LLVM::registerInlinerInterface(registry); + + mlir::MLIRContext context(registry); + // Load our Dialect in this MLIR Context. + context.getOrLoadDialect(); + + mlir::OwningOpRef module; + + if (emitAction > Action::RunJIT) { + llvm::outs() << "The GPU related actions will be used\n"; + llvm::outs() << "Grid dimensions: " << assignGrid << "\n"; + + if (int error = loadAndProcessMLIRGPU(context, module)) + return error; + + // If we aren't exporting to non-mlir, then we are done. 
+ bool isOutputingMLIR = emitAction <= Action::RunNVGPUJIT; + if (isOutputingMLIR) { + module->dump(); + return 0; + } + + if (emitAction == Action::DumpGPULLVMIR) + return dumpGpuLLVMIR(*module); + + if (emitAction == Action::RunNVGPUJIT) + return runGpuJit(*module); + + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + return -1; + } + + if (int error = loadAndProcessMLIR(context, module)) + return error; + + // If we aren't exporting to non-mlir, then we are done. + bool isOutputingMLIR = emitAction <= Action::DumpMLIRLLVM; + if (isOutputingMLIR) { + module->dump(); + return 0; + } + + // Check to see if we are compiling to LLVM IR. + if (emitAction == Action::DumpLLVMIR) + return dumpLLVMIR(*module); + + // Otherwise, we must be running the jit. + if (emitAction == Action::RunJIT) + return runJit(*module); + + llvm::errs() << "No action specified (parsing only?), use -emit=\n"; + return -1; +} diff --git a/mlir/cuda-tile/build.sh b/mlir/cuda-tile/build.sh new file mode 100644 index 0000000..0a84b3a --- /dev/null +++ b/mlir/cuda-tile/build.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +_target=${1:-'all'} + +rm -rf build +mkdir build + +_workspaceFolder=$(pwd) + +cd build + +# For non-conda users: +cmake .. 
-Wno-dev -G Ninja \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE \ + -DCMAKE_BUILD_TYPE:STRING=Debug \ + -DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc \ + -DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++ \ + -DMLIR_DIR=${_workspaceFolder}/third_party/llvm/lib/cmake/mlir \ + -DLLVM_DIR=${_workspaceFolder}/third_party/llvm/lib/cmake/llvm \ + -DCMAKE_MODULE_PATH="${_workspaceFolder}/third_party/llvm/lib/cmake/mlir;${_workspaceFolder}/third_party/llvm/lib/cmake/llvm" \ + -DMLIR_TABLEGEN_EXE=${_workspaceFolder}/third_party/llvm/bin/mlir-tblgen \ + -DCUDA_TILE_BINARY_DIR=${_workspaceFolder}/third_party/cuda-tile/build/ \ + -DCUDA_TILE_SOURCE_DIR=${_workspaceFolder}/third_party/cuda-tile + +# ninja +cmake \ + --build ${_workspaceFolder}/build \ + --config Debug --target ${_target} diff --git a/mlir/cuda-tile/build_with_conda.sh b/mlir/cuda-tile/build_with_conda.sh new file mode 100644 index 0000000..c81f22d --- /dev/null +++ b/mlir/cuda-tile/build_with_conda.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +_target=${1:-'all'} + +rm -rf build +mkdir build + +_workspaceFolder=$(pwd) + +cd build + + +cmake .. -G Ninja --no-warn-unused-cli \ + -Wno-dev \ + -DCMAKE_MODULE_PATH="/root/miniconda3/envs/mlir/lib/cmake/mlir;/root/miniconda3/envs/mlir/lib/cmake/llvm" \ + -DMLIR_TABLEGEN_EXE:FILEPATH=/root/miniconda3/envs/mlir/bin/mlir-tblgen \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE \ + -DCMAKE_BUILD_TYPE:STRING=Debug \ + -DCMAKE_C_COMPILER:FILEPATH=/usr/bin/gcc \ + -DCMAKE_CXX_COMPILER:FILEPATH=/usr/bin/g++ + +# ninja +cmake \ + --build ${_workspaceFolder}/build \ + --config Debug --target ${_target} diff --git a/mlir/cuda-tile/cuda_shim/cuda_shim.cc b/mlir/cuda-tile/cuda_shim/cuda_shim.cc new file mode 100644 index 0000000..9a38a9b --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/cuda_shim.cc @@ -0,0 +1,528 @@ +//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements C wrappers around the CUDA library for easy linking in ORC jit. +// Also adds some debugging helpers that are helpful when writing MLIR code to +// run on GPUs. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "cuda.h" +#include "cuda_bf16.h" +#include "cuda_fp16.h" +#include + +// We assume the program runs on the linux platform if not on Windows. +// Copy from +// third_party/llvm-project/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp + +// #if CUDA_VERSION >= 13000 + +#define MLIR_CUDA_WRAPPERS_EXPORT __attribute__((visibility("default"))) + +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ + }(expr) + +thread_local static int32_t defaultDevice = 0; + +/// Helper method that checks environment value for debugging. +static bool isDebugEnabled() { + const char *kDebugEnvironmentVariable = "MLIR_CUDA_DEBUG"; + static bool isEnabled = getenv(kDebugEnvironmentVariable) != nullptr; + return isEnabled; +} + +#define debug_print(fmt, ...) \ + do { \ + if (isDebugEnabled()) \ + fprintf(stderr, "%s:%d:%s(): " fmt, "CudaRuntimeWrappers.cpp", __LINE__, \ + __func__, __VA_ARGS__); \ + } while (0) + +// Returns default CUdevice +static CUdevice getDefaultCuDevice() { + CUdevice device; + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + return device; +} + +// Make the primary context of the current default device current for the +// duration +// of the instance and restore the previous context on destruction. 
+class ScopedContext { +public: + ScopedContext() { + // Static reference to CUDA primary context for device ordinal + // defaultDevice. + static CUcontext context = [] { + CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); + CUcontext ctx; + // Note: this does not affect the current context. + CUDA_REPORT_IF_ERROR( + cuDevicePrimaryCtxRetain(&ctx, getDefaultCuDevice())); + return ctx; + }(); + + CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); + } + + ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } +}; + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { + ScopedContext scopedContext; + CUmodule module = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data, + int optLevel) { + ScopedContext scopedContext; + CUmodule module = nullptr; + char jitErrorBuffer[4096] = {0}; + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_OPTIMIZATION_LEVEL}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast(sizeof(jitErrorBuffer)), + reinterpret_cast(optLevel)}; + + CUresult result = + cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals); + if (result) { + fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer); + CUDA_REPORT_IF_ERROR(result); + } + return module; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) { + CUDA_REPORT_IF_ERROR(cuModuleUnload(module)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction +mgpuModuleGetFunction(CUmodule module, const char *name) { + CUfunction function = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); + return function; +} + +// The wrapper uses intptr_t instead of CUDA's unsigned int to match +// the type of MLIR's index type. This avoids the need for casts in the +// generated MLIR code. 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, + intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, CUstream stream, void **params, + void **extra, size_t /*paramsCount*/) { + ScopedContext scopedContext; + if (smem > 0) { + // Avoid checking driver as it's more expensive than if statement + int32_t maxShmem = 0; + CUdevice device = getDefaultCuDevice(); + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( + &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + if (maxShmem < smem) { + fprintf(stderr, + "Requested shared memory (%dkb) is larger than maximum allowed " + "shared memory (%dkb) for this device\n", + smem, maxShmem); + } + CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( + function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); + } + debug_print("Launching kernel, grid=%ld,%ld,%ld, " + "threads: %ld, %ld, %ld, " + "smem: %dkb\n", + gridX, gridY, gridZ, blockX, blockY, blockZ, smem); + CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, + blockY, blockZ, smem, stream, params, + extra)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { + ScopedContext scopedContext; + CUstream stream = nullptr; + CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + return stream; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuStreamSynchronize(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, + CUevent event) { + CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { + ScopedContext 
scopedContext; + CUevent event = nullptr; + CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); + return event; +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) { + CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event, + CUstream stream) { + CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuMemAlloc(uint64_t sizeBytes, CUstream stream, bool isHostShared) { + ScopedContext scopedContext; + CUdeviceptr ptr = 0; + if (sizeBytes == 0) + return reinterpret_cast(ptr); + + if (isHostShared) { + CUDA_REPORT_IF_ERROR( + cuMemAllocManaged(&ptr, sizeBytes, CU_MEM_ATTACH_GLOBAL)); + return reinterpret_cast(ptr); + } + CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); + return reinterpret_cast(ptr); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr, + CUstream /*stream*/) { + CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast(ptr))); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast(dst), + reinterpret_cast(src), + sizeBytes, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuMemset16(void *dst, unsigned short value, size_t count, CUstream stream) { + CUDA_REPORT_IF_ERROR(cuMemsetD16Async(reinterpret_cast(dst), + value, count, stream)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { + defaultDevice = device; +} + +// 
===----------------------------------------------------------------------===// + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCtxSynchronize() { + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuCtxSynchronize()); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyHtoD(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyHtoD(reinterpret_cast(dst), src, sizeBytes)); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemcpyDtoH(void *dst, void *src, + size_t sizeBytes) { + CUDA_REPORT_IF_ERROR( + cuMemcpyDtoH(dst, reinterpret_cast(src), sizeBytes)); +} + +//===----------------------------------------------------------------------===// + +static inline CUdeviceptr asDevPtr(uint64_t h) { + return static_cast(h); +} +static inline uint64_t asHandle(CUdeviceptr p) { + return static_cast(p); +} + +static inline CUstream asStream(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asStreamHandle(CUstream s) { + return static_cast(reinterpret_cast(s)); +} + +static inline CUevent asEvent(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline uint64_t asEventHandle(CUevent e) { + return static_cast(reinterpret_cast(e)); +} + +static inline void *asHostPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} +static inline const void *asHostCPtr(uint64_t h) { + return reinterpret_cast(static_cast(h)); +} + +// Align up helper +static inline uint64_t alignUp(uint64_t x, uint64_t a) { + return (x + (a - 1)) & ~(a - 1); +} + +// Load module from PTX or CUBIN image in memory. +// Driver API supports cuModuleLoadDataEx for both PTX and cubin (it +// auto-detects). 
+extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr, + uint64_t image_nbytes) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoad(data, image_nbytes); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t cuda_shim_load_module_jit_from_image(uint64_t image_ptr, + uint64_t image_nbytes, + int opt_level) { + + (void)image_nbytes; + auto data = const_cast(asHostCPtr(image_ptr)); + CUmodule mod = mgpuModuleLoadJIT(data, opt_level); + return static_cast(reinterpret_cast(mod)); +} + +extern "C" uint64_t +cuda_shim_load_module_from_file(uint64_t file_path_ptr, + uint64_t /*file_path_nbytes*/) { + auto file_path_cstr = + reinterpret_cast(asHostCPtr(file_path_ptr)); + // fprintf(stdout, "%s", file_path_cstr); + CUmodule module = nullptr; + ScopedContext scopedContext; + CUDA_REPORT_IF_ERROR(cuModuleLoad(&module, file_path_cstr)); + return static_cast(reinterpret_cast(module)); +} + +extern "C" void cuda_shim_unload_module(uint64_t module_handle) { + CUmodule module = + reinterpret_cast(static_cast(module_handle)); + mgpuModuleUnload(module); +} + +extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream, + bool is_host_shared) { + CUstream cu_stream = asStream(stream); + if (stream == 0) + cu_stream = nullptr; + void *ptr = mgpuMemAlloc(nbytes, /*stream=*/cu_stream, + /*isHostShared=*/is_host_shared); + return static_cast(reinterpret_cast(ptr)); +} + +extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream) { + CUstream cu_stream = asStream(stream); + void *ptr = reinterpret_cast(static_cast(dptr)); + if (stream == 0) { + cu_stream = nullptr; + } + mgpuMemFree(ptr, /*stream=*/cu_stream); +} + +extern "C" void cuda_shim_memset32(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); + CUstream cu_stream = asStream(stream); + mgpuMemset32(ptr, value, count_dwords, cu_stream); +} + +extern "C" 
void cuda_shim_memset16(uint64_t dptr, uint32_t value, + uint64_t count_dwords, uint64_t stream) { + void *ptr = reinterpret_cast(static_cast(dptr)); + CUstream cu_stream = asStream(stream); + mgpuMemset16(ptr, value, count_dwords, cu_stream); +} + +extern "C" uint64_t cuda_shim_stream_create(void) { + CUstream stream = mgpuStreamCreate(); + return asStreamHandle(stream); +} + +extern "C" void cuda_shim_stream_destroy(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamDestroy(cu_stream); +} + +extern "C" void cuda_shim_stream_synchronize(uint64_t stream) { + CUstream cu_stream = asStream(stream); + mgpuStreamSynchronize(cu_stream); +} + +extern "C" uint64_t cuda_shim_event_create(void) { + CUevent event = mgpuEventCreate(); + return asEventHandle(event); +} + +extern "C" void cuda_shim_event_destroy(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventDestroy(event); +} + +extern "C" void cuda_shim_event_record(uint64_t ev, uint64_t stream) { + CUevent event = asEvent(ev); + CUstream cu_stream = asStream(stream); + mgpuEventRecord(event, cu_stream); +} + +extern "C" void cuda_shim_event_synchronize(uint64_t ev) { + CUevent event = asEvent(ev); + mgpuEventSynchronize(event); +} + +extern "C" void cuda_shim_stream_wait_event(uint64_t stream, uint64_t ev) { + CUstream cu_stream = asStream(stream); + CUevent event = asEvent(ev); + mgpuStreamWaitEvent(cu_stream, event); +} + +// ----------------------------- Memcpy (raw ABI) -------------------------- +// Host pointers are passed as uint64_t. This is the key of 2A. 
+ +extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_dptr); + auto src = asHostPtr(src_hptr); + mgpuMemcpyHtoD(dst, src, static_cast(nbytes)); +} + +extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr, + uint64_t nbytes) { + ScopedContext scopedContext; + auto dst = asHostPtr(dst_hptr); + auto src = asHostPtr(src_dptr); + mgpuMemcpyDtoH(dst, src, static_cast(nbytes)); +} + +// ----------------------------- Kernel launch ----------------------------- +// The hardest part is kernelParams (void**). +// We avoid building it in MLIR. Instead MLIR passes: +// - arg_data_ptr: host pointer to a packed buffer containing raw argument bytes +// - arg_sizes_ptr: host pointer to uint64_t[num_args], each is the byte-size of +// that argument The shim constructs kernelParams[i] = &arg_data[offset_i] with +// 8-byte alignment. This matches typical ABI expectations for scalar/pointer +// args. If you have special alignment requirements, extend this (e.g., per-arg +// alignment array). 
+ +extern "C" void cuda_shim_launch_packed( + uint64_t module_handle, uint64_t kernel_name_ptr, uint32_t gridX, + uint32_t gridY, uint32_t gridZ, uint32_t blockX, uint32_t blockY, + uint32_t blockZ, uint32_t sharedMemBytes, uint64_t stream, + uint64_t arg_data_ptr, uint64_t arg_sizes_ptr, uint32_t num_args) { + + auto mh = reinterpret_cast(static_cast(module_handle)); + if (!mh) { + fprintf(stderr, "[cuda_shim] launch_packed: invalid module handle\n"); + abort(); + } + + const char *kname = + reinterpret_cast(asHostCPtr(kernel_name_ptr)); + if (!kname) { + fprintf(stderr, "[cuda_shim] launch_packed: null kernel name\n"); + abort(); + } + + CUfunction fn = mgpuModuleGetFunction(mh, kname); + + auto *argData = reinterpret_cast(asHostPtr(arg_data_ptr)); + auto *argSizes = + reinterpret_cast(asHostCPtr(arg_sizes_ptr)); + + if (num_args > 0 && (!argData || !argSizes)) { + fprintf(stderr, "[cuda_shim] launch_packed: argData/argSizes null\n"); + abort(); + } + + // Build kernelParams array on heap (safe for large num_args). + std::vector params; + params.resize(num_args); + + uint64_t off = 0; + for (uint32_t i = 0; i < num_args; ++i) { + // 8-byte align each argument start (common safe default). 
+ off = alignUp(off, 8); + params[i] = argData + off; + off += argSizes[i]; + } + + auto cu_stream = asStream(stream); + + if (stream == 0) { + cu_stream = nullptr; + } + + mgpuLaunchKernel(fn, static_cast(gridX), + static_cast(gridY), static_cast(gridZ), + static_cast(blockX), static_cast(blockY), + static_cast(blockZ), + static_cast(sharedMemBytes), cu_stream, + params.data(), nullptr, static_cast(num_args)); +} + +// Convenience: 1D launch, shared=0, stream optional +extern "C" void +cuda_shim_launch_block_packed(uint64_t module_handle, uint64_t kernel_name_ptr, + uint32_t blockX, uint32_t blockY, uint32_t blockZ, + uint64_t stream, uint64_t arg_data_ptr, + uint64_t arg_sizes_ptr, uint32_t num_args) { + cuda_shim_launch_packed(module_handle, kernel_name_ptr, 1, 1, 1, blockX, + blockY, blockZ, 0, stream, arg_data_ptr, + arg_sizes_ptr, num_args); +} + +// Optional: global sync (avoid in async pipeline; prefer event/stream sync) +extern "C" void cuda_shim_ctx_synchronize(void) { mgpuCtxSynchronize(); } + +// only for debugging +extern "C" void cuda_debug_dump_float(uint64_t dptr, int n) { + auto *p = reinterpret_cast(static_cast(dptr)); + for (uint32_t i = 0; i < n; ++i) { + fprintf(stderr, "i=%u v=%f\n", i, p[i]); + } +} + +// #endif diff --git a/mlir/cuda-tile/cuda_shim/load_ptx_main.cpp b/mlir/cuda-tile/cuda_shim/load_ptx_main.cpp new file mode 100644 index 0000000..e641762 --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/load_ptx_main.cpp @@ -0,0 +1,187 @@ +// Minimal demo showing how to load a PTX file and launch a kernel via the cuda_shim API. 
+// 1) Build PTX for Ada (RTX 4090) for the sample kernel in vector_add.cu: +// nvcc -std=c++17 -arch=sm_89 -ptx vector_add.cu -o vector_add.ptx +// 2) Build this runner together with the shim (nvcc handles the CUDA driver link flags): +// nvcc -std=c++17 load_ptx_main.cpp cuda_shim.cc -o load_ptx_demo -lcuda -lcudart +// 3) Run: ./load_ptx_demo vector_add.ptx vector_add 1048576 + +// nvcc -std=c++17 --cudart static load_ptx_main.cpp cuda_shim.cc -o load_ptx_demo -lcuda -lcudadevrt -lcudart_static -ldl -lrt -pthread +// g++-11 -std=c++17 load_ptx_main.cpp cuda_shim.cc -I/usr/local/cuda/include -L/usr/lib/x86_64-linux-gnu -lcuda -ldl -pthread -o load_ptx_demo +#include + +#include +#include +#include +#include +#include +#include +#include + +// The shim has no public header, so we redeclare the extern "C" hooks we need. +extern "C" uint64_t cuda_shim_load_module_from_image(uint64_t image_ptr, + uint64_t image_nbytes); +extern "C" uint64_t cuda_shim_load_module_from_file(uint64_t file_path_ptr, + uint64_t file_path_nbytes); +extern "C" void cuda_shim_unload_module(uint64_t module_handle); +extern "C" uint64_t cuda_shim_malloc(uint64_t nbytes, uint64_t stream, + bool is_host_shared); +extern "C" void cuda_shim_free(uint64_t dptr, uint64_t stream); +extern "C" void cuda_shim_memcpy_h2d(uint64_t dst_dptr, uint64_t src_hptr, + uint64_t nbytes); +extern "C" void cuda_shim_memcpy_d2h(uint64_t dst_hptr, uint64_t src_dptr, + uint64_t nbytes); +extern "C" uint64_t cuda_shim_stream_create(void); +extern "C" void cuda_shim_stream_destroy(uint64_t stream); +extern "C" void cuda_shim_stream_synchronize(uint64_t stream); +extern "C" void cuda_shim_launch_packed(uint64_t module_handle, + uint64_t kernel_name_ptr, + uint32_t gridX, uint32_t gridY, + uint32_t gridZ, uint32_t blockX, + uint32_t blockY, uint32_t blockZ, + uint32_t sharedMemBytes, + uint64_t stream, + uint64_t arg_data_ptr, + uint64_t arg_sizes_ptr, + uint32_t num_args); + +namespace { + +// Round up to next multiple 
of 8 to match cuda_shim_launch_packed's alignment. +size_t align8(size_t value) { return (value + 7) & ~static_cast(7); } + +// Load an entire file into a byte buffer. +bool loadFile(const std::string &path, std::vector &buffer) { + std::ifstream file(path, std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open PTX file: " << path << "\n"; + return false; + } + file.seekg(0, std::ios::end); + const auto size = static_cast(file.tellg()); + file.seekg(0, std::ios::beg); + buffer.resize(size); + file.read(buffer.data(), buffer.size()); + return true; +} + +// Append a trivially copyable argument into the packed arg buffer. +template +void appendArg(std::vector &argData, std::vector &argSizes, + const T &value) { + const size_t aligned = align8(argData.size()); + if (aligned > argData.size()) { + argData.resize(aligned, 0); + } + const uint8_t *ptr = reinterpret_cast(&value); + argData.insert(argData.end(), ptr, ptr + sizeof(T)); + argSizes.push_back(static_cast(sizeof(T))); +} + +} // namespace + +int main(int argc, char **argv) { + if (argc < 2) { + std::cerr + << "Usage: " << argv[0] + << " [kernel_name=vector_add] [num_elements=1048576]\n"; + return 1; + } + + const std::string ptxPath = argv[1]; + const std::string kernelName = (argc >= 3) ? argv[2] : std::string("vector_add"); + const int numElems = (argc >= 4) ? std::atoi(argv[3]) : (1 << 20); + const size_t numBytes = static_cast(numElems) * sizeof(float); + + // std::vector ptx; + // if (!loadFile(ptxPath, ptx)) { + // return 1; + // } + + // Load module from the PTX blob. + const uint64_t module_handle_for_launch = + cuda_shim_load_module_from_file( + reinterpret_cast(ptxPath.data()), + static_cast(ptxPath.size())); + if (module_handle_for_launch == 0) { + std::cerr << "Failed to load module from PTX: " << ptxPath << "\n"; + return 1; + } + + // cuda_shim_launch_packed expects a pointer to a CUmodule stored in host + // memory. Keep a stack copy and pass its address to satisfy that ABI. 
+ // CUmodule module = reinterpret_cast(module_handle_raw); + // const uint64_t module_handle_for_launch = + // reinterpret_cast(&module); + + const uint64_t stream = cuda_shim_stream_create(); + + // Allocate device buffers. + const uint64_t dOut = cuda_shim_malloc(numBytes, stream, /*is_host_shared=*/false); + const uint64_t dA = cuda_shim_malloc(numBytes, stream, /*is_host_shared=*/false); + const uint64_t dB = cuda_shim_malloc(numBytes, stream, /*is_host_shared=*/false); + + std::vector hA(numElems); + std::vector hB(numElems); + std::vector hOut(numElems, 0.0f); + for (int i = 0; i < numElems; ++i) { + hA[i] = static_cast(i) * 0.5f; + hB[i] = static_cast(i) * 1.5f; + } + + cuda_shim_memcpy_h2d(dA, reinterpret_cast(hA.data()), numBytes); + cuda_shim_memcpy_h2d(dB, reinterpret_cast(hB.data()), numBytes); + + // Pack kernel arguments: (float* out, const float* a, const float* b, int n) + std::vector argData; + std::vector argSizes; + const uint64_t argOut = dOut; + const uint64_t argA = dA; + const uint64_t argB = dB; + const int argN = numElems; + + appendArg(argData, argSizes, argA); + appendArg(argData, argSizes, argB); + appendArg(argData, argSizes, argOut); + appendArg(argData, argSizes, argN); + + const uint32_t blockX = 256; + const uint32_t gridX = static_cast((numElems + blockX - 1) / blockX); + + cuda_shim_launch_packed( + module_handle_for_launch, + reinterpret_cast(kernelName.c_str()), + gridX, 1, 1, + blockX, 1, 1, + /*sharedMemBytes=*/0, + stream, + reinterpret_cast(argData.data()), + reinterpret_cast(argSizes.data()), + static_cast(argSizes.size())); + + cuda_shim_stream_synchronize(stream); + + cuda_shim_memcpy_d2h(reinterpret_cast(hOut.data()), dOut, numBytes); + + // Quick correctness check. 
+ bool ok = true; + for (int i = 0; i < numElems; ++i) { + const float expect = hA[i] + hB[i]; + if (std::abs(hOut[i] - expect) > 1e-5f) { + std::cerr << "Mismatch at index " << i << ": got " << hOut[i] + << ", expected " << expect << "\n"; + ok = false; + break; + } + } + + std::cout << (ok ? "Success" : "Failure") << " for " << numElems + << " elements" << std::endl; + + cuda_shim_free(dOut, stream); + cuda_shim_free(dA, stream); + cuda_shim_free(dB, stream); + cuda_shim_stream_destroy(stream); + cuda_shim_unload_module(module_handle_for_launch); + + return ok ? 0 : 1; +} diff --git a/mlir/cuda-tile/cuda_shim/outlined_gpu_kernel.cu b/mlir/cuda-tile/cuda_shim/outlined_gpu_kernel.cu new file mode 100644 index 0000000..08f9e0f --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/outlined_gpu_kernel.cu @@ -0,0 +1,21 @@ +// please run inside `nvidia/cuda:12.4.1-devel-ubuntu22.04` container if you +// want to use the 4090 RTX GPU with 12.4<= cuda <= 13.0. +// pelase compile with the command: +// nvcc -std=c++17 -arch=sm_89 -cubin outlined_gpu_kernel.cu -o cuda_tile.cubin +#include +#include + +extern "C" __global__ void outlined_gpu_kernel_0(const float* a0, const float* a1, + const float* a2, float* out) { + // 2x4 = 8 elements, row-major with stride (4,1) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= 8) return; + + // out[tid] = a0[tid]*a1[tid] + a2[tid]*a1[tid] + float x0 = a0[tid]; + float x1 = a1[tid]; + float x2 = a2[tid]; + out[tid] = x0 * x1 + x2 * x1; + // 等价:out[tid] = (x0 + x2) * x1; +} +// A = [[1,2,3,9],[4,5,6,10]] B = [[11,12,13,114],[15,16,17,18]] diff --git a/mlir/cuda-tile/cuda_shim/vector_add.cu b/mlir/cuda-tile/cuda_shim/vector_add.cu new file mode 100644 index 0000000..7973dd2 --- /dev/null +++ b/mlir/cuda-tile/cuda_shim/vector_add.cu @@ -0,0 +1,9 @@ +// Simple vector add kernel for PTX generation targeting Ada (RTX 4090). 
+// nvcc -std=c++17 -arch=sm_89 -ptx vector_add.cu -o vector_add.ptx +extern "C" __global__ void vector_add(const float *a, const float *b, + float *out, int n) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + out[idx] = a[idx] + b[idx]; + } +} diff --git a/mlir/cuda-tile/explore/example-nvvm.mlir b/mlir/cuda-tile/explore/example-nvvm.mlir new file mode 100644 index 0000000..39b9883 --- /dev/null +++ b/mlir/cuda-tile/explore/example-nvvm.mlir @@ -0,0 +1,60 @@ +module attributes {gpu.container_module} { + llvm.func @free(!llvm.ptr) + llvm.func @malloc(i64) -> !llvm.ptr + gpu.binary @kernels [#gpu.object<#nvvm.target, properties = {ISAToBinaryTimeInMs = 8 : i64, LLVMIRToISATimeInMs = 6 : i64}, "P\EDU\BA\01\00\10\00`\18\00\00\00\00\00\00\02\00\01\01h\00\00\00@\16\00\00\00\00\00\00\00\00\00\00@\00\00\00\08\00\01\00x\00\00\00\00\00\00\00\00\00\00\00\11\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00H\00\00\00\1C\00\00\00\00\02\09\00\00\02\02\01\00\03\07\01\01\02\03\00\00\04\0B\08\00P\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\01A\08\00\00\00\00\00\00\00\02\00\BE\00\01\00\00\00\00\00\00\00\00\00\00\00(\15\00\00\00\00\00\00\A8\0F\00\00\00\00\00\00\02x\00\06@\008\00\05\00@\00\16\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.note.nv.tkinfo\00.note.nv.cuinfo\00.nv.info\00.nv.compat\00.text.kernel\00.nv.info.kernel\00.nv.shared.kernel\00.nv.shared.reserved.0\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.constant0.kernel\00.nv.capmerc.text.kernel\00.nv.merc.debug_frame\00.nv.merc.nv.info\00.nv.merc.nv.info.kernel\00.nv.merc.rela.debug_frame\00.nv.merc.nv.shared.reserved.0\00.nv.merc.symtab\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.note.nv.tkinfo\00.note.nv.cuinfo\00.nv.info\00.nv.compat\00.text.kernel\00.nv.info.kernel\00.nv.shared.kernel\00.nv.reserved
Smem.offset0\00.nv.shared.reserved.0\00__nv_reservedSMEM_offset_0_alias\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00kernel\00.nv.constant0.kernel\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\03\00\05\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\009\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00]\00\00\00\03\00\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8C\00\00\00!\00\00\00@\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\BB\00\00\00 \A0\0D\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0C\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00(\01\00\00\12\10\0C\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00/\01\00\00\03\00\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\04|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF,\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\04\18\00\00\00\0C\81\80\80(\00\04 \00\00\00\00\00\00\00\0C\00\00\00\8C\00\00\00\D0\07\00\00NVIDIA Corp\00\02\00\00\00\00\00\00\00\01\00\00\00\07\00\00\006\00\00\00`\00\00\00\00ptxas\00Cuda compilation tools, release 13.1, V13.1.80\00Build cuda_13.1.r13.1/compiler.36836380_0\00-O 3 -arch sm_120 \00\00\0C\00\00\00\08\00\00\00\E8\03\00\00NVIDIA 
Corp\00\02\00x\00\83\00\00\00\04/\08\00\08\00\00\00\0A\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\02\09\00\00\02\02\01\00\03\07\01\01\02\03\00\00\04\0B\08\00P\00\00\00\00\00\00\00\047\04\00\83\00\00\00\04\17\0C\00\00\00\00\00\0A\00P\00\00\F0!\00\04\17\0C\00\00\00\00\00\09\00H\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F4!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F4!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F4!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F4!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03P\00\00\03\1B\FF\00\03_\01\01\02J\00\00\04\1C\08\00P\00\00\00\E0\00\00\00\03\19X\00\04\0A\08\00\09\00\00\00\80\03X\00\046\04\00\00\00\00\00\00\00\00\00\FF\FF\FF\FF\00\00\00\00\FE\FF\FF\FF\00\00\00\00\FD\FF\FF\FF\00\00\00\00\FC\FF\FF\FFD\00\00\00\00\00\00\00\02\00\00\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\82{\01\FF\00\DF\00\00\00\08\00\00\00\E2\0F\00\19y\02\00\00\00\00\00\00!\00\00\00\22\0E\00\ACw\04\FF\00p\00\00\00\0A\00\08\00\22\0E\001t\03\FF\00\00\00\00\FF\01\00\00\00\CA\0F\00\0C|\00\02\04\00\00\00pb\F1\0B\00\DC\1F\00M\09\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00\82{\04\FF\00\E4\00\00\00\0A\00\00\00\22\0E\00\ACw\04\FF\00k\00\00\00\0A\00\08\00n\0E\00\82{\06\FF\00\EE\00\00\00\0A\00\00\00\A2\0E\00%x\04\02\04\00\00\00\04\00\8E\07\00\CC\1F\00\81y\04\04\04\00\00\00\00\19\1E\0C\00\E2.\00%x\02\02\04\00\00\00\06\00\8E\07\00\C8O\00!r\07\04\04\00\00\00\00\00\00\00\00\CA\8F\00\86y\00\02\07\00\00\00\04\19\10\0C\00\E2\0F\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\FC\00\FC\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\
00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\
00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0C\00\00\00\01\00\00\C0\10\00\00\00(^\00\00\01\0B\04\0A\F8\00\04\00\00\00A\00\00\04\00\00\01\0B\04\0A\F8\00\04\00\00\00\81\00\01\02\00\00\02\22\08\06\FA\00R\00\00\00\03\01@\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00A\10v\0A\02\22\0E\06\F8\00R\00\00\00\03\01@\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\10\00\00\00\01\0B\0E\0A\FA\00\05\00\00\00\03\019\04\00\00\02\22\0E\06\F8\00R\00\00\00\83\01@\00\02\00\00\00\00\00\00\00\00\00\00\00\00\008\00\00\00\028\0E2\F8\00@\11\00\00\00\00\82\00\0A\00\00\02\01\C0\01\00\00\00\00\00\00\00\00\00\00\00\D0\04\FF\FF\FF\FF$\00\00\00\00\00\00\00\FF\FF\FF\FF\FF\FF\FF\FF\03\00\01|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00p\01\00\00\00\00\00\00\04\10\00\00\00\04p\00\00\00\0C\81\80\80(\00\04\E0\00\00\00\00\00\00\00\00\00\00\00\00\04/\08\00\08\00\00\00\0A\00\00\00\04\11\08\00\08\00\00\00\00\00\00\00\04\12\08\00\08\00\00\00\00\00\00\00\047\04\00\83\00\00\00\04Z \00\8A\9D\22\A4\B1\9D\14m\00\B4*\F3\F7X\03\A5',!0\C9\1E\C7\8F\0F\0CIl\0A/\00\00\04\17\0C\00\00\00\00\00\0A\00P\00\00\F0!\00\04\17\0C\00\00\00\00\00\09\00H\00\00\F0!\00\04\17\0C\00\00\00\00\00\08\00@\00\00\F0!\00\04\17\0C\00\00\00\00\00\07\008\00\00\F4!\00\04\17\0C\00\00\00\00\00\06\000\00\00\F4!\00\04\17\0C\00\00\00\00\00\05\00(\00\00\F0!\00\04\17\0C\00\00\00\00\00\04\00 \00\00\F0!\00\04\17\0C\00\00\00\00\00\03\00\18\00\00\F0!\00\04\17\0C\00\00\00\00\00\02\00\10\00\00\F4!\00\04\17\0C\00\00\00\00\00\01\00\08\00\00\F4!\00\04\17\0C\00\00\00\00\00\00\00\00\00\00\F0!\00\03P\00\00\03\1B\FF\00\03_\01\01\02J\00\00\04\1C\08\00p\00\00\00`\01\00\00\00\00\00\00D\00\00\00\00\00\00\00=\00\01\00\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\03\00\05\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\009\00\00\00\03\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00]\00\00\00\03\00\0F\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8C\00\00\00!\00\00\00\00\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\BB\00\00\00 
\A0\14\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\DC\00\00\00\03\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0C\01\00\00\03\00\0A\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00(\01\00\00\12\10\0F\00\00\00\00\00\00\00\00\00p\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\00\00\00\00\A1\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0B\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\05\02\00\00\00\00\00\00D\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\13\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00P\03\00\00\00\00\00\00\F0\00\00\00\00\00\00\00\02\00\00\00\0A\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\A2\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\04\00\00\00\00\00\00h\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\07\00\00\00\00\00\00\02\00\00\00\00\00\00\00\00\00\00\00\00\A8\04\00\00\00\00\00\00\A4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\009\00\00\00\07\00\00\00@\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00L\05\00\00\00\00\00\00 
\00\00\00\00\00\00\00\05\00\00\00\08\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00I\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00l\05\00\00\00\00\00\00$\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00R\00\00\00\86\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\90\05\00\00\00\00\00\00\1C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00j\00\00\00\00\00\00p@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\AC\05\00\00\00\00\00\00\EC\00\00\00\00\00\00\00\03\00\00\00\0C\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D2\00\00\00\01\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\98\06\00\00\00\00\00\00 \00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\C0\00\00\00\04\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B8\06\00\00\00\00\00\00\18\00\00\00\00\00\00\00\03\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00]\00\00\00\01\00\00\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\00\00\00\00\00\80\01\00\00\00\00\00\00\03\00\00\00\08\00\00\00\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\8C\00\00\00\08\00\00\00\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\08\00\00\00\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\EE\00\00\00\01\00\00\00B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\08\00\00\00\00\00\00\D8\03\00\00\00\00\00\00\00\00\00\00\0C\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\03\01\00\00\16\00\00p\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00`\0C\00\00\00\00\00\00\C6\00\00\00\00\00\00\00\15\00\00\00\08\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\1B\01\00\00\01\00\00\00\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00&\0D\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\01\00\00\83\00\00p\00\0
0\00\10\00\00\00\00\00\00\00\00\00\00\00\00\98\0D\00\00\00\00\00\00$\00\00\00\00\00\00\00\15\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00A\01\00\00\83\00\00p@\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\BC\0D\00\00\00\00\00\00\F8\00\00\00\00\00\00\00\15\00\00\00\0F\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00Y\01\00\00\82\00\00p@\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\B8\0E\00\00\00\00\00\00\18\00\00\00\00\00\00\00\15\00\00\00\10\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00s\01\00\00\15\00\00p\03\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\D0\0E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\91\01\00\00\85\00\00p\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\D0\0E\00\00\00\00\00\00\D8\00\00\00\00\00\00\00\02\00\00\00\08\00\00\00\08\00\00\00\00\00\00\00\18\00\00\00\00\00\00\00\06\00\00\00\04\00\00\00(\15\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\18\01\00\00\00\00\00\00\18\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\04\00\00\00(\15\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\18\01\00\00\00\00\00\00\18\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\00\07\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\80\01\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\06\00\00\00\80\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00@\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\04\00\00\00\80\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\03\00\00\00\00\00\00\D8\03\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01P\00\00\00h\01\00\00\00\00\00\00d\01\00\00@\00\00\00\07\00\08\00x\00\00\00\00\00\00\00\00\00\00\00\11\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\91\03\00\00\00\00\00\00H\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00(\B5/\FD`\91\02\D5\0A\00\A6
\D18\22Pk\F3\D86\03\EC\F9E\03\AF-\B3\B4\09\B9I\0F\E0(\8C\15\AD\B6Fm\EC\A1\C4\A3j0RG-\00/\00/\00\BD\02\D2\A3\F7\05v\F8\1Ay\B3\CB'\D2\C8\1A\F6\87H\1D\A5\FE\10\BE^e)\E7\A0\\\1D\FAW{\9DNp\F4\88\F4=\D2\BB\BB\01ky\DAb)f\C2\D0\7F6C\FE\BD\10;\B0\8A\22H8\0D\C3\96\A5\14GP\94\AFY\BE\E7!\A1\E6\AF\B3\93\E2\1Ck \CD\8E\F7o\06\EF\1FV31\D86i\BB\ECY\9DnK\0C\D6H\C2\B6\86\86\B1&\9C\14\A4#\0F\C7\B6\86\82\C0i\81r\B7-\12\0A\F8\09\B3\86)>\E1\00\1B\B1\DB\AA\FF\16l[\1A\AA\D2mK\A1\A3G\E0\B4\A0dV\E3\C0\87\F0mk8\0E\E3\D48\8C\0C-!\8Bi\CB)\0C\86s\07\01\1C4 \A0Bt\E6\01wj\F8\81\B3A\C0\B3~\B6\08'`yIJ\824,\19\A5\0C\9E\CB\98\18.iw\BC\CC\B2\D1*\0F;\84\0D\B9S\131o\D7\03\C2\166Wf\8B]l\1ElzLqp^Z\8F\9AR\D0\9E\90Cy\02l\0CEoN\91\0DCy\867P\C8\AC\8B\C4\04\D1\09\19\A1|\80\F2Y\C4V\E4\FB\9C\AB^\913?\98\03\83\02\00\00\00\00">] + llvm.func @main() { + %0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %1 = llvm.mlir.zero : !llvm.ptr + %2 = llvm.mlir.constant(6 : index) : i64 + %3 = llvm.mlir.constant(0 : index) : i64 + %4 = llvm.mlir.constant(1.000000e+00 : f32) : f32 + %5 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + %6 = llvm.mlir.constant(3.000000e+00 : f32) : f32 + %7 = llvm.mlir.constant(4.000000e+00 : f32) : f32 + %8 = llvm.mlir.constant(5.000000e+00 : f32) : f32 + %9 = llvm.mlir.constant(6.000000e+00 : f32) : f32 + %10 = llvm.mlir.constant(1 : index) : i64 + %11 = llvm.mlir.constant(2 : index) : i64 + %12 = llvm.getelementptr %1[6] : (!llvm.ptr) -> !llvm.ptr, f32 + %13 = llvm.ptrtoint %12 : !llvm.ptr to i64 + %14 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr + %15 = llvm.insertvalue %14, %0[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %16 = llvm.insertvalue %14, %15[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %17 = llvm.insertvalue %3, %16[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %18 = llvm.insertvalue %2, %17[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %19 = 
llvm.insertvalue %10, %18[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + %20 = builtin.unrealized_conversion_cast %19 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<6xf32> + llvm.store %4, %14 : f32, !llvm.ptr + %21 = llvm.getelementptr inbounds|nuw %14[1] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %5, %21 : f32, !llvm.ptr + %22 = llvm.getelementptr inbounds|nuw %14[2] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %6, %22 : f32, !llvm.ptr + %23 = llvm.getelementptr inbounds|nuw %14[3] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %7, %23 : f32, !llvm.ptr + %24 = llvm.getelementptr inbounds|nuw %14[4] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %8, %24 : f32, !llvm.ptr + %25 = llvm.getelementptr inbounds|nuw %14[5] : (!llvm.ptr) -> !llvm.ptr, f32 + llvm.store %9, %25 : f32, !llvm.ptr + %memref = gpu.alloc () : memref<6xf32, 1> + %26 = builtin.unrealized_conversion_cast %memref : memref<6xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %memref_0 = gpu.alloc () : memref<6xf32, 1> + %27 = builtin.unrealized_conversion_cast %memref_0 : memref<6xf32, 1> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + gpu.memcpy %memref, %20 : memref<6xf32, 1>, memref<6xf32> + %28 = llvm.extractvalue %26[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %29 = llvm.extractvalue %26[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %30 = llvm.extractvalue %26[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %31 = llvm.extractvalue %26[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %32 = llvm.extractvalue %26[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %33 = llvm.extractvalue %27[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %34 = llvm.extractvalue %27[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 
x i64>)> + %35 = llvm.extractvalue %27[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %36 = llvm.extractvalue %27[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + %37 = llvm.extractvalue %27[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<1 x i64>, array<1 x i64>)> + gpu.launch_func @kernels::@kernel blocks in (%10, %10, %10) threads in (%11, %10, %10) : i64 args(%11 : i64, %28 : !llvm.ptr<1>, %29 : !llvm.ptr<1>, %30 : i64, %31 : i64, %32 : i64, %33 : !llvm.ptr<1>, %34 : !llvm.ptr<1>, %35 : i64, %36 : i64, %37 : i64) + gpu.dealloc %memref : memref<6xf32, 1> + gpu.dealloc %memref_0 : memref<6xf32, 1> + llvm.call @free(%14) : (!llvm.ptr) -> () + llvm.return + } +} + diff --git a/mlir/cuda-tile/explore/extern_fun.mlir b/mlir/cuda-tile/explore/extern_fun.mlir new file mode 100644 index 0000000..0871803 --- /dev/null +++ b/mlir/cuda-tile/explore/extern_fun.mlir @@ -0,0 +1,113 @@ +module { + // libc + func.func private @malloc(i64) -> memref<*xi8> + func.func private @free(memref<*xi8>) + + // 轻量包装:仅用整数/布尔/opaque memref,避免 llvm.ptr 类型 + func.func private @shimMemAlloc(i64) -> i64 + func.func private @shimMemFree(i64) + func.func private @shimMemcpyHtoD(i64, memref<6xf32>) + func.func private @shimMemcpyDtoH(memref<6xf32>, i64) + func.func private @shimCtxSynchronize() + + func.func @main() { + %size_bytes = arith.constant 24 : i64 // 6 * sizeof(f32) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %f1 = arith.constant 1.0 : f32 + %f2 = arith.constant 2.0 : f32 + %f3 = arith.constant 3.0 : f32 + %f4 = arith.constant 4.0 : f32 + %f5 = arith.constant 5.0 : f32 + %f6 = arith.constant 6.0 : f32 + + // host buffer as memref + %h = memref.alloc() : memref<6xf32> + memref.store %f1, %h[%c0] : memref<6xf32> + memref.store %f2, %h[%c1] : memref<6xf32> + memref.store %f3, 
%h[%c2] : memref<6xf32> + memref.store %f4, %h[%c3] : memref<6xf32> + memref.store %f5, %h[%c4] : memref<6xf32> + memref.store %f6, %h[%c5] : memref<6xf32> + + // device alloc handle (as i64 pointer-sized integer) + %d = func.call @shimMemAlloc(%size_bytes) : (i64) -> i64 + + func.call @shimMemcpyHtoD(%d, %h) : (i64, memref<6xf32>) -> () + func.call @shimCtxSynchronize() : () -> () + func.call @shimMemcpyDtoH(%h, %d) : (memref<6xf32>, i64) -> () + func.call @shimCtxSynchronize() : () -> () + + func.call @shimMemFree(%d) : (i64) -> () + memref.dealloc %h : memref<6xf32> + func.return + } +} + +// module { +// // libc +// llvm.func @malloc(i64) -> !llvm.ptr +// llvm.func @free(!llvm.ptr) + +// // cuda_shim C 接口(来自 cuda_shim.cpp) +// llvm.func @mgpuMemAlloc(i64, !llvm.ptr, i1) -> !llvm.ptr +// llvm.func @mgpuMemFree(!llvm.ptr, !llvm.ptr) +// llvm.func @mgpuMemcpyHtoD(!llvm.ptr, !llvm.ptr, i64) +// llvm.func @mgpuMemcpyDtoH(!llvm.ptr, !llvm.ptr, i64) +// llvm.func @mgpuCtxSynchronize() + +// llvm.func @main() { +// %size = llvm.mlir.constant(24 : i64) : i64 // 6 * sizeof(f32) +// %zero_ptr = llvm.mlir.zero : !llvm.ptr // 空 stream +// %false = llvm.mlir.constant(false) : i1 + +// // host buffer +// %h = llvm.call @malloc(%size) : (i64) -> !llvm.ptr + +// // 写入 1..6 到 host +// %c0 = llvm.mlir.constant(0 : index) : i64 +// %c1 = llvm.mlir.constant(1 : index) : i64 +// %c2 = llvm.mlir.constant(2 : index) : i64 +// %c3 = llvm.mlir.constant(3 : index) : i64 +// %c4 = llvm.mlir.constant(4 : index) : i64 +// %c5 = llvm.mlir.constant(5 : index) : i64 +// %f1 = llvm.mlir.constant(1.0 : f32) : f32 +// %f2 = llvm.mlir.constant(2.0 : f32) : f32 +// %f3 = llvm.mlir.constant(3.0 : f32) : f32 +// %f4 = llvm.mlir.constant(4.0 : f32) : f32 +// %f5 = llvm.mlir.constant(5.0 : f32) : f32 +// %f6 = llvm.mlir.constant(6.0 : f32) : f32 + +// %p0 = llvm.getelementptr %h[%c0] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f1, %p0 : f32, !llvm.ptr +// %p1 = llvm.getelementptr %h[%c1] : 
(!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f2, %p1 : f32, !llvm.ptr +// %p2 = llvm.getelementptr %h[%c2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f3, %p2 : f32, !llvm.ptr +// %p3 = llvm.getelementptr %h[%c3] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f4, %p3 : f32, !llvm.ptr +// %p4 = llvm.getelementptr %h[%c4] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f5, %p4 : f32, !llvm.ptr +// %p5 = llvm.getelementptr %h[%c5] : (!llvm.ptr, i64) -> !llvm.ptr, f32 +// llvm.store %f6, %p5 : f32, !llvm.ptr + +// // device alloc (isHostShared = false) +// %d = llvm.call @mgpuMemAlloc(%size, %zero_ptr, %false) +// : (i64, !llvm.ptr, i1) -> !llvm.ptr + +// // HtoD then DtoH (round-trip) +// llvm.call @mgpuMemcpyHtoD(%d, %h, %size) : (!llvm.ptr, !llvm.ptr, i64) -> () +// llvm.call @mgpuCtxSynchronize() : () -> () +// llvm.call @mgpuMemcpyDtoH(%h, %d, %size) : (!llvm.ptr, !llvm.ptr, i64) -> () +// llvm.call @mgpuCtxSynchronize() : () -> () + +// // free +// llvm.call @mgpuMemFree(%d, %zero_ptr) : (!llvm.ptr, !llvm.ptr) -> () +// llvm.call @free(%h) : (!llvm.ptr) -> () +// llvm.return +// } +// } \ No newline at end of file diff --git a/mlir/cuda-tile/explore/gpu.mlir b/mlir/cuda-tile/explore/gpu.mlir new file mode 100644 index 0000000..bcef914 --- /dev/null +++ b/mlir/cuda-tile/explore/gpu.mlir @@ -0,0 +1,99 @@ +// module attributes {gpu.container_module} { +// // ---- Device side (GPU) ---- +// gpu.module @kernels { +// gpu.func @kernel(%n : index, %A : memref, %B : memref) +// attributes { gpu.kernel } { +// %tid = gpu.thread_id x +// %pred = arith.cmpi slt, %tid, %n : index +// scf.if %pred { +// %a = memref.load %A[%tid] : memref +// memref.store %a, %B[%tid] : memref +// } +// gpu.return +// } +// } + +// // ---- Host side (CPU) ---- +// func.func @main(%n : index, %hA : memref, %hB : memref) { +// %dA = gpu.alloc(%n) : memref +// %dB = gpu.alloc(%n) : memref +// gpu.memcpy %dA, %hA : memref, memref +// // launch kernel(blocks/threads 这里先写死成 
1D) +// %c1 = arith.constant 1 : index +// gpu.launch_func @kernels::@kernel +// blocks in (%c1, %c1, %c1) threads in (%n, %c1, %c1) +// args(%n : index, %dA : memref, %dB : memref) + +// gpu.memcpy %hB, %dB : memref, memref +// gpu.dealloc %dA : memref +// gpu.dealloc %dB : memref +// return +// } +// } + +module attributes {gpu.container_module} { + + gpu.module @kernels { + gpu.func @kernel(%n : index, %A : memref<6xf32, 1>, %B : memref<6xf32, 1>) + attributes { gpu.kernel } { + %tid = gpu.thread_id x + %pred = arith.cmpi slt, %tid, %n : index + scf.if %pred { + %a = memref.load %A[%tid] : memref<6xf32, 1> + %b = arith.addf %a, %a : f32 + memref.store %b, %B[%tid] : memref<6xf32, 1> + } + gpu.return + } + } + + func.func @main() { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 6.000000e+00 : f32 + %cst_0 = arith.constant 5.000000e+00 : f32 + %cst_1 = arith.constant 4.000000e+00 : f32 + %cst_2 = arith.constant 3.000000e+00 : f32 + %cst_3 = arith.constant 2.000000e+00 : f32 + %cst_4 = arith.constant 1.000000e+00 : f32 + + %0 = memref.alloc() : memref<6xf32> + %1 = memref.alloc() : memref<6xf32> + + affine.store %cst_4, %1[0] : memref<6xf32> + affine.store %cst_3, %1[1] : memref<6xf32> + affine.store %cst_2, %1[2] : memref<6xf32> + affine.store %cst_1, %1[3] : memref<6xf32> + affine.store %cst_0, %1[4] : memref<6xf32> + affine.store %cst, %1[5] : memref<6xf32> + + %n = arith.constant 2 : index + + %dA = gpu.alloc() : memref<6xf32, 1> + %dB = gpu.alloc() : memref<6xf32, 1> + gpu.memcpy %dA, %1 : memref<6xf32, 1>, memref<6xf32> + + // launch kernel(blocks/threads 这里先写死成 1D) + gpu.launch_func @kernels::@kernel + blocks in (%c1, %c1, %c1) threads in (%n, %c1, %c1) + args(%n : index, %dA : memref<6xf32, 1>, %dB : memref<6xf32, 1>) + + gpu.memcpy %0, %dB : memref<6xf32>, memref<6xf32, 1> + gpu.dealloc %dA : memref<6xf32, 1> + gpu.dealloc %dB : memref<6xf32, 1> + memref.dealloc %1 : memref<6xf32> + memref.dealloc %0 : 
memref<6xf32> + return + } +} +// func.func @main() { +// %c2 = arith.constant 2 : index +// %c1 = arith.constant 1 : index +// gpu.launch +// blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1) +// threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) { +// gpu.printf "Hello from %d\n", %6 : index +// gpu.terminator +// } +// return +// } \ No newline at end of file diff --git a/mlir/cuda-tile/explore/outlined.mlir b/mlir/cuda-tile/explore/outlined.mlir new file mode 100644 index 0000000..714eddf --- /dev/null +++ b/mlir/cuda-tile/explore/outlined.mlir @@ -0,0 +1,22 @@ +module { + toy.func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> + %1 = toy.constant dense<[[1.100000e+01, 1.200000e+01, 1.300000e+01], [1.400000e+01, 1.500000e+01, 1.600000e+01]]> : tensor<2x3xf32> + %2 = toy.launch_gpu @outlined_gpu_kernel_0(%1, %0) {grid = array} : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x2xf32> + toy.print %2 : tensor<2x2xf32> + %3 = toy.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00], [1.000000e+01, 1.100000e+01, 1.200000e+01]]> : tensor<2x3xf32> + %4 = toy.launch_gpu @outlined_gpu_kernel_1(%0, %3, %1) {grid = array} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + toy.print %4 : tensor<2x3xf32> + toy.return + } + toy.gpu_func @outlined_gpu_kernel_0(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<2x2xf32> { + %0 = toy.transpose(%arg0 : tensor<2x3xf32>) to tensor<3x2xf32> + %1 = toy.matmul(%arg1 : tensor<2x3xf32>, %0 : tensor<3x2xf32>) to tensor<2x2xf32> + toy.return %1 : tensor<2x2xf32> + } + toy.gpu_func @outlined_gpu_kernel_1(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xf32>) -> tensor<2x3xf32> { + %0 = toy.mul %arg0, %arg1 : tensor<2x3xf32> + %1 = toy.add %0, %arg2 : tensor<2x3xf32> + toy.return %1 : tensor<2x3xf32> + } +} diff --git a/mlir/cuda-tile/explore/run.sh b/mlir/cuda-tile/explore/run.sh new 
file mode 100644 index 0000000..4caf90b --- /dev/null +++ b/mlir/cuda-tile/explore/run.sh @@ -0,0 +1,121 @@ +export MLIR_RUNNER_UTILS=`pwd`/../third_party/llvm/lib/libmlir_runner_utils.so +export MLIR_CUDA_RUNTIME=`pwd`/../third_party/llvm/lib/libmlir_cuda_runtime.so + +# Set this to your GPU arch, e.g. sm_120 for RTX 50xx (if your toolchain supports it). +export CUDA_ARCH=${CUDA_ARCH:-sm_120} + +rm -rf example-nvvm.mlir example.ll + + +../third_party/llvm/bin/mlir-opt gpu.mlir -cse \ + -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_120 opt-level=3" \ + --reconcile-unrealized-casts -cse -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# nvvm-attach-target{chip=sm_80 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# gpu-module-to-binary, +# lower-host-to-llvm +# )" \ +# -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 opt-level=3" \ +# -reconcile-unrealized-casts \ +# -canonicalize -cse \ +# -o example-nvvm.mlir + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --convert-scf-to-cf \ +# --convert-index-to-llvm \ +# --convert-arith-to-llvm \ +# --finalize-memref-to-llvm \ +# --convert-cf-to-llvm \ +# --convert-func-to-llvm \ +# --convert-to-llvm \ +# --reconcile-unrealized-casts \ +# -canonicalize -cse \ +# -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_120 opt-level=3" \ +# -reconcile-unrealized-casts \ +# -o example-nvvm.mlir + +# --gpu-to-llvm="use-bare-pointers-for-kernels=1 intersperse-sizes-for-kernels=1" +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# nvvm-attach-target{chip=sm_89 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# convert-scf-to-cf, +# convert-index-to-llvm, +# convert-arith-to-llvm, +# convert-math-to-llvm, +# convert-func-to-llvm, +# gpu-to-llvm, +# convert-cf-to-llvm, +# finalize-memref-to-llvm, +# 
gpu-module-to-binary, +# reconcile-unrealized-casts +# )" -o example-nvvm.mlir + + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --gpu-to-llvm + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# gpu-kernel-outlining, +# nvvm-attach-target{chip=sm_80 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# gpu-module-to-binary, +# convert-scf-to-cf, +# convert-cf-to-llvm, +# lower-host-to-llvm, +# reconcile-unrealized-casts +# )" \ +# -o example-nvvm.mlir + +# ../third_party/llvm/bin/mlir-opt gpu.mlir \ +# --pass-pipeline="builtin.module( +# gpu-kernel-outlining, +# nvvm-attach-target{chip=sm_80 O=3}, +# gpu.module(convert-gpu-to-nvvm), +# gpu-module-to-binary, + +# gpu-to-llvm, + +# convert-scf-to-cf, +# convert-index-to-llvm, +# convert-arith-to-llvm, +# convert-memref-to-llvm, +# finalize-memref-to-llvm, +# convert-cf-to-llvm, +# convert-func-to-llvm, + +# reconcile-unrealized-casts +# )" \ +# -o example-nvvm.mlir + + +../third_party/llvm/bin/mlir-translate example-nvvm.mlir \ + --mlir-to-llvmir \ + -o example.ll + + # -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=${CUDA_ARCH}" + + # | ../third_party/llvm/bin/mlir-runner \ + # --shared-libs=$MLIR_CUDA_RUNTIME \ + # --shared-libs=$MLIR_RUNNER_UTILS \ + # --entry-point=_mlir_ciface_main \ + # --entry-point-result=void + +# ../third_party/llvm/bin/mlir-runner example-nvvm.mlir \ +# --entry-point-result=void \ +# --shared-libs=${MLIR_RUNNER_UTILS} \ +# --shared-libs=${MLIR_CUDA_RUNTIME} + diff --git a/mlir/cuda-tile/sample/cuda-tile.mlir b/mlir/cuda-tile/sample/cuda-tile.mlir new file mode 100644 index 0000000..72d424d --- /dev/null +++ b/mlir/cuda-tile/sample/cuda-tile.mlir @@ -0,0 +1,32 @@ +module { + toy.func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00, 9.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00, 1.000000e+01]]> : tensor<2x4xf32> + %1 = toy.constant dense<[[1.100000e+01, 1.200000e+01, 1.300000e+01, 1.400000e+01], [1.500000e+01, 
1.600000e+01, 1.700000e+01, 1.800000e+01]]> : tensor<2x4xf32> + %2 = toy.constant dense<[[7.000000e+00, 8.000000e+00, 9.000000e+00, 1.300000e+01], [1.000000e+01, 1.100000e+01, 1.200000e+01, 1.400000e+01]]> : tensor<2x4xf32> + %3 = toy.launch_gpu @outlined_gpu_kernel_0(%0, %2, %1) {grid = array} : (tensor<2x4xf32>, tensor<2x4xf32>, tensor<2x4xf32>) -> tensor<2x4xf32> + toy.print %3 : tensor<2x4xf32> + toy.return + } + cuda_tile.module @cuda_tile_module { + entry @outlined_gpu_kernel_0(%arg0: tile>, %arg1: tile>, %arg2: tile>, %arg3: tile>) { + %tview = make_tensor_view %arg0, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview = make_partition_view %tview : partition_view> + %cst_0_i32 = constant : tile + %tile, %result_token = load_view_tko weak %pview[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %tview_0 = make_tensor_view %arg1, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview_1 = make_partition_view %tview_0 : partition_view> + %tile_2, %result_token_3 = load_view_tko weak %pview_1[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %0 = mulf %tile, %tile_2 : tile<2x4xf32> + %tview_4 = make_tensor_view %arg2, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview_5 = make_partition_view %tview_4 : partition_view> + %tile_6, %result_token_7 = load_view_tko weak %pview_5[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %tile_8, %result_token_9 = load_view_tko weak %pview_1[%cst_0_i32, %cst_0_i32] : partition_view>, tile -> tile<2x4xf32>, token + %1 = mulf %tile_6, %tile_8 : tile<2x4xf32> + %2 = addf %0, %1 : tile<2x4xf32> + %tview_10 = make_tensor_view %arg3, shape = [2, 4], strides = [4, 1] : tensor_view<2x4xf32, strides=[4,1]> + %pview_11 = make_partition_view %tview_10 : partition_view> + %3 = store_view_tko weak %2, %pview_11[%cst_0_i32, %cst_0_i32] : tile<2x4xf32>, partition_view>, tile -> token + 
return + } + } +} diff --git a/mlir/cuda-tile/sample/example.toy b/mlir/cuda-tile/sample/example.toy new file mode 100644 index 0000000..724a23e --- /dev/null +++ b/mlir/cuda-tile/sample/example.toy @@ -0,0 +1,13 @@ +def main() { + # Define a variable `a` with shape <2, 3>, initialized with the literal value. + # The shape is inferred from the supplied literal. + var a = [[1, 2, 3], [4, 5, 6]]; + + # b is identical to a, the literal tensor is implicitly reshaped: defining new + # variables is the way to reshape tensors (element count must match). + var b<2, 3> = [1, 2, 3, 4, 5, 6]; + + # transpose() and print() are the only builtin, the following will transpose + # a and b and perform an element-wise multiplication before printing the result. + print(transpose(a) * transpose(b)); +} diff --git a/mlir/cuda-tile/sample/gpu-func.mlir b/mlir/cuda-tile/sample/gpu-func.mlir new file mode 100644 index 0000000..f5280bd --- /dev/null +++ b/mlir/cuda-tile/sample/gpu-func.mlir @@ -0,0 +1,199 @@ +module { + // --- CUDA shim externs (ABI: all pointers/handles are i64) --- + func.func private @cuda_shim_load_module_from_image(i64, i64) -> i64 + func.func private @cuda_shim_load_module_from_file(i64, i64) -> i64 + func.func private @cuda_shim_unload_module(i64) -> () + func.func private @cuda_shim_stream_create() -> i64 + func.func private @cuda_shim_stream_destroy(i64) -> () + func.func private @cuda_shim_stream_synchronize(i64) -> () + func.func private @cuda_shim_malloc(i64, i64, i1) -> i64 + func.func private @cuda_shim_free(i64, i64) -> () + func.func private @cuda_shim_memcpy_h2d(i64, i64, i64) -> () + func.func private @cuda_shim_memcpy_d2h(i64, i64, i64) -> () + func.func private @cuda_shim_launch_packed( + i64, i64, + i32, i32, i32, + i32, i32, i32, + i32, + i64, + i64, i64, + i32) -> () + func.func private @cuda_debug_dump_float(i64, i32) -> () + + // // --- GPU blob embedded (placeholder bytes for "cuda_tile.cubin") --- + // memref.global "private" constant @cuda_blob 
: memref<16xi8> = dense< + // [99, 117, 100, 97, 95, 116, 105, 108, 101, 46, 99, 117, 98, 105, 110, 0] + // > : memref<16xi8> + + // // --- Kernel name as a C string (NUL-terminated) --- + // // 注意:如果 driver 侧用 name 查找函数,这个字符串必须以 0 结尾。 + // memref.global "private" constant @kname : memref<22xi8> = dense<[ + // 111,117,116,108,105,110,101,100,95,103,112,117,95,107,101,114,110,101,108,95,48,0 + // ]> : memref<22xi8> + + memref.global "private" constant @cuda_blob : memref<16xi8> = + dense<"0x637564615f74696c652e637562696e00"> + + memref.global "private" constant @kname : memref<22xi8> = + dense<"0x6f75746c696e65645f6770755f6b65726e656c5f3000"> + + func.func @main() { + // ---------- Host buffers (after bufferization) ---------- + %hA = memref.alloc() : memref<2x4xf32> + %hB = memref.alloc() : memref<2x4xf32> + %hOut = memref.alloc() : memref<2x4xf32> + + // Fill constants (为了示例直接用 store 展开;真实 pipeline 通常会从 memref.global copy) + // A + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %c4 = arith.constant 4 : index + %c5 = arith.constant 5 : index + %c6 = arith.constant 6 : index + %c7 = arith.constant 7 : index + + %cf1 = arith.constant 1.0 : f32 + %cf2 = arith.constant 2.0 : f32 + %cf3 = arith.constant 3.0 : f32 + %cf9 = arith.constant 9.0 : f32 + + %cf4 = arith.constant 4.0 : f32 + %cf5 = arith.constant 5.0 : f32 + %cf6 = arith.constant 6.0 : f32 + %cf10 = arith.constant 10.0 : f32 + + %cf11 = arith.constant 11.0 : f32 + %cf12 = arith.constant 12.0 : f32 + %cf13 = arith.constant 13.0 : f32 + %cf14 = arith.constant 14.0 : f32 + %cf15 = arith.constant 15.0 : f32 + %cf16 = arith.constant 16.0 : f32 + %cf17 = arith.constant 17.0 : f32 + %cf18 = arith.constant 18.0 : f32 + + // row0 + memref.store %cf1, %hA[%c0, %c0] : memref<2x4xf32> + memref.store %cf2, %hA[%c0, %c1] : memref<2x4xf32> + memref.store %cf3, %hA[%c0, %c2] : memref<2x4xf32> + memref.store %cf9, %hA[%c0, %c3] : 
memref<2x4xf32> + + // row1 + memref.store %cf4, %hA[%c1, %c0] : memref<2x4xf32> + memref.store %cf5, %hA[%c1, %c1] : memref<2x4xf32> + memref.store %cf6, %hA[%c1, %c2] : memref<2x4xf32> + memref.store %cf10, %hA[%c1, %c3] : memref<2x4xf32> + + // B = %1 in your original (这里假设 %1 是第二个输入;你原 op 里是 (%0, %2, %1),请按你 kernel 的真实语义对齐) + memref.store %cf11, %hB[%c0, %c0] : memref<2x4xf32> + memref.store %cf12, %hB[%c0, %c1] : memref<2x4xf32> + memref.store %cf13, %hB[%c0, %c2] : memref<2x4xf32> + memref.store %cf14, %hB[%c0, %c3] : memref<2x4xf32> + memref.store %cf15, %hB[%c1, %c0] : memref<2x4xf32> + memref.store %cf16, %hB[%c1, %c1] : memref<2x4xf32> + memref.store %cf17, %hB[%c1, %c2] : memref<2x4xf32> + memref.store %cf18, %hB[%c1, %c3] : memref<2x4xf32> + + // ---------- Load module ---------- + %blob = memref.get_global @cuda_blob : memref<16xi8> + %blob_ptr_idx = memref.extract_aligned_pointer_as_index %blob : memref<16xi8> -> index + %blob_ptr_i64 = arith.index_cast %blob_ptr_idx : index to i64 + %blobSize = arith.constant 16 : i64 + %mod = func.call @cuda_shim_load_module_from_file(%blob_ptr_i64, %blobSize) : (i64, i64) -> i64 + + // kernel name pointer + %kn = memref.get_global @kname : memref<22xi8> + %kname_ptr_idx = memref.extract_aligned_pointer_as_index %kn : memref<22xi8> -> index + %kname_ptr_i64 = arith.index_cast %kname_ptr_idx : index to i64 + + // ---------- Stream + device alloc ---------- + %stream = func.call @cuda_shim_stream_create() : () -> i64 + %isHostShared = arith.constant 0 : i1 + + %nElems = arith.constant 8 : i32 + %nbytes = arith.constant 32 : i64 // 2*4*f32 = 8 * 4 = 32 bytes + + %dA = func.call @cuda_shim_malloc(%nbytes, %stream, %isHostShared) : (i64, i64, i1) -> i64 + %dB = func.call @cuda_shim_malloc(%nbytes, %stream, %isHostShared) : (i64, i64, i1) -> i64 + %dOut = func.call @cuda_shim_malloc(%nbytes, %stream, %isHostShared) : (i64, i64, i1) -> i64 + + // host ptrs (as i64) + %hAptr = memref.extract_aligned_pointer_as_index %hA : 
memref<2x4xf32> -> index + %hBptr = memref.extract_aligned_pointer_as_index %hB : memref<2x4xf32> -> index + %hOutptr = memref.extract_aligned_pointer_as_index %hOut : memref<2x4xf32> -> index + + // host memrefs -> i64 + %hA_ptr_i64 = arith.index_cast %hAptr : index to i64 + %hB_ptr_i64 = arith.index_cast %hBptr : index to i64 + %hOut_ptr_i64 = arith.index_cast %hOutptr : index to i64 + + func.call @cuda_shim_memcpy_h2d(%dA, %hA_ptr_i64, %nbytes) : (i64, i64, i64) -> () + func.call @cuda_shim_memcpy_h2d(%dB, %hB_ptr_i64, %nbytes) : (i64, i64, i64) -> () + + // ---------- Build argSlots / argSizes (方案 A) ---------- + // 这里 num_args=4: (A, B, Out, N) + // 注意:参数顺序必须和 @outlined_gpu_kernel_0 的 PTX param_0.. 一致 + %numArgs = arith.constant 4 : index + %argSlots = memref.alloc() : memref<4xi64> + %argSizes = memref.alloc() : memref<4xi64> + %c8 = arith.constant 8 : i64 + %ci4 = arith.constant 4 : i64 + + // num_args = 4 + // i=0 a0 + memref.store %c8, %argSizes[%c0] : memref<4xi64> + memref.store %dA, %argSlots[%c0] : memref<4xi64> + + // i=1 a1 + memref.store %c8, %argSizes[%c1] : memref<4xi64> + memref.store %dB, %argSlots[%c1] : memref<4xi64> + + // i=2 a2 (你需要一个 dC,对应第三个输入) + memref.store %c8, %argSizes[%c2] : memref<4xi64> + memref.store %dB, %argSlots[%c2] : memref<4xi64> + + // i=3 out + memref.store %c8, %argSizes[%c3] : memref<4xi64> + memref.store %dOut, %argSlots[%c3] : memref<4xi64> + + // pointers to argSlots/argSizes (as i64) + %argSlotsptr = memref.extract_aligned_pointer_as_index %argSlots : memref<4xi64> -> index + %argSlots_ptr_i64 = arith.index_cast %argSlotsptr : index to i64 + %argSizesptr = memref.extract_aligned_pointer_as_index %argSizes : memref<4xi64> -> index + %argSizes_ptr_i64 = arith.index_cast %argSizesptr : index to i64 + + // ---------- Launch ---------- + %gridX = arith.constant 1 : i32 + %gridY = arith.constant 1 : i32 + %gridZ = arith.constant 1 : i32 + %blockX = arith.constant 8 : i32 + %blockY = arith.constant 1 : i32 + %blockZ = 
arith.constant 1 : i32 + %shmem = arith.constant 0 : i32 + %numArgsI32 = arith.constant 4 : i32 + + func.call @cuda_shim_launch_packed( + %mod, %kname_ptr_i64, + %gridX, %gridY, %gridZ, + %blockX, %blockY, %blockZ, + %shmem, %stream, + %argSlots_ptr_i64, %argSizes_ptr_i64, %numArgsI32 + ) : (i64, i64, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32) -> () + + func.call @cuda_shim_stream_synchronize(%stream) : (i64) -> () + func.call @cuda_shim_memcpy_d2h(%hOut_ptr_i64, %dOut, %nbytes) : (i64, i64, i64) -> () + + %ci8 = arith.constant 8 : i32 + func.call @cuda_debug_dump_float(%hOut_ptr_i64, %ci8) : (i64, i32) -> () + + // ---------- Cleanup ---------- + func.call @cuda_shim_free(%dOut, %stream) : (i64, i64) -> () + func.call @cuda_shim_free(%dA, %stream) : (i64, i64) -> () + func.call @cuda_shim_free(%dB, %stream) : (i64, i64) -> () + func.call @cuda_shim_stream_destroy(%stream) : (i64) -> () + func.call @cuda_shim_unload_module(%mod) : (i64) -> () + + return + } +} diff --git a/mlir/cuda-tile/sample/gpu.mlir b/mlir/cuda-tile/sample/gpu.mlir new file mode 100644 index 0000000..1149b17 --- /dev/null +++ b/mlir/cuda-tile/sample/gpu.mlir @@ -0,0 +1,13 @@ +toy.gpu_func @my_kernel(%arg0: tensor<2x3xf32>, %arg1: tensor<3x2xf32>) -> tensor<2x2xf32> { + %2 = toy.matmul(%arg0 : tensor<2x3xf32>, %arg1 : tensor<3x2xf32>) to tensor<2x2xf32> + toy.return %2 : tensor<2x2xf32> +} + +toy.func @main() { + %1 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> + %3 = toy.constant dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf32> + %4 = toy.launch_gpu @my_kernel(%1, %3) {grid = [16, 16, 1]} + : (tensor<2x3xf32>, tensor<3x2xf32>) -> tensor<2x2xf32> + toy.print %4 : tensor<2x2xf32> + toy.return +} \ No newline at end of file diff --git a/mlir/cuda-tile/sample/lowering-llvm.sh b/mlir/cuda-tile/sample/lowering-llvm.sh new file mode 100644 
index 0000000..31275da --- /dev/null +++ b/mlir/cuda-tile/sample/lowering-llvm.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +./third_party/llvm/bin/mlir-opt sample/gpu-func.mlir \ + -canonicalize -cse \ + -convert-scf-to-cf \ + -convert-arith-to-llvm \ + -convert-math-to-llvm \ + -finalize-memref-to-llvm \ + -convert-func-to-llvm \ + -reconcile-unrealized-casts \ + -o lowered-llvm-dialect.mlir + +./third_party/llvm/bin/mlir-translate lowered-llvm-dialect.mlir --mlir-to-llvmir -o lowered.ll + +clang++ -O2 lowered.ll cuda_shim/cuda_shim.cc \ + -I/usr/local/cuda/include \ + -L/usr/lib/x86_64-linux-gnu \ + -lcuda -ldl -lpthread -o cuda_shim/a.out diff --git a/mlir/cuda-tile/sample/matmul.toy b/mlir/cuda-tile/sample/matmul.toy new file mode 100644 index 0000000..a3a7406 --- /dev/null +++ b/mlir/cuda-tile/sample/matmul.toy @@ -0,0 +1,16 @@ +def main() { + # Define a variable `a` with shape <2, 3>, initialized with the literal value. + # The shape is inferred from the supplied literal. + var a = [[1, 2, 3], [4, 5, 6]]; + + # b is identical to a, the literal tensor is implicitly reshaped: defining new + # variables is the way to reshape tensors (element count must match). + var b<2, 3> = [11, 12, 13, 14, 15, 16]; + + # transpose() and print() are the only builtin, the following will transpose + # a and b and perform an element-wise multiplication before printing the result. 
+ # print(a * b + b); + print(matmul(a, transpose(b))); + var c<2, 3> = [[7, 8, 9], [10, 11, 12]]; + print(a * c + b); +} diff --git a/mlir/cuda-tile/sample/matmul.toy.mlir b/mlir/cuda-tile/sample/matmul.toy.mlir new file mode 100644 index 0000000..5a0cd7e --- /dev/null +++ b/mlir/cuda-tile/sample/matmul.toy.mlir @@ -0,0 +1,16 @@ +toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { + %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> + %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> + %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> + toy.return %2 : tensor<*xf64> +} + +toy.func @main() { + %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> + %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> + %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> + %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> + %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> + toy.print %4 : tensor<*xf64> + toy.return +} diff --git a/mlir/cuda-tile/sample/validation.py b/mlir/cuda-tile/sample/validation.py new file mode 100644 index 0000000..0dfaa8f --- /dev/null +++ b/mlir/cuda-tile/sample/validation.py @@ -0,0 +1,68 @@ +import numpy as np + +a = np.array([2.925513671, 6.013753211, 3.436855551, 4.960371591, + 4.681581191, 5.326281361, 5.998438671, 8.386758501, + 9.752656342, 7.606886601, 8.787891511, 5.022086611, + 5.778500111, 1.956275721, 3.159872631, 4.753185801, + 4.316574111, 9.194122231, 2.881049471, 3.916830801, + 3.263586051, 4.109766771, 5.528311631, 1.539308991, + 5.302321521, 1.503789272, 2.398510571, 5.142972961, + 1.797829151, 8.899659632, 3.695569372, 8.029364891, + 1.884190321, 3.671349811, 9.706177531, 2.866185471, + 1.073066151, 2.093449371, 3.606837071, 5.202953681, + 
6.973824941, 9.923411781, 4.807603021, 6.136584111, + 9.463426811, 9.934010851, 9.836562501, 2.048052841, + 8.976619421, 4.930650072, 5.020764221, 1.101745381, + 6.731029581, 3.154495471, 2.699037381, 4.032752141, + 1.474981521, 4.788043021, 7.608503651, 6.209790112, + 5.128346451, 9.150372381, 3.007619561, 3.416143261, + 1.826820941, 7.537852171, 5.316156761, 5.950802521, + 1.518586911, 1.001132911, 2.004818211, 2.139243582, + 9.806134011, 1.351495721, 4.017863671, 4.146281171, + 1.592400301, 9.376376831, 1.980771871, 6.190763001, + 8.168091671, 2.600631121, 2.321023891, 5.794502551, + 7.501876551, 5.399619291, 2.130964981, 4.139401911, + 5.462168121, 6.526603071, 4.334487231, 2.192541331, + 3.127341681, 6.315157581, 5.794951491, 7.532202441, + 8.384152712, 1.147242581, 1.996822871, 7.156064043, + 7.863002391, 5.663554241, 4.847668712, 8.658924581, + 7.971906902, 1.862331661, 2.062700981, 5.801351942, + 9.898728741, 6.012050671, 3.713753551, 9.637241861, + 8.075792551, 5.958895871, 9.693631161, 7.412511441, + 2.318546031, 3.472298001, 6.612168271, 4.795551511, + 7.590619801, 3.205424901, 4.464052241, 8.846670731, + 8.922903701, 2.567556341, 2.973210511, 2.289122891, + 1.104611521, 1.238846451, 4.810462721, 7.478858471, + 6.446008071, 4.210157321, 9.251681432, 6.314803181, + 7.613188801, 7.390693661, 9.741470081, 6.972811172, + 6.071724461, 4.497292381, 7.952869721, 8.657016631, + 1.621084282, 7.315808571, 7.997440491, 8.171035271, + 1.666892991, 9.494730721, 8.183356871, 8.251520681, + 3.532738031, 9.830194771, 3.670915731, 6.443815381, + 5.592103051, 6.126956221, 1.812470391, 5.291468291, + 9.935006871, 1.537468861, 4.185508861, 1.559862371, + 6.173374171, 5.120825331, 9.526368891, 7.001111811, + 2.550614511, 8.532620091, 6.003902941, 6.713298151, + 9.577618433, 8.392393061, 7.069470191, 9.896941411, + 9.897729712, 6.371231981, 6.560693141, 7.791094681, + 1.016464861, 3.457840971, 4.575334621, 7.167314761, + 2.742804381, 5.896408001, 6.789680541, 6.004492881, + 
1.284152471, 5.443332191, 4.528292101, 1.636873071, + 5.688349201, 5.126366571, 2.749963801, 8.144933001, + 9.489360822, 5.326660891, 7.052969211, 8.349262171, + 4.898597061, 6.857033311, 6.328954281, 7.333406851, + 2.588288651, 5.066354511, 9.736569781, 2.309512701, + 5.342955001, 7.282803981, 4.335288321, 6.845356621, + 4.755166231, 9.859116961, 2.263494621, 3.661889861, + 7.990642211, 1.168159651, 4.607092851, 5.881411521, + 5.437404021, 3.550808271, 6.298701342, 7.403695201, + 8.327089262, 8.217354791, 2.330936821, 3.294671993, + 1.249379871, 5.940774332, 7.761280541, 2.763552581, + 6.695427572, 7.014880361, 1.449679711, 7.297992291, + 1.819770801, 8.467591604, 6.751986351, 5.163341051, + 2.313086431, 7.699937171, 4.172433101, 8.029572921, + 8.581235361, 7.306871671, 7.148568021, 5.411956301, + 2.408045811, 1.700581061, 5.210921901, 6.552318091, + 3.749144091, 1.089704561, 3.245265091, 5.736190581]).reshape(16, 16) + +print(a @ a.T) diff --git a/mlir/cuda-tile/scripts/apply_patch.sh b/mlir/cuda-tile/scripts/apply_patch.sh new file mode 100644 index 0000000..bdc4b67 --- /dev/null +++ b/mlir/cuda-tile/scripts/apply_patch.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +rm -rf Ch8 +cp -R Ch7 Ch8 +cd Ch8 +git apply ../scripts/patch/matmul.back.patch diff --git a/mlir/cuda-tile/scripts/build_cuda_tile.sh b/mlir/cuda-tile/scripts/build_cuda_tile.sh new file mode 100644 index 0000000..c74c728 --- /dev/null +++ b/mlir/cuda-tile/scripts/build_cuda_tile.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [[ -f "/usr/bin/git" ]]; then + WORKSPACEROOT=$(git rev-parse --show-toplevel)/mlir/cuda-tile || WORKSPACEROOT=`pwd` +fi + +echo "Building cuda-tile IR in ${WORKSPACEROOT}/third_party/cuda-tile" + +cd ${WORKSPACEROOT}/third_party/cuda-tile + +git checkout -q -- . 
+ +rm -rf build + +cmake -G Ninja -S ${WORKSPACEROOT}/third_party/cuda-tile -B build \ + -DCMAKE_BUILD_TYPE=Debug \ + -DLLVM_ENABLE_ASSERTIONS=OFF \ + -DCUDA_TILE_ENABLE_BINDINGS_PYTHON=OFF \ + -DCUDA_TILE_ENABLE_TESTING=OFF \ + -DCMAKE_INSTALL_PREFIX=${WORKSPACEROOT}/third_party/cuda \ + -DCUDA_TILE_USE_LLVM_INSTALL_DIR=${WORKSPACEROOT}/third_party/llvm + +cmake --build build + +cd build +cmake --install . diff --git a/mlir/cuda-tile/scripts/build_deps.sh b/mlir/cuda-tile/scripts/build_deps.sh new file mode 100644 index 0000000..ef5314e --- /dev/null +++ b/mlir/cuda-tile/scripts/build_deps.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +#if [[ $# -ne 2 ]] ; then +# echo "Usage: $0 " +# exit 1 +#fi + +if [[ -f "/usr/bin/git" ]]; then + WORKSPACEROOT=$(git rev-parse --show-toplevel)/mlir/cuda-tile || WORKSPACEROOT=`pwd` +fi + +cd ${WORKSPACEROOT} + +# LLVM source +LLVM_SRC_DIR="${1:-${WORKSPACEROOT}/third_party/llvm-project}" +build_dir="${LLVM_SRC_DIR}/build" +install_dir="${2:-${WORKSPACEROOT}/third_party/llvm}" + +if ! [ -f "$LLVM_SRC_DIR/llvm/CMakeLists.txt" ]; then + echo "Expected the path to LLVM to be set correctly (got '$LLVM_SRC_DIR'): can't find CMakeLists.txt" + exit 1 +fi +echo "Using LLVM source dir: $LLVM_SRC_DIR" + +# Setup directories. 
+echo "Building MLIR in $build_dir" +rm -rf "$build_dir" +mkdir -p "$build_dir" + +echo "Installing MLIR in $install_dir" +rm -rf ${install_dir} +mkdir -p ${install_dir} + +echo "Beginning build (commands will echo)" +set -x + +cd $LLVM_SRC_DIR + +cmake -GNinja \ + "-H llvm" \ + "-B $build_dir" \ + -DCMAKE_BUILD_TYPE=Debug \ + -DLLVM_ENABLE_PROJECTS=mlir \ + -DLLVM_TARGETS_TO_BUILD="X86;NVPTX;AMDGPU" \ + -DLLVM_ENABLE_LLD=OFF \ + -DLLVM_ENABLE_BACKTRACES=OFF \ + -DLLVM_INCLUDE_UTILS=ON \ + -DCMAKE_INSTALL_PREFIX=${install_dir} \ + -DLLVM_INSTALL_UTILS=ON \ + -DLLVM_BUILD_UTILS=ON \ + -DLLVM_INCLUDE_TOOLS=ON \ + -DLLVM_BUILD_TOOLS=ON \ + -DLLVM_BUILD_LLVM_DYLIB=ON \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DMLIR_ENABLE_CUDA_RUNNER=ON \ + -DCMAKE_C_COMPILER=clang \ + -DLLVM_LINK_LLVM_DYLIB=ON + + # -DLLVM_ENABLE_RTTI=ON \ + # -DLLVM_ENABLE_LIBEDIT=OFF \ + # -DLLVM_ENABLE_BINDINGS=OFF \ + # -DLLVM_INCLUDE_DOCS=OFF \ + # -DLLVM_INCLUDE_TESTS=ON \ + # -DLLVM_INCLUDE_BENCHMARKS=OFF \ + # -DLLVM_ENABLE_BACKTRACES=ON \ + # -DLLVM_INCLUDE_EXAMPLES=OFF \ + # -DLLVM_ENABLE_ASSERTIONS=On + # -DBUILD_SHARED_LIBS=ON \ + +# cmake --build "$build_dir" +cmake --build "$build_dir" + +pushd "$build_dir" +ninja install +popd + +# bash scripts/build_cuda_tile.sh diff --git a/mlir/cuda-tile/scripts/make_patch.sh b/mlir/cuda-tile/scripts/make_patch.sh new file mode 100644 index 0000000..d82ebdd --- /dev/null +++ b/mlir/cuda-tile/scripts/make_patch.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# Run under the workspace root dir + +diff -urN Ch7 Ch8 > scripts/patch/matmul.patch diff --git a/mlir/cuda-tile/scripts/patch/matmul.back.patch b/mlir/cuda-tile/scripts/patch/matmul.back.patch new file mode 100644 index 0000000..6632148 --- /dev/null +++ b/mlir/cuda-tile/scripts/patch/matmul.back.patch @@ -0,0 +1,373 @@ +diff -urN Ch7/CMakeLists.txt Ch8/CMakeLists.txt +--- Ch7/CMakeLists.txt 2023-12-06 04:57:18.788273480 +0000 ++++ Ch8/CMakeLists.txt 2024-10-01 13:51:09.920421616 +0000 +@@ -6,10 +6,10 @@ + + 
set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) + mlir_tablegen(ToyCombine.inc -gen-rewriters) +-add_public_tablegen_target(ToyCh7CombineIncGen) ++add_public_tablegen_target(ToyCh8CombineIncGen) + + add_executable( +- mlir-example-ch7 ++ mlir-example-ch8 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp +@@ -19,8 +19,8 @@ + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp) + +-add_dependencies(mlir-example-ch7 ToyCh7ShapeInferenceInterfaceIncGen +- ToyCh7OpsIncGen ToyCh7CombineIncGen) ++add_dependencies(mlir-example-ch8 ToyCh8ShapeInferenceInterfaceIncGen ++ ToyCh8OpsIncGen ToyCh8CombineIncGen) + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +@@ -28,7 +28,7 @@ + get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) + get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + target_link_libraries( +- mlir-example-ch7 ++ mlir-example-ch8 + PRIVATE ${dialect_libs} + ${conversion_libs} + ${extension_libs} +diff -urN Ch7/include/toy/AST.h Ch8/include/toy/AST.h +--- Ch7/include/toy/AST.h 2024-09-22 10:55:44.710339034 +0000 ++++ Ch8/include/toy/AST.h 2024-10-01 13:51:14.420421786 +0000 +@@ -20,9 +20,9 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/StringRef.h" + #include "llvm/Support/Casting.h" ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/include/toy/CMakeLists.txt Ch8/include/toy/CMakeLists.txt +--- Ch7/include/toy/CMakeLists.txt 2023-12-06 04:57:18.788273480 +0000 ++++ Ch8/include/toy/CMakeLists.txt 2024-10-01 13:51:15.848421840 +0000 +@@ -4,10 +4,10 @@ + mlir_tablegen(Ops.cpp.inc -gen-op-defs) + mlir_tablegen(Dialect.h.inc -gen-dialect-decls) + mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) +-add_public_tablegen_target(ToyCh7OpsIncGen) ++add_public_tablegen_target(ToyCh8OpsIncGen) + + # Most dialects should use add_mlir_interfaces(). 
+ set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) + mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) + mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +-add_public_tablegen_target(ToyCh7ShapeInferenceInterfaceIncGen) ++add_public_tablegen_target(ToyCh8ShapeInferenceInterfaceIncGen) +diff -urN Ch7/include/toy/Ops.td Ch8/include/toy/Ops.td +--- Ch7/include/toy/Ops.td 2024-09-22 10:55:44.710339034 +0000 ++++ Ch8/include/toy/Ops.td 2024-10-01 13:51:17.112421888 +0000 +@@ -450,4 +450,31 @@ + let hasVerifier = 1; + } + ++//===----------------------------------------------------------------------===// ++// MatMul Op ++//===----------------------------------------------------------------------===// ++ ++def MatMulOp : Toy_Op<"matmul", ++ [Pure, DeclareOpInterfaceMethods]> { ++ let summary = "matrix multiplication operation"; ++ let description = [{ ++ The "matmul" operation performs Matrix multiplication between two ++ tensors. The shapes of the tensor operands are expected to match. ++ }]; ++ ++ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); ++ let results = (outs F64Tensor); ++ ++ let assemblyFormat = [{ ++ `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results) ++ }]; ++ ++ // Allow building a MatMulOp with from the two input operands. 
++ let builders = [ ++ OpBuilder<(ins "Value":$lhs, "Value":$rhs)> ++ ]; ++ ++ let hasVerifier = 1; ++} ++ + #endif // TOY_OPS +diff -urN Ch7/include/toy/Parser.h Ch8/include/toy/Parser.h +--- Ch7/include/toy/Parser.h 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/include/toy/Parser.h 2024-10-01 13:51:18.412421937 +0000 +@@ -22,9 +22,9 @@ + #include "llvm/Support/raw_ostream.h" + + #include ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/matmul.toy Ch8/matmul.toy +--- Ch7/matmul.toy 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy 2024-10-01 13:51:11.744421685 +0000 +@@ -0,0 +1,14 @@ ++def main() { ++ # Define a variable `a` with shape <2, 3>, initialized with the literal value. ++ # The shape is inferred from the supplied literal. ++ var a = [[1, 2, 3], [4, 5, 6]]; ++ ++ # b is identical to a, the literal tensor is implicitly reshaped: defining new ++ # variables is the way to reshape tensors (element count must match). ++ var b<2, 3> = [1, 2, 3, 4, 5, 6]; ++ ++ # transpose() and print() are the only builtin, the following will transpose ++ # a and b and perform an element-wise multiplication before printing the result. 
++ # print(a * b + b); ++ print(matmul(a, transpose(b))); ++} +diff -urN Ch7/matmul.toy.mlir Ch8/matmul.toy.mlir +--- Ch7/matmul.toy.mlir 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy.mlir 2024-10-01 13:51:13.056421735 +0000 +@@ -0,0 +1,16 @@ ++toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { ++ %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> ++ %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> ++ %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> ++ toy.return %2 : tensor<*xf64> ++} ++ ++toy.func @main() { ++ %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> ++ %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> ++ %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> ++ %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> ++ %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> ++ toy.print %4 : tensor<*xf64> ++ toy.return ++} +diff -urN Ch7/mlir/Dialect.cpp Ch8/mlir/Dialect.cpp +--- Ch7/mlir/Dialect.cpp 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/mlir/Dialect.cpp 2024-10-01 13:51:19.988421996 +0000 +@@ -13,6 +13,7 @@ + + #include "toy/Dialect.h" + ++#include "mlir/Dialect/Arith/Utils/Utils.h" + #include "mlir/IR/Attributes.h" + #include "mlir/IR/Builders.h" + #include "mlir/IR/BuiltinAttributes.h" +@@ -429,7 +430,8 @@ + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+- if (inputType == resultType || llvm::isa(inputType) || ++ if (inputType == resultType || ++ llvm::isa(inputType) || + llvm::isa(resultType)) + return mlir::success(); + +@@ -497,6 +499,58 @@ + return mlir::success(); + } + ++//===----------------------------------------------------------------------===// ++// MatMulOp ++//===----------------------------------------------------------------------===// ++ ++void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, ++ mlir::Value lhs, mlir::Value rhs) { ++ state.addTypes(UnrankedTensorType::get(builder.getF64Type())); ++ state.addOperands({lhs, rhs}); ++} ++ ++/// Infer the output shape of the MatMulOp, this is required by the shape ++/// inference interface. ++void MatMulOp::inferShapes() { ++ RankedTensorType lhsType = ++ llvm::dyn_cast(getLhs().getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(getRhs().getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]}, ++ lhsType.getElementType()); ++ getResult().setType(res_type); ++} ++ ++llvm::LogicalResult MatMulOp::verify() { ++ auto lhsType = llvm::dyn_cast(getLhs().getType()); ++ auto rhsType = llvm::dyn_cast(getRhs().getType()); ++ auto resultType = llvm::dyn_cast(getType()); ++ ++ if (!lhsType || !rhsType || !resultType) ++ return mlir::success(); ++ ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ if (lhsShape.size() != 2 || rhsShape.size() != 2) { ++ return emitOpError() << "expected 2D matrix"; ++ } ++ ++ if (lhsShape[1] != rhsShape[0]) { ++ return emitOpError() << "expected dimension to match" ++ << "the shape of lhs is [" << lhsShape[0] << ", " ++ << lhsShape[1] << "] " ++ << "the shape of rhs is [" << rhsShape[0] << ", " ++ << rhsShape[1] << "] " ++ << "but the dimension " << lhsShape[1] ++ << "!=" << rhsShape[0] << '\n'; ++ } ++ ++ return mlir::success(); ++} ++ + 
//===----------------------------------------------------------------------===// + // Toy Types + //===----------------------------------------------------------------------===// +diff -urN Ch7/mlir/LowerToAffineLoops.cpp Ch8/mlir/LowerToAffineLoops.cpp +--- Ch7/mlir/LowerToAffineLoops.cpp 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/mlir/LowerToAffineLoops.cpp 2024-10-01 13:51:21.668422059 +0000 +@@ -19,6 +19,7 @@ + #include "mlir/IR/Diagnostics.h" + #include "mlir/IR/DialectRegistry.h" + #include "mlir/IR/PatternMatch.h" ++#include "mlir/IR/Value.h" + #include "mlir/IR/ValueRange.h" + #include "mlir/Support/LLVM.h" + #include "mlir/Support/TypeID.h" +@@ -31,6 +32,7 @@ + #include "mlir/Dialect/MemRef/IR/MemRef.h" + #include "mlir/Pass/Pass.h" + #include "mlir/Transforms/DialectConversion.h" ++#include "llvm/ADT/APFloat.h" + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/STLExtras.h" + #include "llvm/ADT/Sequence.h" +@@ -315,6 +317,91 @@ + } + }; + ++//===----------------------------------------------------------------------===// ++// ToyToAffine RewritePatterns: MatMul operations ++//===----------------------------------------------------------------------===// ++ ++struct MatMulOpLowering : public ConversionPattern { ++ MatMulOpLowering(MLIRContext *ctx) ++ : ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {} ++ ++ LogicalResult ++ matchAndRewrite(Operation *op, ArrayRef operands, ++ ConversionPatternRewriter &rewriter) const final { ++ auto loc = op->getLoc(); ++ ++ RankedTensorType lhsType = ++ llvm::dyn_cast(op->getOperand(0).getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(op->getOperand(1).getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ auto tensorType = ++ llvm::dyn_cast((*op->result_type_begin())); ++ ++ auto elemType = llvm::dyn_cast(tensorType.getElementType()); ++ ++ // Insert an allocation and deallocation for the result of this operation. 
++ auto memRefType = convertTensorToMemRef(tensorType); ++ auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); ++ ++ SmallVector lowerBounds(tensorType.getRank() + 1, /*Value=*/0); ++ SmallVector steps(tensorType.getRank() + 1, /*Value=*/1); ++ SmallVector upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]}; ++ ++ // add initialization of result tensor. ++ // Create a nest of affine loops to initialize the result tensor to 0. ++ affine::buildAffineLoopNest( ++ rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1}, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Create a constant float value of 0.0. ++ auto valueToStore = nestedBuilder.create( ++ loc, llvm::APFloat(0.0), elemType); ++ // Store the constant value into the allocated memory. ++ nestedBuilder.create(loc, valueToStore, alloc, ++ ivs); ++ }); ++ ++ // Create a nest of affine loops for matrix multiplication. ++ affine::buildAffineLoopNest( ++ rewriter, loc, lowerBounds, upperBounds, steps, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Extract loop induction variables. ++ Value m = ivs[0]; ++ Value k = ivs[1]; ++ Value n = ivs[2]; ++ ++ // Create an adaptor for the remapped operands of the MatMulOp. ++ toy::MatMulOpAdaptor matmulAdaptor(operands); ++ ++ // Load elements from the left-hand side and right-hand side matrices. ++ auto loadedLhs = nestedBuilder.create( ++ loc, matmulAdaptor.getLhs(), ValueRange{m, k}); ++ auto loadedRhs = nestedBuilder.create( ++ loc, matmulAdaptor.getRhs(), ValueRange{k, n}); ++ // Load elements from the result tensor from initial process above. ++ auto loadedRes = nestedBuilder.create( ++ loc, alloc, ValueRange{m, n}); ++ ++ // Perform the multiplication and addition operations. ++ auto mulop = ++ nestedBuilder.create(loc, loadedLhs, loadedRhs); ++ auto valueToStore = ++ nestedBuilder.create(loc, loadedRes, mulop); ++ ++ // Store the result back into the allocated memory. 
++ nestedBuilder.create(loc, valueToStore, alloc, ++ ValueRange{m, n}); ++ }); ++ ++ // Replace this operation with the generated alloc. ++ rewriter.replaceOp(op, alloc); ++ ++ return success(); ++ } ++}; ++ + } // namespace + + //===----------------------------------------------------------------------===// +@@ -365,8 +452,8 @@ + // the set of patterns that will lower the Toy operations. + RewritePatternSet patterns(&getContext()); + patterns.add( +- &getContext()); ++ PrintOpLowering, ReturnOpLowering, TransposeOpLowering, ++ MatMulOpLowering>(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` +diff -urN Ch7/mlir/MLIRGen.cpp Ch8/mlir/MLIRGen.cpp +--- Ch7/mlir/MLIRGen.cpp 2024-09-22 10:55:44.714339101 +0000 ++++ Ch8/mlir/MLIRGen.cpp 2024-10-01 13:51:23.564422131 +0000 +@@ -525,6 +525,14 @@ + return builder.create(location, operands[0]); + } + ++ if (callee == "matmul") { ++ if (call.getArgs().size() != 2) { ++ emitError(location, "MLIR codegen encountered an error: toy.matmul " ++ "expected 2 arguments"); ++ } ++ return builder.create(location, operands[0], operands[1]); ++ } ++ + // Otherwise this is a call to a user-defined function. Calls to + // user-defined functions are mapped to a custom call that takes the callee + // name as an attribute. 
diff --git a/mlir/cuda-tile/scripts/patch/matmul.patch b/mlir/cuda-tile/scripts/patch/matmul.patch new file mode 100644 index 0000000..d3f09d6 --- /dev/null +++ b/mlir/cuda-tile/scripts/patch/matmul.patch @@ -0,0 +1,375 @@ +diff -urN Ch7/CMakeLists.txt Ch8/CMakeLists.txt +--- Ch7/CMakeLists.txt 2025-12-29 12:11:15.106203203 +0000 ++++ Ch8/CMakeLists.txt 2025-12-29 12:11:15.110203203 +0000 +@@ -6,10 +6,10 @@ + + set(LLVM_TARGET_DEFINITIONS mlir/ToyCombine.td) + mlir_tablegen(ToyCombine.inc -gen-rewriters) +-add_public_tablegen_target(ToyCh7CombineIncGen) ++add_public_tablegen_target(ToyCh8CombineIncGen) + + add_executable( +- mlir-example-ch7 ++ mlir-example-ch8 + toyc.cpp + parser/AST.cpp + mlir/MLIRGen.cpp +@@ -19,8 +19,8 @@ + mlir/ShapeInferencePass.cpp + mlir/ToyCombine.cpp) + +-add_dependencies(mlir-example-ch7 ToyCh7ShapeInferenceInterfaceIncGen +- ToyCh7OpsIncGen ToyCh7CombineIncGen) ++add_dependencies(mlir-example-ch8 ToyCh8ShapeInferenceInterfaceIncGen ++ ToyCh8OpsIncGen ToyCh8CombineIncGen) + + include_directories(${CMAKE_CURRENT_BINARY_DIR}) + include_directories(${CMAKE_CURRENT_BINARY_DIR}/include/) +@@ -28,7 +28,7 @@ + get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) + get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + target_link_libraries( +- mlir-example-ch7 ++ mlir-example-ch8 + PRIVATE ${dialect_libs} + ${conversion_libs} + ${extension_libs} +diff -urN Ch7/include/toy/AST.h Ch8/include/toy/AST.h +--- Ch7/include/toy/AST.h 2025-12-29 12:11:15.107203203 +0000 ++++ Ch8/include/toy/AST.h 2025-12-29 12:11:15.110203203 +0000 +@@ -20,9 +20,9 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/StringRef.h" + #include "llvm/Support/Casting.h" ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/include/toy/CMakeLists.txt Ch8/include/toy/CMakeLists.txt +--- Ch7/include/toy/CMakeLists.txt 2025-12-29 12:11:15.107203203 +0000 ++++ Ch8/include/toy/CMakeLists.txt 2025-12-29 12:11:15.110203203 
+0000 +@@ -4,10 +4,10 @@ + mlir_tablegen(Ops.cpp.inc -gen-op-defs) + mlir_tablegen(Dialect.h.inc -gen-dialect-decls) + mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs) +-add_public_tablegen_target(ToyCh7OpsIncGen) ++add_public_tablegen_target(ToyCh8OpsIncGen) + + # Most dialects should use add_mlir_interfaces(). + set(LLVM_TARGET_DEFINITIONS ShapeInferenceInterface.td) + mlir_tablegen(ShapeInferenceOpInterfaces.h.inc -gen-op-interface-decls) + mlir_tablegen(ShapeInferenceOpInterfaces.cpp.inc -gen-op-interface-defs) +-add_public_tablegen_target(ToyCh7ShapeInferenceInterfaceIncGen) ++add_public_tablegen_target(ToyCh8ShapeInferenceInterfaceIncGen) +diff -urN Ch7/include/toy/Ops.td Ch8/include/toy/Ops.td +--- Ch7/include/toy/Ops.td 2025-12-29 12:11:15.108203203 +0000 ++++ Ch8/include/toy/Ops.td 2025-12-29 12:11:15.111203203 +0000 +@@ -450,4 +450,33 @@ + let hasVerifier = 1; + } + ++//===----------------------------------------------------------------------===// ++// MatMul Op ++//===----------------------------------------------------------------------===// ++ ++def MatMulOp : Toy_Op<"matmul", ++ [Pure, DeclareOpInterfaceMethods, MemoryEffectsOpInterface]> { ++ let summary = "matrix multiplication operation"; ++ let description = [{ ++ The "matmul" operation performs Matrix multiplication between two ++ tensors. The shapes of the tensor operands are expected to match. ++ }]; ++ ++ let arguments = (ins F64Tensor:$lhs, F64Tensor:$rhs); ++ let results = (outs Res, ++ MemAlloc]>:$output); ++ ++ let assemblyFormat = [{ ++ `(` $lhs `:` type($lhs) `,` $rhs `:` type($rhs) `)` attr-dict `to` type(results) ++ }]; ++ ++ // Allow building a MatMulOp with from the two input operands. 
++ let builders = [ ++ OpBuilder<(ins "Value":$lhs, "Value":$rhs)> ++ ]; ++ ++ let hasVerifier = 1; ++} ++ + #endif // TOY_OPS +diff -urN Ch7/include/toy/Parser.h Ch8/include/toy/Parser.h +--- Ch7/include/toy/Parser.h 2025-12-29 12:11:15.108203203 +0000 ++++ Ch8/include/toy/Parser.h 2025-12-29 12:11:15.111203203 +0000 +@@ -22,9 +22,9 @@ + #include "llvm/Support/raw_ostream.h" + + #include ++#include + #include + #include +-#include + + namespace toy { + +diff -urN Ch7/matmul.toy Ch8/matmul.toy +--- Ch7/matmul.toy 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy 2025-12-29 12:11:15.111203203 +0000 +@@ -0,0 +1,14 @@ ++def main() { ++ # Define a variable `a` with shape <2, 3>, initialized with the literal value. ++ # The shape is inferred from the supplied literal. ++ var a = [[1, 2, 3], [4, 5, 6]]; ++ ++ # b is identical to a, the literal tensor is implicitly reshaped: defining new ++ # variables is the way to reshape tensors (element count must match). ++ var b<2, 3> = [1, 2, 3, 4, 5, 6]; ++ ++ # transpose() and print() are the only builtin, the following will transpose ++ # a and b and perform an element-wise multiplication before printing the result. 
++ # print(a * b + b); ++ print(matmul(a, transpose(b))); ++} +diff -urN Ch7/matmul.toy.mlir Ch8/matmul.toy.mlir +--- Ch7/matmul.toy.mlir 1970-01-01 00:00:00.000000000 +0000 ++++ Ch8/matmul.toy.mlir 2025-12-29 12:11:15.111203203 +0000 +@@ -0,0 +1,16 @@ ++toy.func private @matmul_transpose(%arg0: tensor<*xf64>, %arg1: tensor<*xf64>) -> tensor<*xf64> { ++ %0 = toy.transpose(%arg0 : tensor<*xf64>) to tensor<*xf64> ++ %1 = toy.transpose(%arg1 : tensor<*xf64>) to tensor<*xf64> ++ %2 = toy.matmul(%0 : tensor<*xf64>, %1 : tensor<*xf64>) to tensor<*xf64> ++ toy.return %2 : tensor<*xf64> ++} ++ ++toy.func @main() { ++ %0 = toy.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64> ++ %1 = toy.reshape(%0 : tensor<2x3xf64>) to tensor<2x3xf64> ++ %2 = toy.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00]> : tensor<6xf64> ++ %3 = toy.reshape(%2 : tensor<6xf64>) to tensor<3x2xf64> ++ %4 = toy.generic_call @matmul_transpose(%1, %3) : (tensor<2x3xf64>, tensor<3x2xf64>) -> tensor<*xf64> ++ toy.print %4 : tensor<*xf64> ++ toy.return ++} +diff -urN Ch7/mlir/Dialect.cpp Ch8/mlir/Dialect.cpp +--- Ch7/mlir/Dialect.cpp 2025-12-29 12:11:15.108203203 +0000 ++++ Ch8/mlir/Dialect.cpp 2025-12-29 12:11:15.111203203 +0000 +@@ -13,6 +13,7 @@ + + #include "toy/Dialect.h" + ++#include "mlir/Dialect/Arith/Utils/Utils.h" + #include "mlir/IR/Attributes.h" + #include "mlir/IR/Builders.h" + #include "mlir/IR/BuiltinAttributes.h" +@@ -429,7 +430,8 @@ + auto resultType = results.front(); + + // Check that the result type of the function matches the operand type. 
+- if (inputType == resultType || llvm::isa(inputType) || ++ if (inputType == resultType || ++ llvm::isa(inputType) || + llvm::isa(resultType)) + return mlir::success(); + +@@ -497,6 +499,58 @@ + return mlir::success(); + } + ++//===----------------------------------------------------------------------===// ++// MatMulOp ++//===----------------------------------------------------------------------===// ++ ++void MatMulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, ++ mlir::Value lhs, mlir::Value rhs) { ++ state.addTypes(UnrankedTensorType::get(builder.getF64Type())); ++ state.addOperands({lhs, rhs}); ++} ++ ++/// Infer the output shape of the MatMulOp, this is required by the shape ++/// inference interface. ++void MatMulOp::inferShapes() { ++ RankedTensorType lhsType = ++ llvm::dyn_cast(getLhs().getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(getRhs().getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ RankedTensorType res_type = RankedTensorType::get({lhsShape[0], rhsShape[1]}, ++ lhsType.getElementType()); ++ getResult().setType(res_type); ++} ++ ++llvm::LogicalResult MatMulOp::verify() { ++ auto lhsType = llvm::dyn_cast(getLhs().getType()); ++ auto rhsType = llvm::dyn_cast(getRhs().getType()); ++ auto resultType = llvm::dyn_cast(getType()); ++ ++ if (!lhsType || !rhsType || !resultType) ++ return mlir::success(); ++ ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ if (lhsShape.size() != 2 || rhsShape.size() != 2) { ++ return emitOpError() << "expected 2D matrix"; ++ } ++ ++ if (lhsShape[1] != rhsShape[0]) { ++ return emitOpError() << "expected dimension to match" ++ << "the shape of lhs is [" << lhsShape[0] << ", " ++ << lhsShape[1] << "] " ++ << "the shape of rhs is [" << rhsShape[0] << ", " ++ << rhsShape[1] << "] " ++ << "but the dimension " << lhsShape[1] ++ << "!=" << rhsShape[0] << '\n'; ++ } ++ ++ return mlir::success(); ++} ++ + 
//===----------------------------------------------------------------------===// + // Toy Types + //===----------------------------------------------------------------------===// +diff -urN Ch7/mlir/LowerToAffineLoops.cpp Ch8/mlir/LowerToAffineLoops.cpp +--- Ch7/mlir/LowerToAffineLoops.cpp 2025-12-29 12:11:15.109203203 +0000 ++++ Ch8/mlir/LowerToAffineLoops.cpp 2025-12-29 12:11:15.112203203 +0000 +@@ -19,6 +19,7 @@ + #include "mlir/IR/Diagnostics.h" + #include "mlir/IR/DialectRegistry.h" + #include "mlir/IR/PatternMatch.h" ++#include "mlir/IR/Value.h" + #include "mlir/IR/ValueRange.h" + #include "mlir/Support/LLVM.h" + #include "mlir/Support/TypeID.h" +@@ -31,6 +32,7 @@ + #include "mlir/Dialect/MemRef/IR/MemRef.h" + #include "mlir/Pass/Pass.h" + #include "mlir/Transforms/DialectConversion.h" ++#include "llvm/ADT/APFloat.h" + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/STLExtras.h" + #include "llvm/ADT/Sequence.h" +@@ -315,6 +317,91 @@ + } + }; + ++//===----------------------------------------------------------------------===// ++// ToyToAffine RewritePatterns: MatMul operations ++//===----------------------------------------------------------------------===// ++ ++struct MatMulOpLowering : public ConversionPattern { ++ MatMulOpLowering(MLIRContext *ctx) ++ : ConversionPattern(toy::MatMulOp::getOperationName(), 1, ctx) {} ++ ++ LogicalResult ++ matchAndRewrite(Operation *op, ArrayRef operands, ++ ConversionPatternRewriter &rewriter) const final { ++ auto loc = op->getLoc(); ++ ++ RankedTensorType lhsType = ++ llvm::dyn_cast(op->getOperand(0).getType()); ++ RankedTensorType rhsType = ++ llvm::dyn_cast(op->getOperand(1).getType()); ++ auto lhsShape = lhsType.getShape(); ++ auto rhsShape = rhsType.getShape(); ++ ++ auto tensorType = ++ llvm::dyn_cast((*op->result_type_begin())); ++ ++ auto elemType = llvm::dyn_cast(tensorType.getElementType()); ++ ++ // Insert an allocation and deallocation for the result of this operation. 
++ auto memRefType = convertTensorToMemRef(tensorType); ++ auto alloc = insertAllocAndDealloc(memRefType, loc, rewriter); ++ ++ SmallVector lowerBounds(tensorType.getRank() + 1, /*Value=*/0); ++ SmallVector steps(tensorType.getRank() + 1, /*Value=*/1); ++ SmallVector upperBounds{lhsShape[0], rhsShape[0], rhsShape[1]}; ++ ++ // add initialization of result tensor. ++ // Create a nest of affine loops to initialize the result tensor to 0. ++ affine::buildAffineLoopNest( ++ rewriter, loc, {0, 0}, tensorType.getShape(), {1, 1}, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Create a constant float value of 0.0. ++ auto valueToStore = nestedBuilder.create( ++ loc, llvm::APFloat(0.0), elemType); ++ // Store the constant value into the allocated memory. ++ nestedBuilder.create(loc, valueToStore, alloc, ++ ivs); ++ }); ++ ++ // Create a nest of affine loops for matrix multiplication. ++ affine::buildAffineLoopNest( ++ rewriter, loc, lowerBounds, upperBounds, steps, ++ [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { ++ // Extract loop induction variables. ++ Value m = ivs[0]; ++ Value k = ivs[1]; ++ Value n = ivs[2]; ++ ++ // Create an adaptor for the remapped operands of the MatMulOp. ++ toy::MatMulOpAdaptor matmulAdaptor(operands); ++ ++ // Load elements from the left-hand side and right-hand side matrices. ++ auto loadedLhs = nestedBuilder.create( ++ loc, matmulAdaptor.getLhs(), ValueRange{m, k}); ++ auto loadedRhs = nestedBuilder.create( ++ loc, matmulAdaptor.getRhs(), ValueRange{k, n}); ++ // Load elements from the result tensor from initial process above. ++ auto loadedRes = nestedBuilder.create( ++ loc, alloc, ValueRange{m, n}); ++ ++ // Perform the multiplication and addition operations. ++ auto mulop = ++ nestedBuilder.create(loc, loadedLhs, loadedRhs); ++ auto valueToStore = ++ nestedBuilder.create(loc, loadedRes, mulop); ++ ++ // Store the result back into the allocated memory. 
++ nestedBuilder.create(loc, valueToStore, alloc, ++ ValueRange{m, n}); ++ }); ++ ++ // Replace this operation with the generated alloc. ++ rewriter.replaceOp(op, alloc); ++ ++ return success(); ++ } ++}; ++ + } // namespace + + //===----------------------------------------------------------------------===// +@@ -365,8 +452,8 @@ + // the set of patterns that will lower the Toy operations. + RewritePatternSet patterns(&getContext()); + patterns.add( +- &getContext()); ++ PrintOpLowering, ReturnOpLowering, TransposeOpLowering, ++ MatMulOpLowering>(&getContext()); + + // With the target and rewrite patterns defined, we can now attempt the + // conversion. The conversion will signal failure if any of our `illegal` +diff -urN Ch7/mlir/MLIRGen.cpp Ch8/mlir/MLIRGen.cpp +--- Ch7/mlir/MLIRGen.cpp 2025-12-29 12:11:15.109203203 +0000 ++++ Ch8/mlir/MLIRGen.cpp 2025-12-29 12:11:15.112203203 +0000 +@@ -525,6 +525,14 @@ + return builder.create(location, operands[0]); + } + ++ if (callee == "matmul") { ++ if (call.getArgs().size() != 2) { ++ emitError(location, "MLIR codegen encountered an error: toy.matmul " ++ "expected 2 arguments"); ++ } ++ return builder.create(location, operands[0], operands[1]); ++ } ++ + // Otherwise this is a call to a user-defined function. Calls to + // user-defined functions are mapped to a custom call that takes the callee + // name as an attribute. 
diff --git a/mlir/cuda-tile/scripts/sync_deps.sh b/mlir/cuda-tile/scripts/sync_deps.sh new file mode 100644 index 0000000..1a383b3 --- /dev/null +++ b/mlir/cuda-tile/scripts/sync_deps.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +mkdir -p third_party + +git clone --filter=blob:none --no-checkout https://github.com/llvm/llvm-project.git third_party/llvm-project +cd third_party/llvm-project + +git fetch --depth=1 origin cfbb4cc31215d615f605466aef0bcfb42aa9faa5 +git checkout --detach cfbb4cc31215d615f605466aef0bcfb42aa9faa5 + +cd - + +git clone https://github.com/Alwaysproblem/cuda-tile third_party/cuda-tile diff --git a/mlir/cuda-tile/scripts/update.sh b/mlir/cuda-tile/scripts/update.sh new file mode 100644 index 0000000..e6cf698 --- /dev/null +++ b/mlir/cuda-tile/scripts/update.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +WORKSPACE=`pwd` + +_llvm_branch=${1:-"release/19.x"} + +_dirs="Ch1 Ch2 Ch3 Ch4 Ch5 Ch6 Ch7" +_transform_dirs="Ch2 Ch3 Ch4" + +_example_in_llvm_project="third_party/llvm-project/mlir/examples" + +_mlir_example_dir="${_example_in_llvm_project}/toy" +_mlir_transform_dir="${_example_in_llvm_project}/transform" + +[[ -d "third_party/llvm-project" ]] || git clone -b $_llvm_branch https://github.com/llvm/llvm-project.git third_party/llvm-project + +# update the mlir Toy examples + +for dir in $_dirs; do + + pushd "$WORKSPACE/$dir" + rm -rf $(find ./ -name "*.cpp") + rm -rf $(find ./ -name "*.h") + rm -rf $(find ./ -name "*.td") + popd + + pushd "$WORKSPACE/${_mlir_example_dir}/$dir" + + for cpps in $(find ./ -name "*.cpp"); do + cp ${cpps} "$WORKSPACE/$dir/${cpps}" + done + + for hs in $(find ./ -name "*.h"); do + cp ${hs} "$WORKSPACE/$dir/${hs}" + done + + for tds in $(find ./ -name "*.td"); do + cp ${tds} "$WORKSPACE/$dir/${tds}" + done + + popd + +done + +# update the mlir transform examples + +for tdir in $_transform_dirs; do + + pushd "$WORKSPACE/transform_$tdir" + rm -rf $(find ./ -name "*.cpp") + rm -rf $(find ./ -name "*.h") + rm -rf $(find ./ -name "*.td") + popd + 
+ pushd "$WORKSPACE/${_mlir_transform_dir}/$tdir" + + for cpps in $(find ./ -name "*.cpp"); do + cp ${cpps} "$WORKSPACE/transform_$tdir/${cpps}" + # echo "cp ${cpps} $WORKSPACE/transform_$tdir/${cpps}" + done + + for hs in $(find ./ -name "*.h"); do + cp ${hs} "$WORKSPACE/transform_$tdir/${hs}" + # echo "cp ${hs} $WORKSPACE/transform_$tdir/${hs}" + done + + for tds in $(find ./ -name "*.td"); do + cp ${tds} "$WORKSPACE/transform_$tdir/${tds}" + # echo "cp ${tds} $WORKSPACE/transform_$tdir/${tds}" + done + + popd + +done diff --git a/mlir/cuda-tile/vscode/.container_zsh_history b/mlir/cuda-tile/vscode/.container_zsh_history new file mode 100644 index 0000000..b53f250 --- /dev/null +++ b/mlir/cuda-tile/vscode/.container_zsh_history @@ -0,0 +1,598 @@ +: 1766924709:0;lsa +: 1766924743:0;cd dockerVolumn +: 1766924744:0;lsa +: 1766924749:0;rm -rf cuda-tile +: 1766924758:0;setup_new_user 1000 1000 +: 1766924767:0;git config --global --add safe.directory '*' +: 1766924791:0;git clone https://github.com/Alwaysproblem/cuda-tile +: 1766924807:0;git checkout dev +: 1766924812:0;cd cuda-tile +: 1766924814:0;git checkout dev +: 1766924904:0;apt install -yq software-properties-common +: 1766924921:0;apt update -y && apt install -yq software-properties-common +: 1766924955:0;apt install -yq gcc-13 g++-13 +: 1766924972:0;g++ --version +: 1766924994:0;add-apt-repository -y ppa:ubuntu-toolchain-r/test +: 1766925008:0;apt update -y +: 1766925020:0;apt install -yq gcc-13 g++-13 +: 1766925151:0;apt install -y \\ + python3 python3-dev python3-setuptools python3-pip \\ + libtinfo-dev zlib1g-dev \\ + build-essential cmake ninja-build +: 1766925174:0;apt install -yq libfmt-dev libspdlog-dev +: 1766925183:0;apt install -yq gcc-13 g++-13 +: 1766925221:0;update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 20 +: 1766925227:0;update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 20 +: 1766925256:0;python3 --version +: 1766925275:0;sudo vim /etc/apt/sources.list.d/llvm.list +: 
1766925279:0;vim /etc/apt/sources.list.d/llvm.list +: 1766925344:0;wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc +: 1766925351:0;apt update -y +: 1766925388:0;python3 -m pip install pre-commit compdb +: 1766925427:0;cmake --version +: 1766925470:0;apt install -yq clang-20 clang-tidy-20 clangd-20 cmake-format \\ + clang-format-20 lldb-20 +: 1766926219:0;update-alternatives --install /usr/bin/clang clang /usr/bin/clang-20 100 +: 1766926230:0;update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-20 100 +: 1766926240:0;update-alternatives --install /usr/bin/clangd clangd /usr/bin/clangd-20 100 +: 1766926250:0;update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-20 100 +: 1766926259:0;update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-20 100 +: 1766926269:0;update-alternatives --install /usr/bin/lld lld /usr/bin/lld-20 100 +: 1766926289:0;apt install -yq lld-20 +: 1766926295:0;update-alternatives --install /usr/bin/lld lld /usr/bin/lld-20 100 +: 1766926306:0;update-alternatives --install /usr/bin/lldb lldb /usr/bin/lldb-20 100 +: 1766926321:0;update-alternatives --install /usr/bin/lldb-dap lldb-dap /usr/bin/lldb-dap-20 100 +: 1766926327:0;clang++ --version +: 1766926448:0;lsa +: 1766926460:0;chown -R scotty:scotty ./ +: 1766926461:0;lsa +: 1766926704:0;nvidia-smi +: 1766926730:0;bash build.sh +: 1766926743:0;vim build.sh +: 1766926765:0;bash build.sh +: 1766926778:0;vim build.sh +: 1766926798:0;mkdir 3rdparty +: 1766926820:0;lsa +: 1766926825:0;chown -R scotty:scotty ./ +: 1766926826:0;lsa +: 1766926840:0;cd .. +: 1766926842:0;lsa +: 1766926852:0;cd dockerVolumn +: 1766926853:0;lsa +: 1766926860:0;mv * ../ +: 1766926862:0;cd .. 
+: 1766926863:0;lsa +: 1766926917:0;cp cuda-tile llvm-project MLcompiler-tutorial dockerVolumn/ -R +: 1766927013:0;lsa +: 1766927028:0;cd dockerVolumn +: 1766927029:0;lsa +: 1766927038:0;chown -R scotty:scotty ./ +: 1766927040:0;lsa +: 1766927075:0;cd .. +: 1766927076:0;lsa +: 1766927083:0;rm -rf example.cubin +: 1766927084:0;lsa +: 1766927179:0;cd .. +: 1766927184:0;lsa +: 1766927209:0;cd dockerVolumn +: 1766927215:0;lsa +: 1766927218:0;cd cuda-tile +: 1766927219:0;lsa +: 1766927224:0;cd 3rdparty/llvm-project +: 1766927233:0;git status +: 1766927251:0;git switch -c cfbb4cc3 +: 1766927257:0;cd .. +: 1766927258:0;lsa +: 1766927260:0;cd .. +: 1766927260:0;lsa +: 1766927281:0;bash build.sh +: 1766927509:0;cd .. +: 1766927511:0;lsa +: 1766927515:0;mv * ../ +: 1766928047:0;cd map +: 1766928049:0;lsa +: 1766928054:0;chown -R scotty:scotty ./ +: 1766928055:0;lsa +: 1766928060:0;git status +: 1766928065:0;lsa +: 1766928074:0;cd examples +: 1766928075:0;lsa +: 1766928078:0;cd map +: 1766928080:0;lsa +: 1766928085:0;bash build_app.sh +: 1766928088:0;lsa +: 1766928092:0;./example +: 1766928110:0;nvidia-smi +: 1766928158:0;ldconfig -p | grep -i ptxjit +: 1766928184:0;ls -l /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so* 2>/dev/null +: 1766928193:0;echo $LD_LIBRARY_PATH +: 1766928223:0;ldconfig -p | grep -i ptxjit || true +: 1766928228:0;ls -l /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so* 2>/dev/null || true +: 1766928266:0;ls -l /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so* +: 1766928278:0;sudo ln -sf /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so +: 1766928289:0;ln -sf /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1 \\ + /usr/lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so +: 1766928295:0;ldconfig +: 1766928303:0;./example +: 1766928325:0;ldconfig -p | grep -i ptxjit +: 1766928402:0;ldconfig -p | grep -E 'libcuda\.so|libnvidia-nvvm|libnvrtc' || true +: 1766928412:0;ls -l 
/usr/lib/x86_64-linux-gnu/libcuda.so* /lib/x86_64-linux-gnu/libcuda.so* 2>/dev/null || true +: 1766928435:0;lsa /lib/x86_64-linux-gnu/libcuda.so +: 1766928495:0;export LD_LIBRARY_PATH=/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH} +: 1766928499:0;./example +: 1766928602:0;ldd /lib/x86_64-linux-gnu/libnvidia-ptxjitcompiler.so.1 | sed -n '1,200p'\ +ldd /lib/x86_64-linux-gnu/libcuda.so.590.48.01 | sed -n '1,200p' +: 1766928623:0;strace -f -e trace=file -o /tmp/trace.log ./map 2>/dev/null || true +: 1766928630:0;grep -nE 'ptxjit|nvvm|libcuda|nvidia' /tmp/trace.log | tail -n 200 +: 1766928670:0;lsa +: 1766930165:0;cd map +: 1766930166:0;lsa +: 1766930170:0;bash build_app.sh +: 1766930949:0;cd ../map2d +: 1766930952:0;bash build_app.sh +: 1766932113:0;vim ~/.gitconfig +: 1766970861:0;bash build_app.sh +: 1766974775:0;cd examples/matmul +: 1766974777:0;bash build_app.sh +: 1766979197:0;python3 -m pip install numpy +: 1766979205:0;python3 +: 1766979327:0;bash build_app.sh +: 1766982363:0;cd .. +: 1766982364:0;.. +: 1766982369:0;grep -RIn --exclude-dir=.git "pipel|pipeline|peel|prologue|epilogue|modulo" .\ + +: 1766982383:0;grep -RIn --exclude-dir=.git "pass.*pipeline|--pass|addPass|PassPipeline" . +: 1766982412:0;./build/bin/cuda-tile-opt --help +: 1766982641:0;cd examples/matmul +: 1766982644:0;bash build_app.sh +: 1766982734:0;../../build/bin/cuda-tile-opt --help +: 1766986985:0;cd mat +: 1766986987:0;lsa +: 1766986994:0;bash build_app.sh +: 1767007534:0;vim .git/info/exclude +: 1767007865:0;cd examples/CrossTileBlockCommunication +: 1767007868:0;bash build_app.sh +: 1767008901:0;cd .. +: 1767008913:0;mv fp8 tf32 +: 1767008939:0;cd tf32 +: 1767008941:0;bash build_app.sh +: 1767009250:0;cd .. 
+: 1767009251:0;lsa +: 1767009260:0;mv tf32 bf16 +: 1767009261:0;lsa +: 1767009267:0;chown -R scotty:scotty ./ +: 1767009268:0;lsa +: 1767009362:0;bash build_app.sh +: 1767009366:0;cd bf16 +: 1767009368:0;bash build_app.sh +: 1767010475:0;bash scripts/sync_deps.sh +: 1767010531:0;cd third_party/llvm-project +: 1767010532:0;lsa +: 1767010541:0;git switch -c cfbb4cc3 +: 1767010544:0;cd .. +: 1767010545:0;lsa +: 1767010700:0;git clone https://github.com/Alwaysproblem/cuda-tile +: 1767010732:0;lsa +: 1767010776:0;apt install -yq ccache +: 1767010792:0;apt update -y +: 1767010801:0;apt install -yq ccache +: 1767010916:0;cd .. +: 1767010936:0;bash scripts/make_patch.sh +: 1767011004:0;bash scripts/update.sh +: 1767011070:0;cd Ch6 +: 1767011076:0;cd .. +: 1767011118:0;cp -R Ch6 Ch7 +: 1767011133:0;bash scripts/apply_patch.sh +: 1767011437:0;pwd +: 1767011446:0;cd third_party/cuda-tile +: 1767011448:0;lsa +: 1767011456:0;rg "CUDA_TILE_INSTALL_DIR" +: 1767011492:0;lsa +: 1767011494:0;cd .. +: 1767011494:0;lsa +: 1767011742:0;cd .. +: 1767011743:0;lsa +: 1767011747:0;chown -R scotty:scotty ./ +: 1767011753:0;bash scripts/build_deps.sh +: 1767011910:0;cd third_party/llvm-project +: 1767011914:0;git status +: 1767011935:0;git checkout -q -- . +: 1767011940:0;cd .. +: 1767011973:0;bash scripts/build_deps.sh +: 1767012572:0;git rev-parse HEAD +: 1767012758:0;cd .. +: 1767012762:0;WORKSPACEROOT=`git rev-parse --show-toplevel` || WORKSPACEROOT=`pwd` +: 1767012769:0;WORKSPACEROOT +: 1767012773:0;echo $WORKSPACEROOT +: 1767012787:0;cd .. 
+: 1767012799:0;WORKSPACEROOT=`git rev-parse --show-toplevel` || WORKSPACEROOT=`pwd` +: 1767012803:0;echo $WORKSPACEROOT +: 1767012866:0;cd cuda-tile +: 1767012868:0;pwd +: 1767012877:0;cd ../MLcompiler-tutorial/mlir/cuda-tile +: 1767012984:0;lsa +: 1767012991:0;bash scripts/build_cuda_tile.sh +: 1767013166:0;lsa +: 1767013174:0;cd third_party/cuda-tile +: 1767013175:0;lsa +: 1767013186:0;pwd +: 1767013202:0;bash scripts/build_cuda_tile.sh +: 1767013654:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm/include +: 1767013690:0;bash scripts/build_cuda_tile.sh +: 1767013790:0;git status +: 1767013797:0;cd third_party/cuda-tile +: 1767013798:0;lsa +: 1767013801:0;git status +: 1767013813:0;git checkout -q -- . +: 1767013816:0;cd .. +: 1767013825:0;pwd +: 1767013834:0;bash scripts/build_cuda_tile.sh +: 1767013877:0;cd third_party/llvm-project +: 1767013880:0;git log +: 1767013943:0;cd .. +: 1767013949:0;rm -rf third_party +: 1767013954:0;lsa +: 1767013962:0;bash scripts/sync_deps.sh +: 1767014037:0;lsa +: 1767014041:0;chown -R scotty:scotty ./ +: 1767014043:0;lsa +: 1767014057:0;rm -rf third_party/llvm-project +: 1767014102:0;cp -R ../../../cuda-tile/3rdparty/llvm-project ./third_party +: 1767014161:0;lsa +: 1767014175:0;cd third_party +: 1767014178:0;git clone https://github.com/Alwaysproblem/cuda-tile +: 1767014184:0;cd .. +: 1767014185:0;lsa +: 1767014204:0;pwd +: 1767014208:0;bash scripts/update.sh +: 1767014248:0;bash scripts/build_deps.sh +: 1767014279:0;cd third_party +: 1767014283:0;cd llvm-project +: 1767014292:0;git status +: 1767014301:0;git checkout -q -- . +: 1767014304:0;cd .. +: 1767014309:0;lsa +: 1767014353:0;bash scripts/update.sh +: 1767014392:0;cd llvm-project +: 1767014397:0;git checkout -q -- . +: 1767014400:0;cd .. +: 1767014403:0;lsa +: 1767014424:0;bash scripts/build_deps.sh +: 1767015527:0;bash scripts/build_cuda_tile.sh +: 1767015541:0;third_party/llvm-project +: 1767015543:0;cd .. 
+: 1767015544:0;lsa +: 1767015547:0;cd cuda-tile +: 1767015549:0;lsa +: 1767015553:0;git status +: 1767015623:0;git checkout -q -- . +: 1767015677:0;bash scripts/build_cuda_tile.sh +: 1767015684:0;cd ... +: 1767015686:0;lsa +: 1767015688:0;bash scripts/build_cuda_tile.sh +: 1767015733:0;cd third_party +: 1767015734:0;lsa +: 1767015739:0;cd cuda-tile +: 1767015740:0;lsa +: 1767015749:0;ninja install +: 1767015872:0;cd build +: 1767015878:0;cmake --install . +: 1767016239:0;which cmake-format +: 1767016258:0;which clang-tidy +: 1767016671:0;cd .. +: 1767016672:0;.. +: 1767016682:0;find ./third_party -name "mlir-tblgen" +: 1767016905:0;pwd +: 1767017012:0;./build/Toy/toy-cuda sample/example.toy +: 1767017022:0;./build/Toy/toy-cuda sample/example.toy -emit=jit +: 1767017134:0;pwd +: 1767017545:0;git log +: 1767017569:0;git show 3165b0ccf63b333e7b90382b25ce76b8f998ce35 +: 1767017775:0;cd Toy +: 1767017787:0;git apply ../scripts/patch/matmul.patch +: 1767019507:0;僰烺。。 +: 1767019509:0;。。 +: 1767019510:0;.. +: 1767019512:0;lsa +: 1767019516:0;chown -R scotty:scotty ./ +: 1767019518:0;lsa +: 1767019520:0;pwd +: 1767019537:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767019560:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767019607:0;lsa +: 1767019656:0;python3 +: 1767019793:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767019810:0;python3 +: 1767020030:0;... 
+: 1767020033:0;pwd +: 1767020072:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767020252:0;python3 sample/validation.py +: 1767020283:0;sa +: 1767020285:0;lsa +: 1767054330:0;git log +: 1767071940:0;find ./ -name "libmlir_cuda_runtime.* " +: 1767071942:0;find ./ -name "libmlir_cuda_runtime.*" +: 1767072294:0;bash scripts/build_deps.sh +: 1767072320:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project/third_party/llvm-project/llvm +: 1767072341:0;bash scripts/build_deps.sh +: 1767072393:0;nvcc --version +: 1767072402:0;which nvcc +: 1767072464:0;ld --version +: 1767072536:0;which clang +: 1767072558:0;update-alternatives --list +: 1767072563:0;update-alternatives --list lld +: 1767072674:0;update-alternatives --install /usr/bin/ld.lld ld.lld /usr/bin/lldb-20 100 +: 1767072696:0;ld.lld --version +: 1767072705:0;lld --version +: 1767072728:0;bash scripts/build_deps.sh +: 1767072770:0;g++ -fuse-ld=lld -Wl,--version 2>&1 | head -n 5 +: 1767072936:0;bash scripts/build_deps.sh +: 1767072982:0;g++ -fno-use-linker-plugin -fuse-ld=lld -Wl,--version 2>&1 | head -n 20 +: 1767073000:0;update-alternatives --help +: 1767073021:0;update-alternatives --list lld +: 1767073026:0;update-alternatives --list ld.lld +: 1767073033:0;update-alternatives --remove ld.lld +: 1767073045:0;update-alternatives --remove ld.lld /usr/bin/ld.lld +: 1767073050:0;ld.lld --version +: 1767073068:0;which ld.lld +: 1767073084:0;update-alternatives --remove ld.lld +: 1767073089:0;update-alternatives --list ld.lld +: 1767073110:0;rm -rf /usr/bin/ld.lld +: 1767073113:0;update-alternatives --list ld.lld +: 1767073137:0;update-alternatives --remove ld.lld /usr/bin/ld.lld +: 1767073150:0;lldb --version +: 1767073155:0;update-alternatives --list ld.lld +: 1767073188:0;ld --version +: 1767073196:0;update-alternatives --display ld.lld +: 1767073212:0;update-alternatives --remove ld.lld /usr/bin/lldb-20 +: 1767073214:0;update-alternatives --display ld.lld +: 
1767077436:0;bash scripts/build_deps.sh +: 1767077457:0;apt-get install -y binutils +: 1767077470:0;bash scripts/build_deps.sh +: 1767077556:0;ld --version +: 1767077622:0;cd third_party +: 1767077626:0;cd cuda-tile +: 1767077628:0;cd .. +: 1767077631:0;lsa +: 1767077697:0;ld.lld +: 1767077720:0;apt install -yq lld +: 1767077777:0;pwd +: 1767077780:0;bash scripts/build_deps.sh +: 1767077838:0;cd ./third_party/llvm-project/build +: 1767077840:0;lsa +: 1767077843:0;grep -R "color-diagnostics" -n build/CMakeCache.txt 2>/dev/null | head -n 20\ +env | grep -E "CFLAGS|CXXFLAGS|LDFLAGS" || true +: 1767077846:0;cd .. +: 1767077848:0;grep -R "color-diagnostics" -n build/CMakeCache.txt 2>/dev/null | head -n 20\ +env | grep -E "CFLAGS|CXXFLAGS|LDFLAGS" || true +: 1767077864:0;lsa +: 1767077881:0;cd third_party +: 1767077882:0;lsa +: 1767077885:0;cd llvm-project +: 1767077886:0;lsa +: 1767077892:0;cd build +: 1767077893:0;ls +: 1767077896:0;cd .. +: 1767077898:0;lsa +: 1767077952:0;cd .. +: 1767077954:0;lsa +: 1767077959:0;rm -rf third_party +: 1767077961:0;cd .. +: 1767077962:0;lsa +: 1767077966:0;cd .. +: 1767077967:0;lsa +: 1767078003:0;bash scripts/build_deps.sh +: 1767078013:0;lsa third_party/llvm-project/build +: 1767078023:0;lsa third_party/llvm +: 1767078096:0;bash scripts/build_deps.sh +: 1767078168:0;cd third_party +: 1767078169:0;lsa +: 1767078172:0;cd cuda-tile +: 1767078173:0;lsa +: 1767078176:0;cd .. +: 1767079230:0;cd third_party +: 1767079233:0;cd cuda-tile +: 1767079237:0;cd .. +: 1767079241:0;bash scripts/build_cuda_tile.sh +: 1767079278:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm/lib/cmake/llvm +: 1767079304:0;lsa +: 1767079308:0;chown -R scotty:scotty ./ +: 1767079311:0;cd third_party +: 1767079314:0;cd llvm +: 1767079315:0;lsa +: 1767079317:0;cd .. +: 1767079320:0;rm -rf llvm +: 1767079323:0;lsa +: 1767079347:0;mv llvm-project/third_party/llvm ./ +: 1767079362:0;rm -rf llvm-project/third_party +: 1767079562:0;cd .. 
+: 1767079563:0;lsa +: 1767079571:0;bash scripts/build_deps.sh +: 1767079594:0;cd third_party/llvm-project +: 1767079595:0;lsa +: 1767079650:0;bash scripts/build_deps.sh +: 1767079652:0;cd .. +: 1767079656:0;bash scripts/build_deps.sh +: 1767079723:0;cd third_party/llvm-project +: 1767079727:0;... +: 1767079728:0;lsa +: 1767079754:0;bash scripts/build_deps.sh +: 1767079770:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project +: 1767079780:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project/build +: 1767079887:0;lsa +: 1767079893:0;cd third_party/llvm-project +: 1767079894:0;lsa +: 1767079916:0;cd .. +: 1767079919:0;bash scripts/build_deps.sh +: 1767080057:0;ld --version +: 1767080075:0;bash scripts/build_deps.sh +: 1767080192:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/llvm-project/third_party/llvm-project/build +: 1767080196:0;bash scripts/build_deps.sh +: 1767082332:0;bash scripts/build_cuda_tile.sh +: 1767082420:0;find ./ -name "libmlir_cuda_runtime.*" +: 1767082447:0;cd explore +: 1767082448:0;lsa +: 1767082498:0;export MLIR_CUDA_RUNTIME=../../cuda-tile/third_party/llvm/lib/libmlir_cuda_runtime.so +: 1767082510:0;find ../ -name "libmlir_cuda_runtime.*" +: 1767082537:0;find ../ -name "libmlir_runner_utils.*" +: 1767082559:0;export MLIR_RUNNER_UTILS=../third_party/llvm/lib/libmlir_runner_utils.so +: 1767082573:0;mlir-opt vec_add_gpu.mlir \\ + | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" \\ + | mlir-runner \\ + --shared-libs=$MLIR_CUDA_RUNTIME \\ + --shared-libs=$MLIR_RUNNER_UTILS \\ + --entry-point-result=void +: 1767082681:0;bash run.sh +: 1767082780:0;cd /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && ls -la && echo '---' && command -v cuda-tile-opt || true && command -v mlir-opt || true && command -v mlir-cpu-runner || true && command -v mlir-runner || true +: 1767082790:0;cd 
/root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && pwd && ls -უუ && echo '---' && ../third_party/llvm/bin/mlir-opt --version && echo '---' && ../third_party/llvm/bin/mlir-opt --help | grep -E "(print|memref)" | head -n 50 && echo '---' && ../third_party/llvm/bin/mlir-runner --help | head -n 80 +: 1767082798:0;echo TEST && uname -a && ls -la /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore | head +: 1767082874:0;cd /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && bash -lc 'set -euo pipefail; bash run.sh' +: 1767082911:0;cd /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/explore && ../third_party/llvm/bin/mlir-runner --help | head -n 120 +: 1767083120:0;cd explore +: 1767083123:0;../third_party/llvm/bin/mlir-opt gpu.mlir +: 1767083341:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" +: 1767083444:0;../third_party/llvm/bin/mlir-opt gpu.mlir \\ + | ../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" +: 1767083480:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin" gpu.mlir +: 1767084870:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_120" gpu.mlir +: 1767085381:0;../third_party/llvm/bin/mlir-opt gpu.mlir +: 1767085388:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_120" gpu.mlir +: 1767085453:0;bash run.sh +: 1767085885:0;../third_party/llvm/bin/mlir-opt gpu.mlir +: 1767085962:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_120" gpu.mlir +: 1767086084:0;file gpu.mlir\ +head -n 3 gpu.mlir | cat -A\ +xxd -g 1 -l 16 gpu.mlir\ + +: 1767086180:0;../third_party/llvm/bin/mlir-opt --version +: 1767086193:0;cd llvm-project +: 1767086198:0;- +: 1767086233:0;../third_party/llvm/bin/mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-format=bin,cubin-chip=sm_80" gpu.mlir +: 
1767086346:0;../third_party/llvm/bin/mlir-opt --gpu-to-llvm gpu.mlir +: 1767086602:0;bash run.sh +: 1767087204:0;../third_party/llvm/bin/mlir-translate example-nvvm.mlir \\ + --mlir-to-llvmir \\ + -o example.ll +: 1767087684:0;../third_party/llvm/bin/mlir-opt --show-dialect +: 1767087688:0;../third_party/llvm/bin/mlir-opt --show-dialects +: 1767087753:0;bash run.sh +: 1767088328:0;../third_party/llvm/bin/mlir-opt --help | grep -n "lower-host-to-llvm" +: 1767088361:0;bash run.sh +: 1767089170:0;history +: 1767089551:0;bash run.sh +: 1767092289:0;../third_party/llvm/bin/mlir-opt --help | grep -E "lower-host-to-llvm|convert-memref-to-llvm|finalize-memref-to-llvm" -n +: 1767092365:0;../third_party/llvm/bin/mlir-opt gpu.mlir \\ + --pass-pipeline="builtin.module(\ + nvvm-attach-target{chip=sm_80 O=3},\ + gpu.module(convert-gpu-to-nvvm),\ + gpu-module-to-binary,\ + lower-host-to-llvm\ + )" \\ + -print-ir-after-all -verify-each 2>&1 | grep -n "unrealized_conversion_cast" | head -n 50 +: 1767092396:0;bash run.sh +: 1767092722:0;../third_party/llvm/bin/mlir-opt --help | grep -nE "(^| )--lower-to-llvm( |$)|lower-to-llvm"\ +../third_party/llvm/bin/mlir-opt --help | grep -nE "convert-func-to-llvm|convert-cf-to-llvm|convert-arith-to-llvm"\ + +: 1767092809:0;bash run.sh +: 1767094172:0;cd llvm-project +: 1767094294:0;- +: 1767094298:0;bash run.sh +: 1767148594:0;cd /usr/local/cuda/lib64 +: 1767148595:0;lsa +: 1767148605:0;cd cmake +: 1767148606:0;lsa +: 1767148614:0;cd libcudacxx +: 1767148615:0;lsa +: 1767148622:0;cat libcudacxx-config.cmake +: 1767152044:0;git stash push +: 1767152135:0;git log +: 1767152965:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767152982:0;./build/Toy/toy-cuda sample/matmul.toy +: 1767153492:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767153543:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767153581:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767153607:0;./build/Toy/toy-cuda sample/matmul.toy 
-emit=mlir-affine +: 1767153730:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767153792:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767153800:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir-affine -opt +: 1767153884:0;git stash push +: 1767153914:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767153986:0;git stash pop +: 1767154234:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767154356:0;./build/Toy/toy-cuda sample/matmul.toy -emit=llvm --mlir-print-ir-after-all | tee a.log +: 1767154374:0;./build/Toy/toy-cuda sample/matmul.toy -emit=llvm --mlir-print-ir-after-all 2>&1 | tee a.log +: 1767156049:0;./build/Toy/toy-cuda sample/matmul.toy -emit=jit +: 1767156506:0;git stash pop +: 1767163424:0;./third_party/llvm/bin/mlir-translate --mlir-to-llvmir explore/extern_fun.mlir > fun.ll +: 1767163510:0;clang -O2 fun.ll -Lbuild/Toy -lcuda_shim -lcudart_static -ldl -pthread -o fun +: 1767185426:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767186892:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767187900:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767190577:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767233188:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767233201:0;./build/Toy/toy-cuda sample/gpu.mlir -emit=mlir +: 1767238309:0;./build/Toy/toy-cuda --help +: 1767238334:0;./build/Toy/toy-cuda --help G grid +: 1767238439:0;./build/Toy/toy-cuda sample/gpu.mlir --grid=1,2,1 +: 1767238499:0;./build/Toy/toy-cuda sample/gpu.mlir +: 1767239646:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu +: 1767239659:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767259878:0;apt update -y && apt install -yq gdb +: 1767262830:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767262886:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir +: 1767262901:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767263012:0;./build/Toy/toy-cuda sample/matmul.toy 
-emit=mlir +: 1767263016:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir -opt +: 1767263078:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767263439:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir -deubg=toy-gpu-outline +: 1767263450:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --deubg=toy-gpu-outline +: 1767263459:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug=toy-gpu-outline +: 1767263474:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767314974:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu +: 1767314983:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767315241:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug=toy-gpu-outline +: 1767315260:0;./build/Toy/toy-cuda --help G debug +: 1767315294:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767316318:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir +: 1767317299:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767317548:0;git push --force-with-lease +: 1767318062:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767318405:0;./build/Toy/toy-cuda sample/matmul.toy -emit=mlir-affine -opt +: 1767318476:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767318771:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-affine --grid=4,2,1 +: 1767340440:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767340896:0;cd third_party/cuda +: 1767340905:0;find ../ -name "lib*.a" +: 1767340913:0;find ./ -name "lib*.a" +: 1767341975:0;cd .. +: 1767341978:0;cd cuda-tile +: 1767341978:0;lsa +: 1767341985:0;... +: 1767341988:0;chown -R scotty:scotty ./ +: 1767341993:0;- +: 1767341995:0;lsa +: 1767341998:0;cd build +: 1767342000:0;cmake --install . +: 1767342180:0;cd .. +: 1767342185:0;lsa +: 1767342192:0;cd .. 
+: 1767342231:0;bash scripts/build_cuda_tile.sh +: 1767342672:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/cuda-tile/build/include +: 1767342681:0;lsa /root/Desktop/dockerVolumn/MLcompiler-tutorial/mlir/cuda-tile/third_party/cuda-tile/build/include/cuda_tile/Dialect/CudaTile/IR/Dialect.h +: 1767343260:0;- +: 1767343262:0;.. +: 1767343263:0;lsa +: 1767343268:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --grid=4,2,1 +: 1767343276:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767349256:0;cd examples +: 1767349259:0;cd map2d +: 1767349259:0;lsa +: 1767349262:0;bash build_app.sh +: 1767349264:0;lsa +: 1767349282:0;${WORKDIR}/build/bin/cuda-tile-opt example.mlir --mlir-print-ir-after-all -cse +: 1767349292:0;../..//build/bin/cuda-tile-opt example.mlir --mlir-print-ir-after-all -cse +: 1767349432:0;./build/Toy/toy-cuda sample/matmul.toy -emit=gpu-ir --debug +: 1767401976:0;lsa +: 1767401985:0;history +: 1767402311:0;cp ~/.zsh_history vscode/container_zsh_history diff --git a/mlir/cuda-tile/vscode/.initial_container.sh b/mlir/cuda-tile/vscode/.initial_container.sh new file mode 100644 index 0000000..a8555fa --- /dev/null +++ b/mlir/cuda-tile/vscode/.initial_container.sh @@ -0,0 +1,9 @@ +docker run -d --gpus all \ + --privileged -ti \ + --cap-add=SYS_ADMIN --cap-add=SYS_PTRACE \ + --shm-size 4G \ + --ulimit memlock=-1:-1 \ + --security-opt seccomp=unconfined --ipc=host \ + -v $PWD:/work -w /work \ + nvidia/cuda:13.0.0-devel-ubuntu22.04 bash + # bash -lc 'nsys --version && nsys profile --trace=cuda,nvtx,osrt --stats=true -o sysrep ./matmul' diff --git a/mlir/cuda-tile/vscode/.zsh_history b/mlir/cuda-tile/vscode/.zsh_history new file mode 100644 index 0000000..0b08425 --- /dev/null +++ b/mlir/cuda-tile/vscode/.zsh_history @@ -0,0 +1,267 @@ +: 1766810530:0;bash +: 1766822431:0;cat ~/.zshrc +: 1766822464:0;cat ~/.oh-my-zsh/custom/themes/grape.zsh-theme +: 1766822525:0;rm -rf ~/gitstatus +: 1766822564:0;lsa +: 
1766830123:0;./example +: 1766830127:0;lsa +: 1766830193:0;mv example example.tilebc Desktop +: 1766830195:0;cd Desktop +: 1766830196:0;lsa +: 1766830203:0;./example +: 1766837019:0;mkdir ~/.ssh +: 1766837033:0;vim ~/ssh/authroized_keys +: 1766837064:0;vim ~/.ssh/authroized_keys +: 1766837086:0;lsa .ssh +: 1766837123:0;chmod 700 ~/.ssh +: 1766837126:0;lsa .ssh +: 1766837160:0;cd .ssh +: 1766837163:0;mv authroized_keys authorized_keys +: 1766837177:0;lsa +: 1766838186:0;cd Desktop +: 1766838187:0;lsa +: 1766838192:0;rm -rf example_with_2inputs +: 1766838217:0;lsa +: 1766896433:0;cd Desktop +: 1766896438:0;cuobjdump example.cubin +: 1766896443:0;example.cubin +: 1766899405:0;vim /etc/systemd/resolved.conf +: 1766899421:0;sudo vim /etc/systemd/resolved.conf +: 1766899453:0;systemctl restart systemd-resolved.service +: 1766899621:0;ping www.google.com +: 1766901388:0;sudo reboot +: 1766901493:0;ping www.google.com +: 1766901518:0;docker pull alwaysproblem/fastdev-u2204:zsh +: 1766901899:0;cd Desktop +: 1766901901:0;lsa +: 1766901944:0;git clone https://github.com/llvm/llvm-project +: 1766902277:0;docker pull nvidia/cuda:13.1.0-devel-ubuntu22.04 +: 1766902571:0;docker pull alwaysproblem/fastdev-u2204:nv13.1.0 +: 1766902588:0;lsa +: 1766902611:0;btm +: 1766902673:0;lsa +: 1766902687:0;mkdir dockerVolumn +: 1766902688:0;lsa +: 1766902725:0;docker run --gpus all --privileged -ti --net=host --shm-size 4G --ulimit memlock=-1:-1 -d -it --name yyx-cuda-ir -v `pwd`:/root/Desktop/dockerVolumn alwaysproblem/fastdev-u2204:nv13.1.0 /bin/bash +: 1766902780:0;nvidia-smi +: 1766902796:0;bash +: 1766902817:0;lsmod | grep -E '^nvidia|nouveau' +: 1766902822:0;sudo dmesg | grep -iE 'nvrm|nvidia|nouveau' | tail -n 80\ + +: 1766902840:0;sudo modprobe nvidia +: 1766902847:0;sudo modprobe nvidia_uvm +: 1766902851:0;sudo modprobe nvidia_drm +: 1766902865:0;mokutil --sb-state +: 1766902876:0;sudo reboot +: 1766902915:0;nvidia-smi +: 1766902941:0;lsmod | grep -E '^nvidia|nouveau'\ + +: 
1766902958:0;sudo apt update +: 1766902982:0;sudo apt install -y build-essential dkms linux-headers-$(uname -r) +: 1766902997:0;dkms status +: 1766903014:0;which nvidia-smi +: 1766903019:0;dpkg -l | grep -E 'nvidia|libnvidia' | head +: 1766903031:0;modinfo nvidia | head +: 1766903107:0;nvidia-smi +: 1766903134:0;sudo dmesg | grep -iE 'nvrm|nvidia|nouveau|secure|dkms' | tail -n 120 +: 1766903150:0;nvidia-smi\ + +: 1766903155:0;mokutil --sb-state\ + +: 1766903160:0;lsmod | grep -E '^nvidia|nouveau'\ + +: 1766903168:0;modinfo nvidia || echo "nvidia module not found" +: 1766903175:0;sudo modprobe nvidia || echo "modprobe nvidia failed" +: 1766903186:0;sudo dmesg | grep -iE 'nvrm|nvidia|nouveau|secure|dkms' | tail -n 120 +: 1766903232:0;sudo nvidia-smi +: 1766903398:0;dkms status +: 1766903410:0;sudo dkms autoinstall\ + +: 1766903420:0;sudo dkms build nvidia/590.48.01 -k $(uname -r) +: 1766903448:0;sudo dkms install nvidia/590.48.01 -k $(uname -r) +: 1766903459:0;ls /lib/modules/$(uname -r)/updates/dkms | grep nvidia\ + +: 1766903467:0;sudo modprobe nvidia +: 1766903472:0;sudo modprobe nvidia_uvm +: 1766903477:0;sudo modprobe nvidia_drm +: 1766903482:0;nvidia-smi\ + +: 1766903516:0;cd Desk +: 1766903518:0;lsa +: 1766903525:0;mv llvm-project dockerVolumn +: 1766903529:0;docker run --gpus all --privileged -ti --net=host --shm-size 4G --ulimit memlock=-1:-1 -d -it --name yyx-cuda-ir -v `pwd`:/root/Desktop/dockerVolumn alwaysproblem/fastdev-u2204:nv13.1.0 /bin/bash +: 1766903536:0;docker rm -f /yyx-cuda-ir +: 1766903537:0;docker run --gpus all --privileged -ti --net=host --shm-size 4G --ulimit memlock=-1:-1 -d -it --name yyx-cuda-ir -v `pwd`:/root/Desktop/dockerVolumn alwaysproblem/fastdev-u2204:nv13.1.0 /bin/bash +: 1766924647:0;lsa +: 1766924666:0;git clone https://github.com/NVIDIA/cuda-tile.git +: 1766924671:0;cd cuda-tile +: 1766924672:0;lsa +: 1766924703:0;docker exec -ti yyx-cuda-ir zsh +: 1766926364:0;lsa +: 1766926373:0;cd dockerV +: 1766926379:0;cd 
Desktop/dockerVolumn +: 1766926381:0;lsa +: 1766926395:0;git clone https://github.com/Alwaysproblem/MLcompiler-tutorial +: 1766926544:0;vim ~/.ssh/authroized_keys +: 1766926555:0;vim ~/.ssh/authorized_keys +: 1766926879:0;cd dockerV +: 1766926880:0;lsa +: 1766927052:0;cd cuda-tile +: 1766927052:0;lsa +: 1766927063:0;cp -R ../llvm-project 3rdparty +: 1766927109:0;cd Desktop/dockerVolumn +: 1766927110:0;lsa +: 1766927113:0;cd .. +: 1766927114:0;lsa +: 1766927171:0;rm -rf cuda-tile/ llvm-project/ MLcompiler-tutorial +: 1766927174:0;lsa +: 1766928551:0;cd examples +: 1766928551:0;lsa +: 1766928554:0;cd map +: 1766928554:0;lsa +: 1766928558:0;./example +: 1766928677:0;lsa +: 1766928680:0;./example +: 1766931383:0;cd ../map2d +: 1766931387:0;./example +: 1766932138:0;git push +: 1766971092:0;cd map2d +: 1766971096:0;./example +: 1766978487:0;cd ../matmul +: 1766978489:0;./example +: 1766983673:0;ls /usr/local/cuda/bin | grep -E 'nsys|ncu' +: 1766983777:0;/usr/local/cuda/bin/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766983786:0;lsa +: 1766983797:0;sudo chown -R cheng:cheng ./ +: 1766983806:0;/usr/local/cuda/bin/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766983821:0;lsa +: 1766983914:0;/usr/local/cuda/bin/nsys profile --trace=cuda --stats=true -o sysrep ./example.cpp +: 1766983925:0;/usr/local/cuda/bin/nsys profile --trace=cuda --stats=true -o sysrep ./example +: 1766983939:0;lsa +: 1766984070:0;nvidia-smi -l 1\ + +: 1766984097:0;ldd ./matmul | grep -E 'libcuda|libcudart|libnvrtc' || true +: 1766984104:0;ldd ./example | grep -E 'libcuda|libcudart|libnvrtc' || true +: 1766984126:0;strace -f -e trace=openat,access -o /tmp/trace.log ./matmul 2>/dev/null || true +: 1766984132:0;strace -f -e trace=openat,access -o /tmp/trace.log ./example 2>/dev/null || true +: 1766984141:0;grep -nE 'libcuda|libcudart|nvidia|/dev/nvidia|nvrtc|ptxjit' /tmp/trace.log | head -n 80 +: 1766984195:0;nsys --version +: 1766984210:0;ls -l 
/usr/local/cuda/extras/CUPTI/lib64/libcupti.so* 2>/dev/null || true +: 1766984218:0;ldconfig -p | grep -i cupti || true +: 1766984299:0;sudo apt install -y nsight-systems +: 1766984319:0;sudo apt install -y nsight-systems-2025.5.2 +: 1766984348:0;nsys --version +: 1766984370:0;nsys profile --trace=cuda --stats=true -o sysrep ./example +: 1766984516:0;apt-cache search nsight | head -n 50 +: 1766984595:0;vim c/.zshrc +: 1766984608:0;vim ~/.zshrc +: 1766984633:0;ncu --version +: 1766984673:0;nsys --version +: 1766984840:0;nvidia-smi +: 1766984958:0;lsa +: 1766984965:0;ps -a G nv +: 1766984969:0;ps -a G example +: 1766984974:0;nvidia-smi +: 1766985007:0;sudo reboot +: 1766985235:0;nvidia-smi +: 1766985298:0;lsa +: 1766985304:0;cd matmul +: 1766985305:0;lsa +: 1766985308:0;docker image ls +: 1766985323:0;docker ps -a +: 1766985329:0;docker start yyx-cuda-ir +: 1766985334:0;lsa +: 1766985344:0;rm -rf sysrep.* +: 1766985347:0;lsa +: 1766985354:0;vim ~/.zshrc +: 1766985383:0;apt install nsight-compute +: 1766985393:0;sudo apt install -y nsight-compute +: 1766985407:0;sudo apt install -y nsight-compute-2025.4.0 +: 1766985464:0;ncu --set full --target-processes all -o matmul_kernel ./example +: 1766985493:0;lsa +: 1766985521:0;sudo nvidia-smi -pm 1 +: 1766985527:0;sudo nvidia-smi -i 0 -rgc +: 1766985532:0;sudo nvidia-smi -i 0 -rci\ + +: 1766985552:0;sudo tee /etc/modprobe.d/nvidia-prof.conf << 'EOF'\ +options nvidia NVreg_RestrictProfilingToAdminUsers=0\ +EOF +: 1766985559:0;sudo update-initramfs -u +: 1766985608:0;sudo reboot +: 1766985646:0;cd matmul +: 1766985664:0;ncu --set full --target-processes all -o matmul_kernel ./example +: 1766985677:0;nvidia-smi +: 1766985697:0;sudo apt install -y nsight-compute-2025.4.0 +: 1766985752:0;zsh: command not found: ncu +: 1766985759:0;sudo find / -type f -name ncu -o -name ncu-cli 2>/dev/null | head -n 50 +: 1766985817:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 
1766985837:0;vim ~/.zshrc +: 1766985849:0;lsa +: 1766987050:0;sudo chown -R cheng:cheng ./ +: 1766987054:0;lsa +: 1766987067:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766987073:0;lsa +: 1766987090:0;mv matmul_kernel.ncu-rep matmul_kernel.ncu-rep.baseline +: 1766987092:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766987987:0;mv matmul_kernel.ncu-rep matmul_kernel.ncu-rep.token +: 1766987988:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel ./example +: 1766988180:0;/opt/nvidia/nsight-compute/2025.4.0/ncu --set full --target-processes all -o matmul_kernel -f ./example +: 1767007315:0;lsa +: 1767007318:0;sudo chown -R cheng:cheng ./ +: 1767007349:0;git statsu +: 1767007352:0;git status +: 1767007879:0;cd ../CrossTileBlockCommunication +: 1767007882:0;./example +: 1767008718:0;lsa +: 1767008721:0;cd .. +: 1767008723:0;sudo chown -R cheng:cheng ./ +: 1767009097:0;cd tf32 +: 1767009099:0;./example +: 1767009274:0;lsa +: 1767009280:0;./example +: 1767009377:0;cd .. +: 1767009379:0;cd bf16 +: 1767009381:0;lsa +: 1767009383:0;./example +: 1767010202:0;lsa +: 1767010207:0;.. +: 1767010208:0;lsa +: 1767010210:0;c... +: 1767010213:0;... +: 1767010214:0;lsa +: 1767088913:0;cd cuda-tile +: 1767088916:0;cd .. +: 1767088919:0;cd dockerVolumn +: 1767088923:0;cd .. +: 1767088924:0;lsa +: 1767088933:0;cd cuda-tile +: 1767088934:0;lsa +: 1767088936:0;cd .. 
+: 1767088937:0;cd MLcompiler-tutorial +: 1767088942:0;cd mlir/cuda-tile +: 1767089068:0;../third_party/llvm/bin/mlir-runner example-nvvm.mlir \\ + --entry-point-result=void \\ + --shared-libs=${MLIR_RUNNER_UTILS} \\ + --shared-libs=${MLIR_CUDA_RUNTIME} +: 1767089081:0;cd explore +: 1767089083:0;export MLIR_RUNNER_UTILS=`pwd`/../third_party/llvm/lib/libmlir_runner_utils.so\ +export MLIR_CUDA_RUNTIME=`pwd`/../third_party/llvm/lib/libmlir_cuda_runtime.so +: 1767089086:0;../third_party/llvm/bin/mlir-runner example-nvvm.mlir \\ + --entry-point-result=void \\ + --shared-libs=${MLIR_RUNNER_UTILS} \\ + --shared-libs=${MLIR_CUDA_RUNTIME} +: 1767089105:0;apt install g++13 +: 1767089123:0;sudo apt update -y && apt install g++-13 gcc-13 +: 1767089152:0;sudo apt update -y && sudo apt install g++-13 gcc-13 +: 1767089182:0;sudo apt install -yq software-properties-common +: 1767089214:0;sudo apt update -y && sudo apt install g++-13 gcc-13 +: 1767089237:0;sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test +: 1767089432:0;sudo apt update -y && sudo apt install g++-13 gcc-13 +: 1767089478:0;../third_party/llvm/bin/mlir-runner example-nvvm.mlir \\ + --entry-point-result=void \\ + --shared-libs=${MLIR_RUNNER_UTILS} \\ + --shared-libs=${MLIR_CUDA_RUNTIME} +: 1767401874:0;lsa +: 1767401877:0;docker image ls +: 1767402034:0;history +: 1767402502:0;cp ~/.zsh_history ~/Desktop/MLcompiler-tutorial/mlir/cuda-tile/vscode diff --git a/mlir/cuda-tile/vscode/c_cpp_properties.json b/mlir/cuda-tile/vscode/c_cpp_properties.json new file mode 100644 index 0000000..6cdddd8 --- /dev/null +++ b/mlir/cuda-tile/vscode/c_cpp_properties.json @@ -0,0 +1,17 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c11", + "cppStandard": "c++17", + "intelliSenseMode": "linux-gcc-x64", + "configurationProvider": "ms-vscode.cmake-tools", + "mergeConfigurations": true, + "compileCommands": 
"${workspaceFolder}/build/compile_commands.json" + } + ], + "version": 4 +} diff --git a/mlir/cuda-tile/vscode/cmake-kits.json b/mlir/cuda-tile/vscode/cmake-kits.json new file mode 100644 index 0000000..da177bf --- /dev/null +++ b/mlir/cuda-tile/vscode/cmake-kits.json @@ -0,0 +1,13 @@ +[ + { + "name": "GCC in conda", + "compilers": { + "C": "/usr/bin/gcc", + "CXX": "/usr/bin/g++" + }, + "environmentSetupScript": "${workspaceFolder}/.envsetup.sh", + "preferredGenerator": { + "name": "Ninja" + } + } +] diff --git a/mlir/cuda-tile/vscode/launch.json b/mlir/cuda-tile/vscode/launch.json new file mode 100644 index 0000000..be1aeed --- /dev/null +++ b/mlir/cuda-tile/vscode/launch.json @@ -0,0 +1,33 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "(gdb) 启动", + "type": "cppdbg", + "request": "launch", + "program": "${command:cmake.launchTargetPath}", + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + "miDebuggerPath": "/usr/bin/gdb", + "setupCommands": [ + { + "description": "为 gdb 启用整齐打印", + "text": "-enable-pretty-printing", + "ignoreFailures": true + }, + { + "description": "将反汇编风格设置为 Intel", + "text": "-gdb-set disassembly-flavor intel", + "ignoreFailures": true + } + ] + } + ] +} diff --git a/mlir/cuda-tile/vscode/settings.json b/mlir/cuda-tile/vscode/settings.json new file mode 100644 index 0000000..58237aa --- /dev/null +++ b/mlir/cuda-tile/vscode/settings.json @@ -0,0 +1,118 @@ +{ + "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", + "cmake.debugConfig": { + "cwd": "${workspaceFolder}", + "args": [ + ] + }, + + "cmake.cmakePath": "/root/miniconda3/envs/mlir/bin/cmake", + "files.associations": { + "*.py": "python", + "*.mmd": "mermaid", + "*.dockfile": "dockerfile", + ".style.yapf": "ini", + "*.inc": "cpp", + "array": "cpp", + "atomic": 
"cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "complex": "cpp", + "condition_variable": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "ostream": "cpp", + "shared_mutex": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "compare": "cpp", + "concepts": "cpp", + "numbers": "cpp", + "semaphore": "cpp", + "stop_token": "cpp", + "any": "cpp", + "executor": "cpp", + "netfwd": "cpp" + }, + "cmake.configureArgs": [ + "-Wno-dev", + + // This is for non-conda users. 
+ // "-DMLIR_DIR=${workspaceFolder}/third_party/llvm/lib/cmake/mlir", + // "-DLLVM_DIR=${workspaceFolder}/third_party/llvm/lib/cmake/llvm", + // "-DMHLO_DIR=${workspaceFolder}/third_party/mhlo/lib/cmake/mlir-hlo", + // "-DCMAKE_MODULE_PATH=${workspaceFolder}/third_party/llvm/lib/cmake/mlir;${workspaceFolder}/third_party/llvm/lib/cmake/llvm;${workspaceFolder}/third_party/mhlo/lib/cmake/mlir-hlo", + // "-DMLIR_TABLEGEN_EXE=${workspaceFolder}/third_party/llvm/bin/mlir-tblgen", + + // This is for conda users. + "-DMLIR_TABLEGEN_EXEUTABLE:FILEPATH=/root/miniconda3/envs/mlir/bin/mlir-tblgen", + "-DCMAKE_MODULE_PATH=/root/miniconda3/envs/mlir/lib/cmake/mlir;/root/miniconda3/envs/mlir/lib/cmake/llvm", + ], + // "cmake.environment": { + // "LD_LIBRARY_PATH": "/root/miniconda3/envs/mlir/x86_64-conda-linux-gnu/lib:${env.LD_LIBRARY_PATH}" + // }, + "C_Cpp.clang_format_path": "${env.HOME}/miniconda3/envs/mlir/bin/clang-format", + "C_Cpp.codeAnalysis.clangTidy.path": "${env.HOME}/miniconda3/envs/mlir/bin/clang-tidy", + "cmakeFormat.exePath": "/root/miniconda3/envs/mlir/bin/cmake-format", + "C_Cpp.errorSquiggles": "enabled", + "C_Cpp.clang_format_sortIncludes": true, + "C_Cpp.codeAnalysis.clangTidy.enabled": true, + "C_Cpp.codeAnalysis.clangTidy.codeAction.formatFixes": true, + "C_Cpp.codeAnalysis.clangTidy.useBuildPath": true, + "C_Cpp.codeAnalysis.clangTidy.args": [ + "-p", + "${workspaceFolder}/build/compile_commands.json" + ], + "cmakeFormat.args": ["--config=${workspaceFolder}/.cmake-lint.yaml"], +}