mikex86
diff --git a/‎.gitmodules‎
Lines changed: 12 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 91 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 42 additions & 0 deletions b/‎README.md‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎add_kernel.ttir‎
Lines changed: 26 additions & 0 deletions b/‎add_kernel.ttir‎
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,12 @@
+[submodule "third_party/triton"]
+	path = third_party/triton
+	url = https://github.com/triton-lang/triton
+[submodule "third_party/llvm-project"]
+	path = third_party/llvm-project
+	url = https://github.com/llvm/llvm-project
+[submodule "third_party/pybind11"]
+	path = third_party/pybind11
+	url = https://github.com/pybind/pybind11
+[submodule "third_party/argparse"]
+	path = third_party/argparse
+	url = https://github.com/p-ranav/argparse
@@ -0,0 +1,91 @@
+cmake_minimum_required(VERSION 3.28)
+
+project(tritonc LANGUAGES CXX)
+
+# Build as C++17 and fail at configure time if the compiler cannot provide it,
+# instead of silently falling back to an older standard.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Settings for the Triton subproject. They are plain variables, so they are
+# set before add_subdirectory() to be visible in the subdirectory's scope.
+set(TRITON_CODEGEN_BACKENDS "nvidia")
+set(TRITON_BUILD_PYTHON_MODULE ON)
+set(TRITON_BUILD_PROTON OFF)
+# NOTE(review): "ON" is not an include path. Presumably this only needs to be
+# truthy so that Triton skips its own Python discovery - confirm against
+# third_party/triton/CMakeLists.txt.
+set(PYTHON_INCLUDE_DIRS ON)
+
+add_subdirectory(third_party/argparse)
+add_subdirectory(third_party/triton)
+
+# The command-line compiler itself.
+add_executable(tritonc src/main.cpp)
+target_link_libraries(tritonc PRIVATE argparse)
+
+# Read the TRITON_LIBS global property into triton_libs for the link list below.
+get_property(triton_libs GLOBAL PROPERTY TRITON_LIBS)
+
+# Libraries tritonc links against: the contents of the TRITON_LIBS global
+# property (read into triton_libs above) plus the MLIR and LLVM components
+# named here. Host-architecture codegen libraries are appended afterwards.
+set(TRITON_LIBRARIES
+        ${triton_libs}
+
+        # MLIR
+        MLIRAMDGPUDialect
+        MLIRNVVMDialect
+        MLIRNVVMToLLVMIRTranslation
+        MLIRGPUToNVVMTransforms
+        MLIRGPUToGPURuntimeTransforms
+        MLIRGPUTransforms
+        MLIRIR
+        MLIRControlFlowToLLVM
+        MLIRBytecodeWriter
+        MLIRPass
+        MLIRTransforms
+        MLIRLLVMDialect
+        MLIRSupport
+        MLIRTargetLLVMIRExport
+        MLIRMathToLLVM
+        MLIRROCDLToLLVMIRTranslation
+        MLIRGPUDialect
+        MLIRSCFToControlFlow
+        MLIRIndexToLLVM
+        MLIRGPUToROCDLTransforms
+
+        # LLVM
+        LLVMPasses
+        LLVMNVPTXCodeGen
+        LLVMAMDGPUCodeGen
+        LLVMAMDGPUAsmParser
+
+        # NVIDIA specific (MLIRNVVMDialect is already listed under MLIR)
+        TritonNVIDIAGPUToLLVM
+        NVGPUToLLVM
+        MLIRNVGPUToNVVM
+        MLIRNVGPUDialect
+)
+# Append the LLVM code generator and assembly parser for the processor named
+# in CMAKE_SYSTEM_PROCESSOR. MATCHES is a case-sensitive regex search, so the
+# Windows spellings (ARM64, AMD64) are listed next to the Unix ones.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR # Linux arm64
+        CMAKE_SYSTEM_PROCESSOR MATCHES "arm64" OR   # macOS arm64
+        CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")     # Windows arm64
+    list(APPEND TRITON_LIBRARIES
+            LLVMAArch64CodeGen
+            LLVMAArch64AsmParser
+    )
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64" OR # Linux/macOS x86-64
+        CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")        # Windows x86-64
+    list(APPEND TRITON_LIBRARIES
+            LLVMX86CodeGen
+            LLVMX86AsmParser
+    )
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
+    list(APPEND TRITON_LIBRARIES
+            LLVMPowerPCAsmParser
+            LLVMPowerPCCodeGen
+    )
+else ()
+    # Stop at configure time rather than fail later with unresolved symbols.
+    message(FATAL_ERROR "LLVM codegen/ASM parser libs: This HW architecture (${CMAKE_SYSTEM_PROCESSOR}) is not configured in cmake lib dependencies.")
+endif ()
+
+target_link_libraries(tritonc PRIVATE ${TRITON_LIBRARIES})
+
+# Triton doesn't use targets in the best-practice way, so its include
+# directories have to be added to tritonc by hand.
+# Paths are anchored at this directory's own source/binary dirs rather than
+# CMAKE_BINARY_DIR, so they still resolve when tritonc is itself pulled in
+# through add_subdirectory() by another project.
+set(TRITON_INCLUDE_DIRS
+        "${CMAKE_CURRENT_SOURCE_DIR}/third_party/triton/include"
+        "${CMAKE_CURRENT_SOURCE_DIR}/third_party/triton/third_party/nvidia/include"
+)
+# Include directories inside the build tree of the Triton subdirectory
+# (presumably headers generated during the build - confirm).
+set(TRITON_GENERATED_INCLUDE_DIRS
+        "${CMAKE_CURRENT_BINARY_DIR}/third_party/triton/include"
+        "${CMAKE_CURRENT_BINARY_DIR}/third_party/triton/third_party"
+)
+
+target_include_directories(tritonc PRIVATE
+        ${TRITON_INCLUDE_DIRS}
+        ${TRITON_GENERATED_INCLUDE_DIRS}
+)
@@ -0,0 +1,42 @@
+# tritonc
+
+Your standalone command-line Triton compiler.
+Write your Triton kernels directly in MLIR and compile them to PTX with this handy tool without ever touching Python.
+
+## Example:
+
+### add_kernel.ttir
+
+```mlir
+module {
+  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
+                             %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32},
+                             %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32},
+                             %arg3: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : i32 -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    %5 = tt.splat %arg3 : i32 -> tensor<1024xi32>
+    %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32>
+    %7 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
+    %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
+    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>>
+    %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
+    %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
+    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>>
+    %13 = arith.addf %9, %12 : tensor<1024xf32>
+    %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
+    %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
+    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>>
+    tt.return
+  }
+}
+```
+
+### Command line
+```commandline
+tritonc add_kernel.ttir --compute-capability 89 --num-stages 3 --num-warps 4 -o out.ptx
+```
@@ -0,0 +1,26 @@
+// Element-wise f32 addition: stores %arg0[i] + %arg1[i] into %arg2[i] for every
+// index i below %arg3. Each program instance covers 1024 consecutive indices.
+module {
+  tt.func public @add_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32},
+                             %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32},
+                             %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32},
+                             %arg3: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32
+    // First index handled by this program instance: program id (x axis) * 1024.
+    %0 = tt.get_program_id x : i32
+    %1 = arith.muli %0, %c1024_i32 : i32
+    // %4 holds the 1024 indices of this block: first index + [0, 1024).
+    %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32>
+    %3 = tt.splat %1 : i32 -> tensor<1024xi32>
+    %4 = arith.addi %3, %2 : tensor<1024xi32>
+    // %6 is the mask: true where the index is less than %arg3 (signed compare).
+    %5 = tt.splat %arg3 : i32 -> tensor<1024xi32>
+    %6 = arith.cmpi slt, %4, %5 : tensor<1024xi32>
+    // Masked load of the first operand from %arg0 + index.
+    %7 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
+    %8 = tt.addptr %7, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
+    %9 = tt.load %8, %6 : tensor<1024x!tt.ptr<f32>>
+    // Masked load of the second operand from %arg1 + index.
+    %10 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
+    %11 = tt.addptr %10, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
+    %12 = tt.load %11, %6 : tensor<1024x!tt.ptr<f32>>
+    %13 = arith.addf %9, %12 : tensor<1024xf32>
+    // Masked store of the sum to %arg2 + index.
+    %14 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>>
+    %15 = tt.addptr %14, %4 : tensor<1024x!tt.ptr<f32>>, tensor<1024xi32>
+    tt.store %15, %13, %6 : tensor<1024x!tt.ptr<f32>>
+    tt.return
+  }
+}