diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt
index 48321459..33eaa498 100644
--- a/external/CMakeLists.txt
+++ b/external/CMakeLists.txt
@@ -19,7 +19,7 @@ endmacro()
 # d3d12
 
 add_library(d3d12 INTERFACE)
-target_link_libraries(d3d12 INTERFACE dxgi.lib d3d12.lib)
+target_link_libraries(d3d12 INTERFACE dxgi.lib d3d12.lib dxguid.lib)
 
 # nanobind
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 84ce7045..ba9c9604 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -91,6 +91,7 @@ target_sources(sgl PRIVATE
     sgl/device/framebuffer.h
     sgl/device/fwd.h
     sgl/device/helpers.h
+    sgl/device/helpers.cpp
     sgl/device/input_layout.cpp
     sgl/device/input_layout.h
     sgl/device/kernel.cpp
diff --git a/src/sgl/device/helpers.cpp b/src/sgl/device/helpers.cpp
new file mode 100644
index 00000000..3ce2c1da
--- /dev/null
+++ b/src/sgl/device/helpers.cpp
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: Apache-2.0
+#include "helpers.h"
+
+#include "sgl/core/config.h"
+#include "sgl/core/macros.h"
+
+#include <string>
+#include <format>
+
+#if SGL_HAS_D3D12
+#include <dxgidebug.h>
+#include <dxgi1_3.h>
+#endif
+
+namespace sgl {
+
+
+// Returns the newest message stored in the DXGI debug info queue, or "" if none can be read.
+std::string get_last_gfx_layer_error()
+{
+    std::string res;
+#if SGL_HAS_D3D12
+    // Both lookups fail when the DXGI debug interfaces are unavailable; report "no error" then.
+    IDXGIDebug* dxgiDebug = nullptr;
+    if (FAILED(DXGIGetDebugInterface1(0, IID_PPV_ARGS(&dxgiDebug))) || !dxgiDebug)
+        return res;
+    IDXGIInfoQueue* dxgiInfoQueue = nullptr;
+    HRESULT hr = dxgiDebug->QueryInterface(IID_PPV_ARGS(&dxgiInfoQueue));
+    dxgiDebug->Release(); // The queue (if obtained) carries its own reference.
+    if (FAILED(hr) || !dxgiInfoQueue)
+        return res;
+    // GetMessage is called twice: once to learn the newest message's byte size, once to copy it.
+    UINT64 messageCount = dxgiInfoQueue->GetNumStoredMessages(DXGI_DEBUG_ALL);
+    SIZE_T messageLength = 0;
+    if (messageCount > 0)
+        dxgiInfoQueue->GetMessage(DXGI_DEBUG_ALL, messageCount - 1, nullptr, &messageLength);
+    auto* pMessage = messageLength ? static_cast<DXGI_INFO_QUEUE_MESSAGE*>(malloc(messageLength)) : nullptr;
+    if (pMessage && SUCCEEDED(dxgiInfoQueue->GetMessage(DXGI_DEBUG_ALL, messageCount - 1, pMessage, &messageLength))
+        && pMessage->pDescription)
+        res = pMessage->pDescription;
+    free(pMessage); // free(nullptr) is a no-op.
+    dxgiInfoQueue->Release();
+#else
+    // TODO: Get useful error information for other platforms if possible
+#endif
+    return res;
+}
+
+// Formats the text carried by the exception that SLANG_CALL throws for a failed call.
+// When the result code has its top bit set, the last graphics layer error (if any) is appended.
+std::string build_slang_failed_message(const char* call, SlangResult result)
+{
+    std::string text = std::format("Slang call {} failed with error: {}\n", call, result);
+    const bool top_bit_set = static_cast<uint32_t>(result) >= 0x80000000U;
+    const std::string layer_error = top_bit_set ? get_last_gfx_layer_error() : std::string();
+    if (!layer_error.empty()) {
+        text += "\nLast graphics layer error:\n";
+        text += layer_error;
+    }
+    return text;
+}
+
+} // namespace sgl
diff --git a/src/sgl/device/helpers.h b/src/sgl/device/helpers.h
index 6b80bcbd..d515d029 100644
--- a/src/sgl/device/helpers.h
+++ b/src/sgl/device/helpers.h
@@ -5,10 +5,17 @@
 #include "sgl/core/error.h"
 
 #include <slang-gfx.h>
+#include <format>
+
+namespace sgl {
+/// Builds the SLANG_CALL failure message from the stringified call and its result code.
+SGL_API std::string build_slang_failed_message(const char* call, SlangResult result);
+} // namespace sgl
 
 #define SLANG_CALL(call)                                                                                               \
     {                                                                                                                  \
         SlangResult result_ = call;                                                                                    \
-        if (SLANG_FAILED(result_))                                                                                     \
-            SGL_THROW("Slang call {} failed with error: {}", #call, result_);                                          \
+        if (SLANG_FAILED(result_)) { /* qualified below so the macro also works outside namespace sgl */               \
+            SGL_THROW(::sgl::build_slang_failed_message(#call, result_));                                              \
+        }                                                                                                              \
     }
diff --git a/src/sgl/device/python/types.cpp b/src/sgl/device/python/types.cpp
index 42d5c5e0..39b9a19b 100644
--- a/src/sgl/device/python/types.cpp
+++ b/src/sgl/device/python/types.cpp
@@ -48,8 +48,22 @@ SGL_DICT_TO_DESC_FIELD(enable_conservative_rasterization, bool)
 SGL_DICT_TO_DESC_FIELD(forced_sample_count, uint32_t)
 SGL_DICT_TO_DESC_END()
 
+SGL_DICT_TO_DESC_BEGIN(AspectBlendDesc) // presumably expands to dict_to_AspectBlendDesc(), which the dict __init__ calls
+SGL_DICT_TO_DESC_FIELD(src_factor, BlendFactor)
+SGL_DICT_TO_DESC_FIELD(dst_factor, BlendFactor)
+SGL_DICT_TO_DESC_FIELD(op, BlendOp)
+SGL_DICT_TO_DESC_END()
+
+SGL_DICT_TO_DESC_BEGIN(TargetBlendDesc) // presumably expands to dict_to_TargetBlendDesc(), which the dict __init__ calls
+SGL_DICT_TO_DESC_FIELD(enable_blend, bool)
+SGL_DICT_TO_DESC_FIELD_DICT(color, AspectBlendDesc) // nested dict; NOTE(review): likely needs the AspectBlendDesc converter above
+SGL_DICT_TO_DESC_FIELD_DICT(alpha, AspectBlendDesc)
+SGL_DICT_TO_DESC_FIELD(logic_op, LogicOp)
+SGL_DICT_TO_DESC_FIELD(write_mask, RenderTargetWriteMask)
+SGL_DICT_TO_DESC_END()
+
 SGL_DICT_TO_DESC_BEGIN(BlendDesc)
-SGL_DICT_TO_DESC_FIELD(targets, std::vector<TargetBlendDesc>)
+SGL_DICT_TO_DESC_FIELD_LIST(targets, TargetBlendDesc) // list of dicts; presumably each entry is converted as a TargetBlendDesc
 SGL_DICT_TO_DESC_FIELD(alpha_to_coverage_enable, bool)
 SGL_DICT_TO_DESC_END()
 
@@ -178,12 +192,20 @@ SGL_PY_EXPORT(device_types)
 
     nb::class_<AspectBlendDesc>(m, "AspectBlendDesc", D(AspectBlendDesc))
         .def(nb::init<>())
+        .def(
+            "__init__",
+            [](AspectBlendDesc* self, nb::dict dict) { new (self) AspectBlendDesc(dict_to_AspectBlendDesc(dict)); }
+        )
         .def_rw("src_factor", &AspectBlendDesc::src_factor, D(AspectBlendDesc, src_factor))
         .def_rw("dst_factor", &AspectBlendDesc::dst_factor, D(AspectBlendDesc, dst_factor))
         .def_rw("op", &AspectBlendDesc::op, D(AspectBlendDesc, op));
 
     nb::class_<TargetBlendDesc>(m, "TargetBlendDesc", D(TargetBlendDesc))
         .def(nb::init<>())
+        .def(
+            "__init__",
+            [](TargetBlendDesc* self, nb::dict dict) { new (self) TargetBlendDesc(dict_to_TargetBlendDesc(dict)); }
+        )
         .def_rw("color", &TargetBlendDesc::color, D(TargetBlendDesc, color))
         .def_rw("alpha", &TargetBlendDesc::alpha, D(TargetBlendDesc, alpha))
         .def_rw("enable_blend", &TargetBlendDesc::enable_blend, D(TargetBlendDesc, enable_blend))
@@ -192,6 +214,7 @@ SGL_PY_EXPORT(device_types)
 
     nb::class_<BlendDesc>(m, "BlendDesc", D(BlendDesc))
         .def(nb::init<>())
+        .def("__init__", [](BlendDesc* self, nb::dict dict) { new (self) BlendDesc(dict_to_BlendDesc(dict)); })
         .def_rw("targets", &BlendDesc::targets, D(BlendDesc, targets))
         .def_rw(
             "alpha_to_coverage_enable",
diff --git a/src/sgl/device/tests/test_pipeline.py b/src/sgl/device/tests/test_pipeline.py
new file mode 100644
index 00000000..53549c11
--- /dev/null
+++ b/src/sgl/device/tests/test_pipeline.py
@@ -0,0 +1,801 @@
+import sgl
+import pytest
+import numpy as np
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).parent))
+import helpers
+
+
+class PipelineTestContext:  # Per-test device, render texture, counter buffer and clear/count kernels.
+    def __init__(self, device_type: sgl.DeviceType, size: int = 128) -> None:
+        self.device = helpers.get_device(type=device_type)
+        self.output_texture = self.device.create_texture(  # square size x size rgba32_float target
+            format=sgl.Format.rgba32_float,
+            width=size,
+            height=size,
+            usage=sgl.ResourceUsage.unordered_access
+            | sgl.ResourceUsage.shader_resource
+            | sgl.ResourceUsage.render_target,
+            debug_name="render_texture",
+        )
+        self.count_buffer = self.device.create_buffer(  # four uint32 counters (16 bytes), initialised to zero
+            usage=sgl.ResourceUsage.unordered_access
+            | sgl.ResourceUsage.shader_resource,
+            size=16,
+            debug_name="count_buffer",
+            data=np.array([0, 0, 0, 0], dtype=np.uint32),
+        )
+
+        self.clear_kernel = self.device.create_compute_kernel(
+            self.device.load_program("test_pipeline_utils.slang", ["clear"])
+        )
+        self.count_kernel = self.device.create_compute_kernel(
+            self.device.load_program("test_pipeline_utils.slang", ["count"])
+        )
+
+        self.clear()  # run the clear kernel once so each test starts from the same texture contents
+
+    def clear(self) -> None:  # dispatch the "clear" kernel with one thread per texel
+        self.clear_kernel.dispatch(
+            thread_count=[self.output_texture.width, self.output_texture.height, 1],
+            render_texture=self.output_texture,
+        )
+
+    def count(self) -> None:  # zero the counters, then dispatch the "count" kernel over the texture
+        self.count_buffer.from_numpy(np.array([0, 0, 0, 0], dtype=np.uint32))
+        self.count_kernel.dispatch(
+            thread_count=[self.output_texture.width, self.output_texture.height, 1],
+            render_texture=self.output_texture,
+            count_buffer=self.count_buffer,
+        )
+
+    def expect_counts(self, expected) -> None:  # re-count, then assert every counter equals `expected`
+        self.count()
+        count = self.count_buffer.to_numpy().view(np.uint32)
+        assert np.all(count == expected)
+
+    def create_quad_mesh(self):  # returns (vertex_buffer, index_buffer, input_layout)
+        vertices = np.array(  # four (x, y, z) corners spanning [-1, 1] in x/y, all at z = -1
+            [-1, -1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1], dtype=np.float32
+        )
+        indices = np.array([0, 1, 2, 1, 3, 2], dtype=np.uint32)  # two triangles
+
+        vertex_buffer = self.device.create_buffer(
+            usage=sgl.ResourceUsage.shader_resource,
+            debug_name="vertex_buffer",
+            data=vertices,
+        )
+        input_layout = self.device.create_input_layout(
+            input_elements=[
+                {
+                    "semantic_name": "POSITION",
+                    "semantic_index": 0,
+                    "format": sgl.Format.rgb32_float,
+                    "offset": 0,
+                },
+            ],
+            vertex_streams=[{"stride": 12}],  # 3 float32 values per vertex
+        )
+        index_buffer = self.device.create_buffer(
+            usage=sgl.ResourceUsage.shader_resource,
+            debug_name="index_buffer",
+            data=indices,
+        )
+
+        return vertex_buffer, index_buffer, input_layout
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_clear_and_count(device_type):
+    ctx = PipelineTestContext(device_type)
+    ctx.expect_counts([0, 0, 0, 0])
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_compute_set_square(device_type):
+    ctx = PipelineTestContext(device_type)
+    prog = ctx.device.load_program("test_pipeline_utils.slang", ["setcolor"])
+    set_kernel = ctx.device.create_compute_kernel(prog)
+
+    pos = sgl.int2(32, 32)
+    size = sgl.int2(16, 16)
+    set_kernel.dispatch(
+        thread_count=[ctx.output_texture.width, ctx.output_texture.height, 1],
+        render_texture=ctx.output_texture,
+        pos=pos,
+        size=size,
+        color=sgl.float4(1, 0, 0, 1),
+    )
+
+    area = size.x * size.y
+    ctx.expect_counts([area, 0, 0, area])
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_compute_set_and_overwrite(device_type):
+    ctx = PipelineTestContext(device_type)
+    prog = ctx.device.load_program("test_pipeline_utils.slang", ["setcolor"])
+    set_kernel = ctx.device.create_compute_kernel(prog)
+
+    pos1 = sgl.int2(0, 0)
+    size1 = sgl.int2(128, 128)
+    set_kernel.dispatch(
+        thread_count=[ctx.output_texture.width, ctx.output_texture.height, 1],
+        render_texture=ctx.output_texture,
+        pos=pos1,
+        size=size1,
+        color=sgl.float4(1, 0, 0, 0),
+    )
+
+    pos2 = sgl.int2(32, 32)
+    size2 = sgl.int2(16, 16)
+    set_kernel.dispatch(
+        thread_count=[ctx.output_texture.width, ctx.output_texture.height, 1],
+        render_texture=ctx.output_texture,
+        pos=pos2,
+        size=size2,
+        color=sgl.float4(0, 1, 0, 0),
+    )
+
+    area1 = size1.x * size1.y
+    area2 = size2.x * size2.y
+    ctx.expect_counts([area1 - area2, area2, 0, 0])
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_gfx_clear(device_type):
+    ctx = PipelineTestContext(device_type)
+
+    command_buffer = ctx.device.create_command_buffer()
+    command_buffer.clear_resource_view(
+        ctx.output_texture.get_rtv(), [1.0, 0.0, 1.0, 0.0]
+    )
+    command_buffer.submit()
+
+    area = ctx.output_texture.width * ctx.output_texture.height
+
+    ctx.expect_counts([area, 0, area, 0])
+
+
+class GfxContext:  # Rasterization helpers layered on a PipelineTestContext.
+    def __init__(self, ctx: PipelineTestContext) -> None:
+        self.ctx = ctx
+        self.program = ctx.device.load_program(
+            "test_pipeline_utils.slang", ["vertex_main", "fragment_main"]
+        )
+        self.vertex_buffer, self.index_buffer, self.input_layout = (
+            ctx.create_quad_mesh()
+        )
+        self.framebuffer = ctx.device.create_framebuffer(  # colour target only; tests may replace it
+            render_targets=[ctx.output_texture.get_rtv()]
+        )
+
+    # Draw a quad with the given pipeline and color, optionally clearing to opaque black first.
+    # The quad is [-1,-1]->[1,1], so if offset/scale aren't specified it will fill the whole screen.
+    def draw(
+        self,
+        pipeline: sgl.Pipeline,
+        vert_offset=sgl.float2(0, 0),
+        vert_scale=sgl.float2(1, 1),
+        vert_z=0.0,
+        color=sgl.float4(0, 0, 0, 0),
+        viewport: sgl.Viewport = None,
+        clear=True,
+    ) -> None:
+        command_buffer = self.ctx.device.create_command_buffer()
+        with command_buffer.encode_render_commands(self.framebuffer) as encoder:
+            if clear:
+                command_buffer.clear_resource_view(  # clear colour is (0, 0, 0, 1)
+                    self.ctx.output_texture.get_rtv(), [0.0, 0.0, 0.0, 1.0]
+                )
+            if viewport:
+                encoder.set_viewport_and_scissor_rect(viewport)
+            else:
+                encoder.set_viewport_and_scissor_rect(  # default: sized to the whole output texture
+                    {
+                        "width": self.ctx.output_texture.width,
+                        "height": self.ctx.output_texture.height,
+                    }
+                )
+            shader_object = encoder.bind_pipeline(pipeline)
+            cursor = sgl.ShaderCursor(shader_object)
+            cursor.vert_offset = vert_offset
+            cursor.vert_scale = vert_scale
+            cursor.vert_z = float(vert_z)  # callers may pass an int (see draw_graphics_pipeline default)
+            cursor.frag_color = color
+            encoder.set_vertex_buffer(0, self.vertex_buffer)
+            encoder.set_index_buffer(self.index_buffer, sgl.Format.r32_uint, 0)
+            encoder.set_primitive_topology(sgl.PrimitiveTopology.triangle_list)
+            encoder.draw_indexed(int(self.index_buffer.size / 4))  # 4 bytes per r32_uint index
+        command_buffer.submit()
+
+    # Helper to create pipeline with given set of args + correct program/layouts.
+    def create_graphics_pipeline(self, **kwargs):
+        return self.ctx.device.create_graphics_pipeline(
+            program=self.program,
+            input_layout=self.input_layout,
+            framebuffer_layout=self.framebuffer.layout,
+            **kwargs,
+        )
+
+    # Helper to both create pipeline and then use it to draw quad.
+    # Extra keyword arguments are forwarded to create_graphics_pipeline.
+    def draw_graphics_pipeline(
+        self,
+        vert_offset=sgl.float2(0, 0),
+        vert_scale=sgl.float2(1, 1),
+        vert_z=0,
+        color=sgl.float4(0, 0, 0, 0),
+        clear=True,
+        viewport: sgl.Viewport = None,
+        **kwargs,
+    ) -> None:
+        pipeline = self.create_graphics_pipeline(**kwargs)
+        self.draw(
+            pipeline,
+            color=color,
+            clear=clear,
+            vert_offset=vert_offset,
+            vert_scale=vert_scale,
+            vert_z=vert_z,
+            viewport=viewport,
+        )
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_gfx_simple_primitive(device_type):
+    ctx = PipelineTestContext(device_type)
+    gfx = GfxContext(ctx)
+
+    area = ctx.output_texture.width * ctx.output_texture.height
+    scale = sgl.float2(0.5)
+
+    # Clear and fill red, then verify 1/4 pixels are red and all solid.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 0, 0, 1),
+        vert_scale=scale,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+    )
+    ctx.expect_counts([int(area / 4), 0, 0, area])
+
+    # Repeat with no culling, so should get same result.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(0, 1, 0, 1),
+        vert_scale=scale,
+        rasterizer={"cull_mode": sgl.CullMode.none},
+    )
+    ctx.expect_counts([0, int(area / 4), 0, area])
+
+    # Repeat with front face culling, so should get all black.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 1, 1, 1),
+        vert_scale=scale,
+        rasterizer={"cull_mode": sgl.CullMode.front},
+    )
+    ctx.expect_counts([0, 0, 0, area])
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_gfx_viewport(device_type):
+    ctx = PipelineTestContext(device_type)
+    gfx = GfxContext(ctx)
+
+    area = ctx.output_texture.width * ctx.output_texture.height
+    scale = sgl.float2(0.5)
+
+    # Clear and fill red, and verify it filled the whole screen.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 0, 0, 1), rasterizer={"cull_mode": sgl.CullMode.back}
+    )
+    ctx.expect_counts([area, 0, 0, area])
+
+    # Use viewport to clear half the screen.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(0, 1, 0, 1),
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        viewport=sgl.Viewport(
+            {
+                "width": int(ctx.output_texture.width / 2),
+                "height": ctx.output_texture.height,
+            }
+        ),
+    )
+    ctx.expect_counts([0, int(area / 2), 0, area])
+
+    # Same using horiontal clip instead.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(0, 1, 0, 1),
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        viewport=sgl.Viewport(
+            {
+                "width": ctx.output_texture.width,
+                "height": int(ctx.output_texture.height / 2),
+            }
+        ),
+    )
+    ctx.expect_counts([0, int(area / 2), 0, area])
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_gfx_depth(device_type):
+    ctx = PipelineTestContext(device_type)
+    gfx = GfxContext(ctx)
+
+    # Create a depth texture and re-create frame buffer that uses depth.
+    depth_texture = ctx.device.create_texture(
+        format=sgl.Format.d32_float,
+        width=ctx.output_texture.width,
+        height=ctx.output_texture.height,
+        usage=sgl.ResourceUsage.shader_resource | sgl.ResourceUsage.depth_stencil,
+        debug_name="depth_texture",
+    )
+    gfx.framebuffer = ctx.device.create_framebuffer(
+        render_targets=[ctx.output_texture.get_rtv()],
+        depth_stencil=depth_texture.get_dsv(),
+    )
+
+    area = ctx.output_texture.width * ctx.output_texture.height
+
+    # Manually clear both buffers and verify results.
+    command_buffer = ctx.device.create_command_buffer()
+    with command_buffer.encode_render_commands(gfx.framebuffer) as encoder:
+        command_buffer.clear_resource_view(
+            ctx.output_texture.get_rtv(), [0.0, 0.0, 0.0, 1.0]
+        )
+        command_buffer.clear_resource_view(depth_texture.get_dsv(), 0.5, 0, True, True)
+    command_buffer.submit()
+    ctx.expect_counts([0, 0, 0, area])
+
+    # Write quad with z=0.25, which is close than the z buffer clear value of 0.5 so should come through.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 0, 0, 1),
+        clear=False,
+        vert_scale=sgl.float2(0.5),
+        vert_z=0.25,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        depth_stencil={
+            "depth_test_enable": True,
+            "depth_write_enable": True,
+            "depth_func": sgl.ComparisonFunc.less,
+        },
+    )
+    ctx.expect_counts([int(area / 4), 0, 0, area])
+
+    # Write a great big quad at z=0.75, which should do nothing.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 1, 1, 1),
+        clear=False,
+        vert_z=0.75,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        depth_stencil={
+            "depth_test_enable": True,
+            "depth_write_enable": True,
+            "depth_func": sgl.ComparisonFunc.less,
+        },
+    )
+    ctx.expect_counts([int(area / 4), 0, 0, area])
+
+    # Write a great big quad at z=0.4, which should overwrite the background but not the foreground.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 1, 1, 1),
+        clear=False,
+        vert_z=0.4,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        depth_stencil={
+            "depth_test_enable": True,
+            "depth_write_enable": True,
+            "depth_func": sgl.ComparisonFunc.less,
+        },
+    )
+    ctx.expect_counts([area, area - int(area / 4), area - int(area / 4), area])
+
+    # Write a great big quad at z=0.75 with depth func always, which should just blat the lot.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(0, 0, 1, 1),
+        clear=False,
+        vert_z=0.75,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        depth_stencil={
+            "depth_test_enable": True,
+            "depth_write_enable": True,
+            "depth_func": sgl.ComparisonFunc.always,
+        },
+    )
+    ctx.expect_counts([0, 0, area, area])
+
+    # Quick check that the depth write happened correctly
+    dt = depth_texture.to_numpy()
+    assert np.all(dt == 0.75)
+
+    # Try again at z=0.8, which should do nothing as z write was still enabled with the previous one.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 1, 1, 1),
+        clear=False,
+        vert_z=0.8,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        depth_stencil={
+            "depth_test_enable": True,
+            "depth_write_enable": True,
+            "depth_func": sgl.ComparisonFunc.less,
+        },
+    )
+    ctx.expect_counts([0, 0, area, area])
+
+    # Write out a full quad at z=0.25, with z write turned off, so should work but not affect z buffer.
+    gfx.draw_graphics_pipeline(
+        color=sgl.float4(1, 0, 0, 1),
+        clear=False,
+        vert_z=0.25,
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        depth_stencil={
+            "depth_test_enable": True,
+            "depth_write_enable": True,
+            "depth_func": sgl.ComparisonFunc.less,
+        },
+    )
+    ctx.expect_counts([area, 0, 0, area])
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+def test_gfx_blend(device_type):
+    ctx = PipelineTestContext(device_type)
+    gfx = GfxContext(ctx)
+    area = ctx.output_texture.width * ctx.output_texture.height
+
+    # Clear and then draw semi transparent red quad, and should get 1/4 dark red pixels.
+    gfx.draw_graphics_pipeline(
+        clear=True,
+        color=sgl.float4(1, 0, 0, 0.5),
+        vert_scale=sgl.float2(0.5),
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        blend=sgl.BlendDesc(
+            {
+                "alpha_to_coverage_enable": False,
+                "targets": [
+                    {
+                        "enable_blend": True,
+                        "color": {
+                            "src_factor": sgl.BlendFactor.src_alpha,
+                            "dst_factor": sgl.BlendFactor.inv_src_alpha,
+                            "op": sgl.BlendOp.add,
+                        },
+                        "alpha": {
+                            "src_factor": sgl.BlendFactor.zero,
+                            "dst_factor": sgl.BlendFactor.one,
+                            "op": sgl.BlendOp.add,
+                        },
+                    }
+                ],
+            }
+        ),
+    )
+    pixels = ctx.output_texture.to_numpy()
+    is_pixel_red = np.all(pixels[:, :, :3] == [0.5, 0, 0], axis=2)
+    assert np.sum(is_pixel_red) == int(area / 4)
+
+
+# On Vulkan using 50% alpha coverage we get a checkerboard effect.
+@pytest.mark.parametrize("device_type", [sgl.DeviceType.vulkan])
+def test_gfx_alpha_coverage(device_type):
+    ctx = PipelineTestContext(device_type)
+    gfx = GfxContext(ctx)
+    area = ctx.output_texture.width * ctx.output_texture.height
+
+    # Clear and then draw semi transparent red quad, and should end up
+    # with 1/8 of the pixels red due to alpha coverage.
+    gfx.draw_graphics_pipeline(
+        clear=True,
+        color=sgl.float4(1, 0, 0, 0.5),
+        vert_scale=sgl.float2(0.5),
+        rasterizer={"cull_mode": sgl.CullMode.back},
+        blend=sgl.BlendDesc(
+            {
+                "alpha_to_coverage_enable": True,
+                "targets": [
+                    {
+                        "enable_blend": True,
+                        "color": {"src_factor": sgl.BlendFactor.src_alpha},
+                    }
+                ],
+            }
+        ),
+    )
+
+    pixels = ctx.output_texture.to_numpy()
+    is_pixel_red = np.all(pixels[:, :, :3] == [0.5, 0, 0], axis=2)
+    assert np.sum(is_pixel_red) == int(area / 8)
+
+
+class RayContext:  # Builds a quad BLAS once; creates TLASes and dispatches ray grids on request.
+    def __init__(self, ctx: PipelineTestContext) -> None:
+        self.ctx = ctx
+
+        vertices = np.array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0], dtype=np.float32)  # unit quad at z = 0
+        indices = np.array([0, 1, 2, 1, 3, 2], dtype=np.uint32)  # two triangles
+
+        vertex_buffer = ctx.device.create_buffer(
+            usage=sgl.ResourceUsage.shader_resource,
+            debug_name="vertex_buffer",
+            data=vertices,
+        )
+
+        index_buffer = ctx.device.create_buffer(
+            usage=sgl.ResourceUsage.shader_resource,
+            debug_name="index_buffer",
+            data=indices,
+        )
+
+        transform_buffer = ctx.device.create_buffer(
+            usage=sgl.ResourceUsage.shader_resource,
+            debug_name="transform_buffer",
+            data=sgl.float3x4.identity().to_numpy(),  # identity geometry transform
+        )
+
+        blas_geometry_desc = sgl.RayTracingGeometryDesc()
+        blas_geometry_desc.type = sgl.RayTracingGeometryType.triangles
+        blas_geometry_desc.flags = sgl.RayTracingGeometryFlags.opaque
+        blas_geometry_desc.triangles.transform3x4 = transform_buffer.device_address
+        blas_geometry_desc.triangles.index_format = sgl.Format.r32_uint
+        blas_geometry_desc.triangles.vertex_format = sgl.Format.rgb32_float
+        blas_geometry_desc.triangles.index_count = indices.size
+        blas_geometry_desc.triangles.vertex_count = vertices.size // 3  # 3 floats per vertex
+        blas_geometry_desc.triangles.index_data = index_buffer.device_address
+        blas_geometry_desc.triangles.vertex_data = vertex_buffer.device_address
+        blas_geometry_desc.triangles.vertex_stride = vertices.itemsize * 3
+
+        blas_build_inputs = sgl.AccelerationStructureBuildInputs()
+        blas_build_inputs.kind = sgl.AccelerationStructureKind.bottom_level
+        blas_build_inputs.flags = sgl.AccelerationStructureBuildFlags.none
+        blas_build_inputs.geometry_descs = [blas_geometry_desc]
+
+        blas_prebuild_info = ctx.device.get_acceleration_structure_prebuild_info(  # sizes for the buffers below
+            blas_build_inputs
+        )
+
+        blas_scratch_buffer = ctx.device.create_buffer(
+            size=blas_prebuild_info.scratch_data_size,
+            usage=sgl.ResourceUsage.unordered_access,
+            debug_name="blas_scratch_buffer",
+        )
+
+        blas_buffer = ctx.device.create_buffer(
+            size=blas_prebuild_info.result_data_max_size,
+            usage=sgl.ResourceUsage.acceleration_structure,
+            debug_name="blas_buffer",
+        )
+
+        blas = ctx.device.create_acceleration_structure(
+            kind=sgl.AccelerationStructureKind.bottom_level,
+            buffer=blas_buffer,
+            size=blas_buffer.size,
+        )
+
+        command_buffer = ctx.device.create_command_buffer()
+        with command_buffer.encode_ray_tracing_commands() as encoder:
+            encoder.build_acceleration_structure(
+                inputs=blas_build_inputs,
+                dst=blas,
+                scratch_data=blas_scratch_buffer.device_address,
+            )
+        command_buffer.submit()
+
+        self.blas = blas  # referenced by every instance made in create_instances
+
+    def create_instances(self, instance_transforms: np.ndarray) -> sgl.AccelerationStructure:  # one self.blas instance per transform; returns the built TLAS
+
+        instances: list[sgl.RayTracingInstanceDesc] = []
+        for i, transform in enumerate(instance_transforms):
+            instance_desc = sgl.RayTracingInstanceDesc()
+            instance_desc.transform = transform
+            instance_desc.instance_id = i
+            instance_desc.instance_mask = 0xFF
+            instance_desc.instance_contribution_to_hit_group_index = 0
+            instance_desc.flags = sgl.RayTracingInstanceFlags.none
+            instance_desc.acceleration_structure = self.blas.device_address
+            instances.append(instance_desc)
+
+        instance_buffer = self.ctx.device.create_buffer(
+            usage=sgl.ResourceUsage.shader_resource,
+            debug_name="instance_buffer",
+            data=np.stack([i.to_numpy() for i in instances]),  # all instance descs in one array
+        )
+
+        tlas_build_inputs = sgl.AccelerationStructureBuildInputs()
+        tlas_build_inputs.kind = sgl.AccelerationStructureKind.top_level
+        tlas_build_inputs.flags = sgl.AccelerationStructureBuildFlags.none
+        tlas_build_inputs.desc_count = len(instances)
+        tlas_build_inputs.instance_descs = instance_buffer.device_address
+
+        tlas_prebuild_info = self.ctx.device.get_acceleration_structure_prebuild_info(
+            tlas_build_inputs
+        )
+
+        tlas_scratch_buffer = self.ctx.device.create_buffer(
+            size=tlas_prebuild_info.scratch_data_size,
+            usage=sgl.ResourceUsage.unordered_access,
+            debug_name="tlas_scratch_buffer",
+        )
+
+        tlas_buffer = self.ctx.device.create_buffer(
+            size=tlas_prebuild_info.result_data_max_size,
+            usage=sgl.ResourceUsage.acceleration_structure,
+            debug_name="tlas_buffer",
+        )
+
+        tlas = self.ctx.device.create_acceleration_structure(
+            kind=sgl.AccelerationStructureKind.top_level,
+            buffer=tlas_buffer,
+            size=tlas_buffer.size,
+        )
+
+        command_buffer = self.ctx.device.create_command_buffer()
+        with command_buffer.encode_ray_tracing_commands() as encoder:
+            encoder.build_acceleration_structure(
+                inputs=tlas_build_inputs,
+                dst=tlas,
+                scratch_data=tlas_scratch_buffer.device_address,
+            )
+        command_buffer.submit()
+
+        return tlas
+
+    def dispatch_ray_grid(self, tlas: sgl.AccelerationStructure, mode: str) -> None:  # mode: "compute" or "ray"
+        if mode == "compute":
+            self.dispatch_ray_grid_compute(tlas)
+        elif mode == "ray":
+            self.dispatch_ray_grid_rtp(tlas)
+        else:
+            raise ValueError(f"Unknown mode {mode}")
+
+    def dispatch_ray_grid_compute(self, tlas: sgl.AccelerationStructure) -> None:  # "raygrid" compute kernel, one thread per texel
+        program = self.ctx.device.load_program("test_pipeline_utils.slang", ["raygrid"])
+        kernel = self.ctx.device.create_compute_kernel(program)
+        kernel.dispatch(
+            thread_count=[
+                self.ctx.output_texture.width,
+                self.ctx.output_texture.height,
+                1,
+            ],
+            render_texture=self.ctx.output_texture,
+            tlas=tlas,
+            pos=sgl.int2(0, 0),
+            size=sgl.int2(
+                self.ctx.output_texture.width, self.ctx.output_texture.height
+            ),
+            dist=float(2),  # presumably the ray length used by the kernel — confirm in the .slang file
+        )
+
+    def dispatch_ray_grid_rtp(self, tlas: sgl.AccelerationStructure) -> None:  # same grid via a ray tracing pipeline
+        program = self.ctx.device.load_program(
+            "test_pipeline_utils.slang", ["rt_ray_gen", "rt_miss", "rt_closest_hit"]
+        )
+        pipeline = self.ctx.device.create_ray_tracing_pipeline(
+            program=program,
+            hit_groups=[
+                sgl.HitGroupDesc(
+                    hit_group_name="hit_group", closest_hit_entry_point="rt_closest_hit"
+                )
+            ],
+            max_recursion=1,
+            max_ray_payload_size=16,
+        )
+
+        shader_table = self.ctx.device.create_shader_table(  # names must match the pipeline's entry points / hit group
+            program=program,
+            ray_gen_entry_points=["rt_ray_gen"],
+            miss_entry_points=["rt_miss"],
+            hit_group_names=["hit_group"],
+        )
+
+        command_buffer = self.ctx.device.create_command_buffer()
+        with command_buffer.encode_ray_tracing_commands() as encoder:
+            shader_object = encoder.bind_pipeline(pipeline)
+            cursor = sgl.ShaderCursor(shader_object)
+            cursor.rt_tlas = tlas
+            cursor.rt_render_texture = self.ctx.output_texture
+            encoder.dispatch_rays(
+                0,
+                shader_table,
+                [self.ctx.output_texture.width, self.ctx.output_texture.height, 1],
+            )
+        command_buffer.submit()
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+@pytest.mark.parametrize("mode", ["compute", "ray"])
+def test_raytrace_simple(device_type, mode):
+    """A single quad instance should produce exactly 64x64 pure red pixels.
+
+    Runs once per device type and per dispatch mode ("compute" or "ray").
+    """
+    ctx = PipelineTestContext(device_type)
+    rtx = RayContext(ctx)
+
+    # The instance transform makes the [0-1] quad cover the top left quarter
+    # of the screen (pixels 0-63); it is scaled up a bit and nudged to handle
+    # rounding issues. The quad sits at z=1, so the rays can reach it.
+    translation = sgl.math.matrix_from_translation(sgl.float3(-0.05, -0.05, 1))
+    scaling = sgl.math.matrix_from_scaling(sgl.float3(63.1, 63.1, 1))
+    instance_tf = sgl.float3x4(sgl.math.mul(translation, scaling))
+    tlas = rtx.create_instances([instance_tf])
+
+    # Fire a grid of rays covering the whole texture. The rays have a length
+    # of 2, so they should hit the quad and turn its pixels red.
+    rtx.dispatch_ray_grid(tlas, mode)
+
+    # Count the pixels whose RGB is exactly (1, 0, 0).
+    pixels = ctx.output_texture.to_numpy()
+    rgb = pixels[:, :, :3]
+    red_mask = np.all(rgb == [1, 0, 0], axis=2)
+    red_count = np.sum(red_mask)
+    assert red_count == 4096
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+@pytest.mark.parametrize("mode", ["compute", "ray"])
+def test_raytrace_two_instance(device_type, mode):
+    """Two separate instances should give one red and one green 64x64 square.
+
+    Runs once per device type and per dispatch mode ("compute" or "ray").
+    """
+    ctx = PipelineTestContext(device_type)
+    rtx = RayContext(ctx)
+
+    # Two instances, in the top left and the bottom right. They share the
+    # same scaling and differ only in their xy translation.
+    offsets = [(-0.05, -0.05), (64 - 0.05, 64 - 0.05)]
+    transforms = [
+        sgl.math.mul(
+            sgl.math.matrix_from_translation(sgl.float3(x, y, 1)),
+            sgl.math.matrix_from_scaling(sgl.float3(63.1, 63.1, 1)),
+        )
+        for x, y in offsets
+    ]
+    tlas = rtx.create_instances([sgl.float3x4(m) for m in transforms])
+    rtx.dispatch_ray_grid(tlas, mode)
+
+    # Expect 2 64x64 squares: red from the 1st instance, green from the 2nd.
+    pixels = ctx.output_texture.to_numpy()
+    rgb = pixels[:, :, :3]
+    red_count = np.sum(np.all(rgb == [1, 0, 0], axis=2))
+    green_count = np.sum(np.all(rgb == [0, 1, 0], axis=2))
+    assert red_count == 4096
+    assert green_count == 4096
+
+
+@pytest.mark.parametrize("device_type", helpers.DEFAULT_DEVICE_TYPES)
+@pytest.mark.parametrize("mode", ["compute", "ray"])
+def test_raytrace_closest_instance(device_type, mode):
+    """Where two instances overlap, the one nearer to the ray origin must win.
+
+    Runs once per device type and per dispatch mode ("compute" or "ray").
+    """
+    ctx = PipelineTestContext(device_type)
+    rtx = RayContext(ctx)
+
+    # Two slightly overlapping instances: the second is shifted by 32 pixels
+    # towards the centre and sits closer to the rays (z=0.5 instead of z=1).
+    offsets = [(-0.05, -0.05, 1), (32 - 0.05, 32 - 0.05, 0.5)]
+    transforms = [
+        sgl.math.mul(
+            sgl.math.matrix_from_translation(sgl.float3(*offset)),
+            sgl.math.matrix_from_scaling(sgl.float3(63.1, 63.1, 1)),
+        )
+        for offset in offsets
+    ]
+    tlas = rtx.create_instances([sgl.float3x4(m) for m in transforms])
+    rtx.dispatch_ray_grid(tlas, mode)
+
+    # Expect the full green square (4096 pixels) and only 3/4 of the red one:
+    # the 32x32 overlap (1024 pixels) is taken by the closer green instance.
+    pixels = ctx.output_texture.to_numpy()
+    rgb = pixels[:, :, :3]
+    red_count = np.sum(np.all(rgb == [1, 0, 0], axis=2))
+    green_count = np.sum(np.all(rgb == [0, 1, 0], axis=2))
+    assert red_count == 3072
+    assert green_count == 4096
+
+
+if __name__ == "__main__":  # allow running this test file directly as a script
+    pytest.main([__file__, "-v", "-s"])
diff --git a/src/sgl/device/tests/test_pipeline_utils.slang b/src/sgl/device/tests/test_pipeline_utils.slang
new file mode 100644
index 00000000..b42a911f
--- /dev/null
+++ b/src/sgl/device/tests/test_pipeline_utils.slang
@@ -0,0 +1,148 @@
+
+// Writes float4(0) to every texel of render_texture; out-of-bounds threads do nothing.
+[shader("compute")]
+[numthreads(16, 16, 1)]
+void clear(uint2 tid: SV_DispatchThreadID, RWTexture2D<float4> render_texture)
+{
+    uint2 extent;
+    render_texture.GetDimensions(extent.x, extent.y);
+    if (all(tid.xy < extent))
+        render_texture[tid.xy] = float4(0);
+}
+
+// Tallies, per channel, how many texels of render_texture hold a value > 0.
+// The counters are at byte offsets 0 (x), 4 (y), 8 (z) and 12 (w) of count_buffer.
+[shader("compute")]
+[numthreads(16, 16, 1)]
+void count(uint2 tid: SV_DispatchThreadID, Texture2D<float4> render_texture, RWByteAddressBuffer count_buffer)
+{
+    uint2 extent;
+    render_texture.GetDimensions(extent.x, extent.y);
+    if (any(tid.xy >= extent))
+        return;
+    float4 texel = render_texture[tid.xy];
+    for (uint channel = 0; channel < 4; ++channel)
+        count_buffer.InterlockedAdd(channel * 4, texel[channel] > 0 ? 1 : 0);
+}
+
+// Writes color to the size-sized block of texels that starts at pos.
+[shader("compute")]
+[numthreads(16, 16, 1)]
+void setcolor(
+    uint2 tid: SV_DispatchThreadID,
+    RWTexture2D<float4> render_texture,
+    uniform int2 pos,
+    uniform int2 size,
+    uniform float4 color
+)
+{
+    if (all(tid.xy < size))
+        render_texture[tid.xy + pos] = color;
+}
+
+
+struct V2F { // output of vertex_main, input of fragment_main
+    float4 pos : SV_Position;
+};
+
+uniform float2 vert_offset; // added to the scaled vertex xy in vertex_main
+uniform float2 vert_scale; // multiplies the incoming vertex xy in vertex_main
+uniform float vert_z; // z component written to every output position
+
+[shader("vertex")]
+V2F vertex_main(float3 pos: POSITION)
+{
+    // Scale then offset xy; z comes from vert_z and w is fixed at 1.
+    V2F result = { float4(pos.xy * vert_scale.xy + vert_offset.xy, vert_z, 1) };
+    return result;
+}
+
+uniform float4 frag_color; // constant color returned by fragment_main
+
+[shader("fragment")]
+float4 fragment_main(V2F v)
+    : SV_Target
+{
+    return frag_color; // the input v is not used
+}
+
+
+// Fires one +Z ray per thread from (tid.x, tid.y, 0) through tlas with an inline ray
+// query. Hits set channel (instance ID % 3) to 1 on opaque black; misses stay black.
+[shader("compute")]
+[numthreads(16, 16, 1)]
+void raygrid(
+    uint2 tid: SV_DispatchThreadID,
+    RWTexture2D<float4> render_texture,
+    RaytracingAccelerationStructure tlas,
+    uniform int2 pos, // NOTE(review): not used by this kernel
+    uniform int2 size,
+    uniform float dist
+)
+{
+    if (any(tid.xy >= size))
+        return;
+
+    RayDesc ray;
+    ray.Origin = float3(tid.xy, 0);
+    ray.Direction = float3(0, 0, 1);
+    ray.TMin = 0;
+    ray.TMax = dist; // honor the caller-supplied ray length instead of a constant
+
+    // No ACCEPT_FIRST_HIT flag: the committed hit must be the closest instance.
+    RayQuery<RAY_FLAG_FORCE_OPAQUE | RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES> q;
+    q.TraceRayInline(tlas, 0, 0xff, ray);
+    q.Proceed();
+    float4 color = float4(0, 0, 0, 1);
+    if (q.CommittedStatus() == COMMITTED_TRIANGLE_HIT)
+        color[q.CommittedInstanceID() % 3] = 1;
+    render_texture[tid.xy] = color;
+}
+
+struct Payload { // ray payload used by rt_ray_gen, rt_miss and rt_closest_hit
+    float4 color;
+}
+
+[shader("miss")]
+void rt_miss(inout Payload payload)
+{
+    payload.color = float4(0, 0, 0, 1); // opaque black
+}
+
+[shader("closesthit")]
+void rt_closest_hit(inout Payload payload, BuiltInTriangleIntersectionAttributes attribs)
+{
+    // Opaque black with one of x/y/z raised to 1, selected by InstanceID() % 3.
+    payload.color = float4(0, 0, 0, 1);
+    payload.color[InstanceID() % 3] = 1;
+}
+
+uniform RWTexture2D<float4> rt_render_texture; // written by rt_ray_gen
+uniform RaytracingAccelerationStructure rt_tlas; // traced against by rt_ray_gen
+
+// Traces one +Z ray of length 2 from (x, y, 0) for each dispatch index (x, y) and
+// stores the resulting payload color in rt_render_texture at that index.
+[shader("raygeneration")]
+void rt_ray_gen()
+{
+    uint2 pixel = DispatchRaysIndex().xy;
+
+    RayDesc ray;
+    ray.Origin = float3(pixel, 0);
+    ray.Direction = float3(0, 0, 1);
+    ray.TMin = 0;
+    ray.TMax = 2;
+
+    const uint ray_flags = 0;
+    const uint instance_mask = 0xff;
+    const uint hit_group_offset = 0; // RayContributionToHitGroupIndex
+    const uint geometry_multiplier = 0; // MultiplierForGeometryContributionHitGroupIndex
+    const uint miss_index = 0; // MissShaderIndex
+
+    Payload payload = {};
+    TraceRay(
+        rt_tlas, ray_flags, instance_mask, hit_group_offset, geometry_multiplier, miss_index, ray, payload
+    );
+
+    rt_render_texture[pixel] = payload.color;
+}