From f4264c9c4acc04746ae97d636a5edd89d650a9d0 Mon Sep 17 00:00:00 2001
From: Kevin Reid <kpreid@switchb.org>
Date: Wed, 25 Oct 2023 21:36:23 -0700
Subject: [PATCH] gpu: More efficient light texture updates.

Instead of issuing one `Queue::write_texture()` per cube, batch the data
into a single `Buffer` and copy from it. This is significantly faster,
at least on the CPU side; the time spent on heavy updates has been cut
from ~5 ms to ~1 ms.

Future work: use `StagingBelt` instead of using `write_buffer()`.
To do this optimally, `StagingBelt` will need to be modified to allow
us accessing its buffers to issue a `copy_buffer_to_texture` instead of
it issuing a `copy_buffer_to_buffer`.
---
 Cargo.lock                            |  1 +
 all-is-cubes-gpu/Cargo.toml           |  1 +
 all-is-cubes-gpu/src/in_wgpu/space.rs | 92 ++++++++++++++++++++++++---
 3 files changed, 85 insertions(+), 9 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 2485ca2cd..42d60da13 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -163,6 +163,7 @@ dependencies = [
  "half 2.3.1",
  "image",
  "instant",
+ "itertools 0.11.0",
  "log",
  "once_cell",
  "rand 0.8.5",
diff --git a/all-is-cubes-gpu/Cargo.toml b/all-is-cubes-gpu/Cargo.toml
index 7ca95a07f..d54f7aab4 100644
--- a/all-is-cubes-gpu/Cargo.toml
+++ b/all-is-cubes-gpu/Cargo.toml
@@ -43,6 +43,7 @@ futures-channel = { workspace = true, features = ["sink"] }
 futures-core = { workspace = true }
 futures-util = { workspace = true, features = ["sink"] }
 instant = { workspace = true }
+itertools = { workspace = true }
 log = { workspace = true }
 once_cell = { workspace = true }
 rand = { workspace = true }
diff --git a/all-is-cubes-gpu/src/in_wgpu/space.rs b/all-is-cubes-gpu/src/in_wgpu/space.rs
index 69c740aa0..1ec4ede63 100644
--- a/all-is-cubes-gpu/src/in_wgpu/space.rs
+++ b/all-is-cubes-gpu/src/in_wgpu/space.rs
@@ -6,6 +6,7 @@ use std::sync::{Arc, Mutex, Weak};
 use all_is_cubes::camera::{Camera, Flaws};
 use all_is_cubes::chunking::ChunkPos;
 use all_is_cubes::content::palette;
+use all_is_cubes::euclid::vec3;
 use all_is_cubes::listen::{Listen as _, Listener};
 use all_is_cubes::math::{
     Cube, Face6, FaceMap, FreeCoordinate, GridAab, GridCoordinate, GridPoint, GridVector, Rgb,
@@ -18,7 +19,9 @@ use all_is_cubes_mesh::dynamic::{ChunkedSpaceMesh, RenderDataUpdate};
 use all_is_cubes_mesh::{DepthOrdering, IndexSlice};
 
 use crate::in_wgpu::frame_texture::FramebufferTextures;
-use crate::in_wgpu::glue::{size_vector_to_extent, to_wgpu_index_format, write_texture_by_aab};
+use crate::in_wgpu::glue::{
+    point_to_origin, size_vector_to_extent, to_wgpu_index_format, write_texture_by_aab,
+};
 use crate::in_wgpu::pipelines::Pipelines;
 use crate::in_wgpu::vertex::{WgpuInstanceData, WgpuLinesVertex};
 use crate::in_wgpu::{
@@ -186,13 +189,9 @@ impl<I: time::Instant> SpaceRenderer<I> {
         let mut light_update_count = 0;
         if let Some(set) = &mut todo.light {
             // TODO: work in larger, ahem, chunks
-            for cube in set.drain() {
-                light_update_count += self.light_texture.update(
-                    queue,
-                    space,
-                    GridAab::from_lower_size(cube, [1, 1, 1]),
-                );
-            }
+            light_update_count +=
+                self.light_texture
+                    .update_scatter(device, queue, space, set.drain());
         } else {
             light_update_count += self.light_texture.update_all(queue, space);
             todo.light = Some(HashSet::new());
@@ -665,9 +664,14 @@ pub(in crate::in_wgpu) struct SpaceLightTexture {
     texture_view: wgpu::TextureView,
     /// The region of cube coordinates for which there are valid texels.
     texture_bounds: GridAab,
+    /// Temporary storage for updated light texels to be copied into the texture.
+    copy_buffer: wgpu::Buffer,
 }
 
 impl SpaceLightTexture {
+    const COPY_BUFFER_TEXELS: usize = 1024;
+    const COMPONENTS: usize = 4;
+
     /// Construct a new `SpaceLightTexture` for the specified size of [`Space`],
     /// with no data.
     pub fn new(label_prefix: &str, device: &wgpu::Device, bounds: GridAab) -> Self {
@@ -694,12 +698,18 @@ impl SpaceLightTexture {
             texture_view: texture.create_view(&wgpu::TextureViewDescriptor::default()),
             texture,
             texture_bounds,
+            copy_buffer: device.create_buffer(&wgpu::BufferDescriptor {
+                label: Some(&format!("{label_prefix} space light copy buffer")),
+                size: u64::try_from(Self::COPY_BUFFER_TEXELS * Self::COMPONENTS).unwrap(),
+                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC,
+                mapped_at_creation: false,
+            }),
         }
     }
 
     /// Copy the specified region of light data.
     pub fn update(&mut self, queue: &wgpu::Queue, space: &Space, region: GridAab) -> usize {
-        let mut data: Vec<[u8; 4]> = Vec::with_capacity(region.volume());
+        let mut data: Vec<[u8; Self::COMPONENTS]> = Vec::with_capacity(region.volume());
         // TODO: Enable circular operation and eliminate the need for the offset of the
         // coordinates (texture_bounds.lower_bounds() and light_offset in the shader)
         // by doing a coordinate wrap-around -- the shader and the Space will agree
@@ -728,6 +738,70 @@ impl SpaceLightTexture {
         self.texture_bounds.volume()
     }
 
+    /// Copy many individual cubes of light data.
+    pub fn update_scatter(
+        &mut self,
+        device: &wgpu::Device,
+        queue: &wgpu::Queue,
+        space: &Space,
+        cubes: impl IntoIterator<Item = Cube>,
+    ) -> usize {
+        let mut total_count = 0;
+
+        // Break into batches of our buffer size.
+        for cube_batch in
+            itertools::Itertools::chunks(cubes.into_iter(), Self::COPY_BUFFER_TEXELS).into_iter()
+        {
+            let mut data: [[u8; Self::COMPONENTS]; Self::COPY_BUFFER_TEXELS] =
+                [[0; Self::COMPONENTS]; Self::COPY_BUFFER_TEXELS];
+            let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
+                label: Some("space light scatter-copy"),
+            });
+            let mut batch_count = 0;
+
+            for (index, cube) in cube_batch.into_iter().enumerate() {
+                data[index] = space.get_lighting(cube).as_texel();
+
+                // TODO: When compute shaders are available, use a compute shader to do these
+                // scattered writes instead of issuing individual commands.
+                encoder.copy_buffer_to_texture(
+                    wgpu::ImageCopyBuffer {
+                        buffer: &self.copy_buffer,
+                        layout: wgpu::ImageDataLayout {
+                            offset: (index * Self::COMPONENTS) as u64,
+                            bytes_per_row: None,
+                            rows_per_image: None,
+                        },
+                    },
+                    wgpu::ImageCopyTexture {
+                        texture: &self.texture,
+                        mip_level: 0,
+                        origin: point_to_origin(cube.lower_bounds() + self.light_lookup_offset()),
+                        aspect: wgpu::TextureAspect::All,
+                    },
+                    size_vector_to_extent(vec3(1, 1, 1)),
+                );
+
+                batch_count += 1;
+                total_count += 1;
+            }
+
+            // TODO: use `StagingBelt` to write buffer instead.
+            // To do this optimally, `StagingBelt` will need to be modified to allow
+            // us accessing its buffers to issue a `copy_buffer_to_texture` instead of
+            // it issuing a `copy_buffer_to_buffer`.
+            queue.write_buffer(
+                &self.copy_buffer,
+                0,
+                bytemuck::cast_slice::<[u8; Self::COMPONENTS], u8>(&data[..batch_count]),
+            );
+
+            queue.submit([encoder.finish()]);
+        }
+
+        total_count
+    }
+
     fn light_lookup_offset(&self) -> GridVector {
         -self.texture_bounds.lower_bounds().to_vector()
     }