[js/webgpu] Optimize ConvTranspose (Continue) (#23429)

BUG #23273 This PR does below optimizations: 1. When output channels is one, 1) calculate the offset before the inchannel loop to reduce indices to offsets calculation, 2) split the `inputChannelsPerGroup` into `inputChannelsPerGroupInt` and `inputChannelsRemainder` parts so that we can always access 4 data for `inputChannelsPerGroupInt`. 2. Use precise initial value to reduce useless loop iterations. Thanks @jiangzhaoming 's suggestion's on this. With this PR, ConvTranspose becomes 3.7s from 8.4s on Intel Meteor Lake. On NV RTX 2000 Ada, it becomes 1.6s from 2.7s.
microsoft · Jan 22, 2025 · 25f4274 · 25f4274
1 parent ff8465e
commit 25f4274
Show file tree

Hide file tree

Showing 2 changed files with 236 additions and 24 deletions.
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
@@ -46,6 +46,11 @@ export const createConvTranspose2DProgramInfo = (
   const inputChannelsPerGroup = wShape[2] / group;
   const outputChannelsPerGroup = wShape[3];
   const aComponents = isChannelsLast ? getMaxComponents(inputChannelsPerGroup) : 1;
+  const packInputAs4 = isChannelsLast && outputChannelsPerGroup === 1;
+  const inputChannelsPerGroupInt = packInputAs4
+    ? Math.floor(inputChannelsPerGroup / 4) * 4
+    : Math.floor(inputChannelsPerGroup / aComponents) * aComponents;
+  const inputChannelsRemainder = inputChannelsPerGroup - inputChannelsPerGroupInt;
   const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1;
   const bComponents = isChannelsLast ? (outputChannelsPerGroup === 1 ? aComponents : components) : 1;
   const outputSize = ShapeUtil.size(outputShape) / components;
@@ -78,7 +83,7 @@ export const createConvTranspose2DProgramInfo = (
     { type: DataType.uint32, data: dilations },
     { type: DataType.uint32, data: effectiveFilterDims },
     { type: DataType.int32, data: pads },
-    { type: DataType.uint32, data: inputChannelsPerGroup },
+    { type: DataType.uint32, data: inputChannelsPerGroupInt },
     { type: DataType.uint32, data: outputChannelsPerGroup },
     ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims),
   ];
@@ -114,16 +119,40 @@ export const createConvTranspose2DProgramInfo = (
 
     const calculateResult = (): string => {
       let calcStr = '';
-      if (aComponents === 1) {
-        calcStr += `
-        let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)};
-        let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)};
-        dotProd = dotProd + xValue * wValue;`;
+      if (packInputAs4) {
+        if (aComponents === 4) {
+          calcStr += `
+        let xValue = ${dy.getByOffset('x_offset')};
+        let wValue = ${w.getByOffset('w_offset')};
+        dotProd = dotProd + dot(xValue, wValue);
+        x_offset += 1u;
+        w_offset += 1u;`;
+        } else if (aComponents === 2) {
+          calcStr += `
+          dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}));
+          x_offset += 2u;
+          w_offset += 2u;`;
+        } else if (aComponents === 1) {
+          calcStr += `
+          dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}, ${dy.getByOffset('x_offset + 2u')}, ${dy.getByOffset('x_offset + 3u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}, ${w.getByOffset('w_offset + 2u')}, ${w.getByOffset('w_offset + 3u')}));
+          x_offset += 4u;
+          w_offset += 4u;`;
+        }
       } else {
-        if (outputChannelsPerGroup === 1) {
+        calcStr += `
+                  let xValue = ${
+                    isChannelsLast
+                      ? dy.getByOffset(
+                          `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`,
+                        )
+                      : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
+                  };
+        `;
+        if (aComponents === 1) {
           calcStr += `
-          let wValue = ${w.getByOffset(`${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)} / ${bComponents}`)};
-          dotProd = dotProd + dot(xValue, wValue);`;
+          let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)};
+          let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)};
+          dotProd = dotProd + xValue * wValue;`;
         } else {
           for (let c = 0; c < aComponents; c++) {
             calcStr += `
@@ -134,6 +163,32 @@ export const createConvTranspose2DProgramInfo = (
       }
       return calcStr;
     };
+    const calculateRemainder = (): string => {
+      if (inputChannelsRemainder === 0) {
+        return '';
+      }
+      if (!packInputAs4) {
+        throw new Error(`packInputAs4 ${packInputAs4} is not true.`);
+      }
+      let calcStr = '';
+      if (aComponents === 1) {
+        calcStr += 'dotProd = dotProd';
+        for (let i = 0; i < inputChannelsRemainder; i++) {
+          calcStr += `
+            + ${dy.getByOffset(`x_offset + ${i}`)} * ${w.getByOffset(`w_offset + ${i}`)}`;
+        }
+        calcStr += ';';
+      } else if (aComponents === 2) {
+        if (inputChannelsRemainder !== 2) {
+          throw new Error(`Invalid inputChannelsRemainder ${inputChannelsRemainder}.`);
+        }
+        calcStr += `
+          let xValue = ${dy.getByOffset('x_offset')};
+          let wValue = ${w.getByOffset('w_offset')};
+          dotProd = dotProd + dot(xValue, wValue);`;
+      }
+      return calcStr;
+    };
     const codeSnippet = `
             let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)};
             let batch = ${output.indicesGet('outputIndices', 0)};
@@ -148,7 +203,12 @@ export const createConvTranspose2DProgramInfo = (
             // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
             // ? = to be determined. : = across all values in that axis.
             var dotProd = ${output.type.value}(0.0);
-            for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
+            var wR: u32 = 0;
+            if (uniforms.dilations.x == 1) {
+              // Minimum wR >= 0 that satisfies (dyRCorner + wR) % (uniforms.strides.x) == 0
+              wR = u32(((dyRCorner + i32(uniforms.strides.x) - 1) / i32(uniforms.strides.x)) * i32(uniforms.strides.x) - dyRCorner);
+            }
+            for (; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
               if (wR % uniforms.dilations.x != 0) {
                 continue;
               }
@@ -158,10 +218,13 @@ export const createConvTranspose2DProgramInfo = (
                   wRPerm < 0) {
                 continue;
               }
-              wR = wR + uniforms.strides[0] - 1;
               let idyR: u32 = u32(dyR);
-
-              for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
+              var wC: u32 = 0;
+              if (uniforms.dilations.y == 1) {
+                // Minimum wC >= 0 that satisfies (dyCCorner + wC) % (uniforms.strides.y) == 0
+                wC = u32(((dyCCorner + i32(uniforms.strides.y) - 1) / i32(uniforms.strides.y)) * i32(uniforms.strides.y) - dyCCorner);
+              }
+              for (; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
                 if (wC % uniforms.dilations.y != 0) {
                   continue;
                 }
@@ -171,21 +234,24 @@ export const createConvTranspose2DProgramInfo = (
                     fract(dyC) > 0.0 || wCPerm < 0) {
                   continue;
                 }
-                wC = wC + uniforms.strides.y - 1;
                 let idyC: u32 = u32(dyC);
                 var inputChannel = groupId * uniforms.input_channels_per_group;
-                for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${aComponents}) {
-                  let xValue = ${
-                    isChannelsLast
-                      ? dy.getByOffset(
-                          `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`,
-                        )
-                      : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
-                  };
+                ${
+                  packInputAs4
+                    ? `
+                var x_offset = ${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents};
+                var w_offset = ${w.indicesToOffset(`${w.type.indices}(wRPerm, wCPerm, inputChannel, wOutChannel)`)} / ${bComponents};
+                  `
+                    : ''
+                }
+                for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${packInputAs4 ? 4 : aComponents}) {
                   ${calculateResult()}
-                  inputChannel = inputChannel + ${aComponents};
+                  inputChannel = inputChannel + ${packInputAs4 ? 4 : aComponents};
                 }
+                ${calculateRemainder()}
+                wC = wC + uniforms.strides.y - 1;
               }
+              wR = wR + uniforms.strides[0] - 1;
             }
             let value = dotProd${hasBias ? ` + bias[d1 / ${components}]` : ''};
             ${output.setByOffset('global_idx', 'value')};
@@ -201,7 +267,7 @@ export const createConvTranspose2DProgramInfo = (
   return {
     name: 'ConvTranspose2D',
     shaderCache: {
-      hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}`,
+      hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}${inputChannelsRemainder}`,
       inputDependencies,
     },
     getRunData: () => ({

diff --git a/js/web/test/data/ops/conv-transpose.jsonc b/js/web/test/data/ops/conv-transpose.jsonc
@@ -458,6 +458,152 @@
       }
     ]
   },
+  {
+    "name": "ConvTranspose with output channels = 1",
+    "operator": "ConvTranspose",
+    "inputShapeDefinitions": "rankOnly",
+    "opset": { "domain": "", "version": 17 },
+    "attributes": [
+      { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+      { "name": "strides", "data": [2, 2], "type": "ints" }
+    ],
+    "cases": [
+      {
+        "name": "inChannels = 5",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45
+            ],
+            "dims": [1, 5, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8],
+            "dims": [5, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              437, 532, 458, 558, 479, 584, 627, 722, 658, 758, 689, 794, 500, 610, 521, 636, 542, 662, 720, 830, 751,
+              866, 782, 902, 563, 688, 584, 714, 605, 740, 813, 938, 844, 974, 875, 1010
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      },
+      {
+        "name": "inChannels = 6",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9
+            ],
+            "dims": [1, 6, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4],
+            "dims": [6, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              438, 534, 460, 562, 482, 590, 630, 726, 664, 766, 698, 806, 504, 618, 526, 646, 548, 674, 732, 846, 766,
+              886, 800, 926, 570, 702, 592, 730, 614, 758, 834, 966, 868, 1006, 902, 1046
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      },
+      {
+        "name": "inChannels = 7",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+              14, 15, 16, 17, 18
+            ],
+            "dims": [1, 7, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8],
+            "dims": [7, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              488, 594, 515, 628, 542, 662, 700, 806, 741, 854, 782, 902, 569, 696, 596, 730, 623, 764, 823, 950, 864,
+              998, 905, 1046, 650, 798, 677, 832, 704, 866, 946, 1094, 987, 1142, 1028, 1190
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      },
+      {
+        "name": "inChannels = 8",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+              14, 15, 16, 17, 18, 1, 2, 3, 4, 5, 6, 7, 8, 9
+            ],
+            "dims": [1, 8, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4],
+            "dims": [8, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              489, 596, 517, 632, 545, 668, 703, 810, 747, 862, 791, 914, 573, 704, 601, 740, 629, 776, 835, 966, 879,
+              1018, 923, 1070, 657, 812, 685, 848, 713, 884, 967, 1122, 1011, 1174, 1055, 1226
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  },
   {
     "name": "ConvTranspose without bias addition C",
     "operator": "ConvTranspose",