diff --git a/csrc/scheduler/vectorize_helper.cpp b/csrc/scheduler/vectorize_helper.cpp
index 3762f7ce219..21f72b66675 100644
--- a/csrc/scheduler/vectorize_helper.cpp
+++ b/csrc/scheduler/vectorize_helper.cpp
@@ -398,24 +398,18 @@ std::vector<IterDomain*> ContiguousInnerDimensionsMapper::projectId(
 
     frontier.erase(frontier.begin(), it);
     if (recording_) {
-      if (it+1 == frontier.end()) {
-        // FIXME: real analysis is needed here
-        // TODO: test on a single sided pad.
-        auto consumer_factor = getProjectedExtent(id_from);
-        auto comp = [](Val* factor, Val* extent) {
-          return SimplifyingIrBuilder::whereExpr(
-              SimplifyingIrBuilder::eqExpr(extent, extent->container()->zeroVal()),
-              factor,
-              SimplifyingIrBuilder::gcdExpr(factor, extent));
-        };
-        consumer_factor = comp(consumer_factor, resize_op->leftExpand());
-        consumer_factor = comp(consumer_factor, resize_op->rightExpand());
-        addProjectedExtent(id_to, consumer_factor);
-      } else {
-        // pad vectorization can only be done at fastest dimension, project it to 0 I believe would avoid that.
-        // FIXME: add a test case for me
-        addProjectedExtent(id_to, id_to->container()->zeroVal());
-      }
+      // FIXME: real analysis is needed here
+      // TODO: test on a single-sided pad.
+      auto consumer_factor = getProjectedExtent(id_from);
+      auto comp = [](Val* factor, Val* extent) {
+        return SimplifyingIrBuilder::whereExpr(
+            SimplifyingIrBuilder::eqExpr(extent, extent->container()->zeroVal()),
+            factor,
+            SimplifyingIrBuilder::gcdExpr(factor, extent));
+      };
+      consumer_factor = comp(consumer_factor, resize_op->leftExpand());
+      consumer_factor = comp(consumer_factor, resize_op->rightExpand());
+      addProjectedExtent(id_to, consumer_factor);
     }
   } else {
     frontier.erase(frontier.begin(), it + 1);
diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index e7ea0f6b651..5910c828739 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4146,7 +4146,7 @@ TEST_F(ResizeTest, UnrollNonInnermost) {
   auto tv0 = makeContigConcreteTensor(shape);
   fusion.addInput(tv0);
 
-  auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L), IrBuilder::create<Val>(0L), IrBuilder::create<Val>(0L)});
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(0L), IrBuilder::create<Val>(0L), IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
   fusion.addOutput(tv1);
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
@@ -4156,7 +4156,7 @@ TEST_F(ResizeTest, UnrollNonInnermost) {
   FusionExecutorCache executor_cache(std::move(fusion_ptr));
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
 
-  auto ref = at::pad(t0, {4, 4, 0, 0});
+  auto ref = at::pad(t0, {0, 0, 4, 4});
 
   NVF_CHECK(ref.equal(cg_outputs[0]));
 }
@@ -4190,4 +4190,37 @@ TEST_F(ResizeTest, PadAndCacheUses) {
   auto ref_1 = at::relu(t0);
   NVF_CHECK(ref_1.equal(cg_outputs[1]));
 }
+
+TEST_F(ResizeTest, Playground) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  auto& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  const std::vector<int64_t> shape({1024L * 1024L});
+
+  // Using a concrete tensor to avoid dynamic reshape
+  auto tv0 = makeContigConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = pad(tv0, {IrBuilder::create<Val>(4L), IrBuilder::create<Val>(4L)});
+  fusion.addOutput(tv1);
+  auto tv2 = slice(
+      tv0,
+      {{IrBuilder::create<Val>(2L),
+        sub(tv0->axis(0)->extent(), IrBuilder::create<Val>(2L))}});
+  fusion.addOutput(tv2);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn(shape, options);
+  std::vector<c10::IValue> aten_inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
+
+  auto ref_0 = at::pad(t0, {4, 4});
+  NVF_CHECK(ref_0.equal(cg_outputs[0]));
+
+  auto ref_1 = t0.index({at::indexing::Slice(2, shape[0] - 2)});
+  NVF_CHECK(ref_1.equal(cg_outputs[1]));
+}
 } // namespace nvfuser
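
Note on the retained branch in projectId: the consumer's projected vectorization factor is folded over both pad extents through a where/gcd chain, so a zero-extent side leaves the factor untouched while a nonzero pad clamps it to a common divisor. Below is a minimal standalone sketch of that arithmetic (plain C++17, not nvFuser API; projectThroughResize and the sample values are hypothetical stand-ins for the whereExpr/eqExpr/gcdExpr expressions built above):

#include <cstdint>
#include <iostream>
#include <numeric>

// Mirrors comp() in the diff: a zero pad extent leaves the factor
// unchanged; otherwise the factor is clamped to gcd(factor, extent).
int64_t projectThroughResize(
    int64_t factor,
    int64_t left_expand,
    int64_t right_expand) {
  auto comp = [](int64_t f, int64_t extent) {
    return extent == 0 ? f : std::gcd(f, extent);
  };
  return comp(comp(factor, left_expand), right_expand);
}

int main() {
  // pad(tv0, {4, 4}) with an inherited factor of 8 -> gcd(8, 4) = 4.
  std::cout << projectThroughResize(8, 4, 4) << "\n";
  // Single-sided pad {0, 6}: the zero side is skipped -> gcd(8, 6) = 2.
  std::cout << projectThroughResize(8, 0, 6) << "\n";
  return 0;
}

Under this rule, the {4, 4} pad in ResizeTest.Playground keeps a nonzero projected factor instead of being forced to zero, which is what the deleted else-branch did for resizes not at the innermost position. Relatedly, at::pad applies pad widths to the last dimension first, so the UnrollNonInnermost fix ({4, 4, 0, 0} -> {0, 0, 4, 4}) moves the padding from the innermost axis to the outer one, matching the test's name.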