diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
index 8fb80735bd..0b979c9a14 100644
--- a/.github/scripts/fbgemm_gpu_build.bash
+++ b/.github/scripts/fbgemm_gpu_build.bash
@@ -159,17 +159,10 @@ __configure_fbgemm_gpu_build_rocm () {
   print_exec conda env config vars set ${env_prefix} PYTORCH_ROCM_ARCH="${arch_list}"
 
   echo "[BUILD] Setting ROCm build args ..."
-  # shellcheck disable=SC2155
-  local cxx_flags="-DTORCH_USE_HIP_DSA"
-
   build_args=(
     --package_variant=rocm
     # HIP_ROOT_DIR now required for HIP to be correctly detected by CMake
     -DHIP_ROOT_DIR=/opt/rocm
-    # Enable device-side assertions in HIP
-    # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
-    -DCMAKE_C_FLAGS="'${cxx_flags}'"
-    -DCMAKE_CXX_FLAGS="'${cxx_flags}'"
   )
 }
 
@@ -251,26 +244,14 @@ __configure_fbgemm_gpu_build_genai () {
   done
 }
 
+# shellcheck disable=SC2120
 __configure_fbgemm_gpu_build () {
-  local fbgemm_variant="$1"
-  local fbgemm_variant_targets="$2"
-  if [ "$fbgemm_variant" == "" ]; then
-    echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
-    echo "Example(s):"
-    echo "    ${FUNCNAME[0]} cpu                          # CPU-only variant using Clang"
-    echo "    ${FUNCNAME[0]} cuda                         # CUDA variant for default target(s)"
-    echo "    ${FUNCNAME[0]} cuda '7.0;8.0'               # CUDA variant for custom target(s)"
-    echo "    ${FUNCNAME[0]} rocm                         # ROCm variant for default target(s)"
-    echo "    ${FUNCNAME[0]} rocm 'gfx906;gfx908;gfx90a'  # ROCm variant for custom target(s)"
-    return 1
-  else
-    echo "################################################################################"
-    echo "# Configure FBGEMM-GPU Build"
-    echo "#"
-    echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
-    echo "################################################################################"
-    echo ""
-  fi
+  echo "################################################################################"
+  echo "# Configure FBGEMM-GPU Build"
+  echo "#"
+  echo "# [$(date --utc +%FT%T.%3NZ)] + ${FUNCNAME[0]} ${*}"
+  echo "################################################################################"
+  echo ""
 
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
@@ -302,6 +283,13 @@ __configure_fbgemm_gpu_build () {
     --verbose
   )
 
+  # Pass --debug for non-release channels or when BUILD_DEBUG=1 (unset/empty BUILD_DEBUG counts as 0)
+  if [ "$fbgemm_release_channel" != "release" ] || [ "${BUILD_DEBUG:-0}" -eq 1 ]; then
+    build_args+=(
+      --debug
+    )
+  fi
+
   # shellcheck disable=SC2145
   echo "[BUILD] FBGEMM_GPU build arguments have been set:  ${build_args[@]}"
 }
@@ -502,8 +490,8 @@ run_fbgemm_gpu_postbuild_checks () {
     return 1
   fi
 
-  __print_library_infos
-  __verify_library_symbols
+  __print_library_infos     || return 1
+  __verify_library_symbols  || return 1
 }
 
 ################################################################################
@@ -531,7 +519,7 @@ build_fbgemm_gpu_package () {
 
   # Set up and configure the build
   __build_fbgemm_gpu_common_pre_steps || return 1
-  __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1
+  __configure_fbgemm_gpu_build        || return 1
 
   echo "################################################################################"
   echo "# Build FBGEMM-GPU Package (Wheel)"
@@ -596,7 +584,7 @@ build_fbgemm_gpu_install () {
 
   # Set up and configure the build
   __build_fbgemm_gpu_common_pre_steps || return 1
-  __configure_fbgemm_gpu_build "${fbgemm_variant}" "${fbgemm_variant_targets}" || return 1
+  __configure_fbgemm_gpu_build        || return 1
 
   echo "################################################################################"
   echo "# Build + Install FBGEMM-GPU Package"
diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash
index 8bf477b1f2..6c93c53d02 100644
--- a/.github/scripts/fbgemm_gpu_install.bash
+++ b/.github/scripts/fbgemm_gpu_install.bash
@@ -31,19 +31,6 @@ __install_print_dependencies_info () {
   echo ""
 }
 
-__install_list_subpackages_info () {
-  # shellcheck disable=SC2086,SC2155
-  local fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))")
-  # shellcheck disable=SC2086,SC2155
-  local experimental_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu.experimental; print(dir(fbgemm_gpu.experimental))")
-  echo "################################################################################"
-  echo "[CHECK] FBGEMM_GPU Experimental Packages"
-  echo "[CHECK] fbgemm_gpu: ${fbgemm_gpu_packages}"
-  echo "[CHECK] fbgemm_gpu.experimental: ${experimental_packages}"
-  echo "################################################################################"
-  echo ""
-}
-
 __install_fetch_version_and_variant_info () {
   echo "[INSTALL] Checking imports and symbols ..."
   (test_python_import_package "${env_name}" fbgemm_gpu) || return 1
@@ -62,6 +49,23 @@ __install_fetch_version_and_variant_info () {
   echo ""
 }
 
+__install_list_subpackages_info () {
+  # Print dir() of fbgemm_gpu and, for the cuda/genai variants only, of fbgemm_gpu.experimental.
+  # experimental_packages is always made local so other variants cannot print an outer-scope value.
+  # shellcheck disable=SC2086,SC2155
+  local experimental_packages="" fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))")
+  if [ "$installed_fbgemm_gpu_variant" == "cuda" ] || [ "$installed_fbgemm_gpu_variant" == "genai" ]; then
+    # shellcheck disable=SC2086,SC2155
+    local experimental_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu.experimental; print(dir(fbgemm_gpu.experimental))")
+  fi
+  echo "################################################################################"
+  echo "[CHECK] FBGEMM_GPU Experimental Packages"
+  echo "[CHECK] fbgemm_gpu: ${fbgemm_gpu_packages}"
+  echo "[CHECK] fbgemm_gpu.experimental: ${experimental_packages}"
+  echo "################################################################################"
+  echo ""
+}
+
 __install_check_operator_registrations () {
   echo "[INSTALL] Check for operator registrations ..."
   if [ "$installed_fbgemm_gpu_variant" == "genai" ]; then
@@ -103,12 +107,12 @@ __fbgemm_gpu_post_install_checks () {
   # Print PyTorch and CUDA versions for sanity check
   __install_print_dependencies_info
 
-  # List out FBGEMM_GPU subpackages
-  __install_list_subpackages_info
-
   # Fetch the version and variant info from the package
   __install_fetch_version_and_variant_info
 
+  # List out FBGEMM_GPU subpackages
+  __install_list_subpackages_info
+
   echo "[INSTALL] Check for installation of Python sources ..."
   if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then
     (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 4b8a535447..7e167698db 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -74,6 +74,10 @@ __configure_fbgemm_gpu_test_cpu () {
 }
 
 __configure_fbgemm_gpu_test_cuda () {
+  # Disabled by default; enable for debugging
+  # shellcheck disable=SC2086
+  # print_exec conda env config vars set ${env_prefix} CUDA_LAUNCH_BLOCKING=1
+
   ignored_tests=(
     ./tbe/ssd/ssd_split_table_batched_embeddings_test.py
   )
@@ -407,6 +411,11 @@ test_fbgemm_gpu_setup_and_pip_install () {
     echo "# Run Result              : $([ $retcode -eq 0 ] && echo "PASSED" || echo "FAILED")"
     echo "################################################################################"
 
+    if [ $retcode -eq 0 ]; then
+      # Clean out environment only if there were no errors
+      conda remove -n "$env_name" -y --all
+    fi
+
     cd - || return 1
     return $retcode
   }
diff --git a/.github/scripts/nova_dir.bash b/.github/scripts/nova_dir.bash
index 8d6c6f0eb8..a6ad595f26 100644
--- a/.github/scripts/nova_dir.bash
+++ b/.github/scripts/nova_dir.bash
@@ -17,4 +17,4 @@ export BUILD_FROM_NOVA=1
 if [[ "$CONDA_ENV" != "" ]]; then export CONDA_RUN="conda run --no-capture-output -p ${CONDA_ENV}" && echo "$CONDA_RUN"; fi
 if [[ "$CU_VERSION" == "cu118" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0' && echo "$TORCH_CUDA_ARCH_LIST"; fi
 if [[ "$CU_VERSION" == "cu121" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0;9.0;9.0a' && echo "$TORCH_CUDA_ARCH_LIST"; fi
-if [[ "$CU_VERSION" == "cu124" ]]; then export TORCH_CUDA_ARCH_LIST='8.0;9.0;9.0a' && echo "$TORCH_CUDA_ARCH_LIST"; fi
+if [[ "$CU_VERSION" == "cu124" ]]; then export TORCH_CUDA_ARCH_LIST='7.0;8.0;9.0;9.0a' && echo "$TORCH_CUDA_ARCH_LIST"; fi
diff --git a/fbgemm_gpu/setup.py b/fbgemm_gpu/setup.py
index a27949609d..cdb131f558 100644
--- a/fbgemm_gpu/setup.py
+++ b/fbgemm_gpu/setup.py
@@ -39,6 +39,11 @@ def from_args(cls, argv: List[str]):
             action="store_true",
             help="Print verbose logs during the build.",
         )
+        parser.add_argument(
+            "--debug",
+            action="store_true",
+            help="Enable DEBUG features in compilation such as PyTorch device-side assertions.",
+        )
         parser.add_argument(
             "--dryrun",
             action="store_true",
@@ -237,7 +242,7 @@ def _get_cxx11_abi():
             _get_cxx11_abi(),
         ]
 
-        cxx_args = []
+        cxx_flags = []
 
         if self.args.verbose:
             print("[SETUP.PY] Building in VERBOSE mode ...")
@@ -245,6 +250,11 @@ def _get_cxx11_abi():
                 ["-DCMAKE_VERBOSE_MAKEFILE=ON", "-DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE"]
             )
 
+        if self.args.debug:
+            # Enable device-side assertions in CUDA and HIP
+            # https://stackoverflow.com/questions/44284275/passing-compiler-options-in-cmake-command-line
+            cxx_flags.extend(["-DTORCH_USE_CUDA_DSA", "-DTORCH_USE_HIP_DSA"])
+
         if self.args.package_variant == "cpu":
             print("[SETUP.PY] Building the CPU-ONLY variant of FBGEMM_GPU ...")
             cmake_args.append("-DFBGEMM_CPU_ONLY=ON")
@@ -258,7 +268,7 @@ def _get_cxx11_abi():
 
         if self.args.nccl_lib_path:
             nccl_root = os.path.dirname(os.path.dirname(self.args.nccl_lib_path))
-            cxx_args.extend([f"-L{nccl_root}/lib"])
+            cxx_flags.extend([f"-L{nccl_root}/lib"])
             cmake_args.extend(
                 [
                     f"-DNCCL_INCLUDE_DIRS={nccl_root}/include",
@@ -270,7 +280,7 @@ def _get_cxx11_abi():
             print("[SETUP.PY] Setting CMake flags ...")
             path = self.args.cxxprefix
 
-            cxx_args.extend(
+            cxx_flags.extend(
                 [
                     "-fopenmp=libgomp",
                     "-stdlib=libstdc++",
@@ -286,8 +296,8 @@ def _get_cxx11_abi():
 
         cmake_args.extend(
             [
-                f"-DCMAKE_C_FLAGS='{' '.join(cxx_args)}'",
-                f"-DCMAKE_CXX_FLAGS='{' '.join(cxx_args)}'",
+                f"-DCMAKE_C_FLAGS='{' '.join(cxx_flags)}'",
+                f"-DCMAKE_CXX_FLAGS='{' '.join(cxx_flags)}'",
             ]
         )