Introduce hybrid (CPU) scan for Parquet read [databricks] #11720

Open
wants to merge 34 commits into base: branch-25.02

Changes from 6 commits (34 commits total)
65de010
Merge C2C code to main
Nov 12, 2024
e6cede2
Update the dependencies in pom.xml
Nov 13, 2024
1e4fc13
revert BD velox hdfs code
Nov 15, 2024
46e19df
fit codes into the new HybridScan hierarchy
sperlingxx Nov 18, 2024
4f2a4d6
refine QueryPlan, RapidsMeta and test suites for HybridScan
sperlingxx Nov 20, 2024
4d52f90
Integrate Hybrid plugin; update IT
Nov 25, 2024
c82eb29
Make Hybrid jar provided scope; Update shim to only applicable for S…
Dec 4, 2024
65b585a
Fix comments
Dec 4, 2024
d214739
Code comment update, a minor change
Dec 5, 2024
e0f1e3b
Fix shim logic
Dec 5, 2024
6331ab8
Fix shims: build for all shims, but report error when Spark is CDH or…
Dec 6, 2024
b1b8481
Remove useless shim code
Dec 9, 2024
dbae63f
IT: add tests for decimal types
Dec 9, 2024
5e972d6
Add checks for Java/Scala version, only supports Java 1.8 and Scala 2.12
Dec 9, 2024
c6fa249
Check datasource is v1
Dec 10, 2024
e95e7cc
Update test case: skip if runtime Spark is Databricks
Dec 11, 2024
092dab8
Update Hybrid Config doc: not all Spark versions are fully tested, on…
Dec 11, 2024
519f33c
Merge branch 'branch-25.02' into merge-c2c
Dec 11, 2024
b3b6f80
some refinement
sperlingxx Dec 13, 2024
f0921a4
fix tests && unsupported types
sperlingxx Dec 17, 2024
dd5d8f9
Add doc for Hybrid execution feature
Dec 23, 2024
6149589
Update doc
Dec 24, 2024
114b93a
Check Hybrid jar in executor
Dec 24, 2024
36d3cdf
Update hybrid-execution.md
winningsix Dec 25, 2024
5bfd763
Remove check for Java versions
Dec 25, 2024
275fa3d
Fix scala 2.13 check failure
Dec 25, 2024
cbb5609
Fix for Scala 2.13 building
Dec 30, 2024
c1df7c4
Fix: specify default value for loadBackend to avoid exception
Jan 13, 2025
99602c5
Update Copyright to add new year 2025
Jan 13, 2025
3d3b172
Fix Databricks building
Jan 14, 2025
d718d8c
Minor format change
Jan 14, 2025
4e79c2b
Merge branch 'branch-25.02' into merge-c2c
Jan 14, 2025
aae45b9
Add shim 354 for Hybrid feature
Jan 14, 2025
4fd5fdb
Fix Databricks building
Jan 14, 2025
16 changes: 16 additions & 0 deletions integration_tests/run_pyspark_from_build.sh
@@ -364,6 +364,22 @@ EOF
fi
export PYSP_TEST_spark_rapids_memory_gpu_allocSize=${PYSP_TEST_spark_rapids_memory_gpu_allocSize:-'1536m'}

# When $LOAD_HYBRID_BACKEND is set, wire up the hybrid backend jars and the related configs
# so that the hybrid backend is active while running the subsequent integration tests.
if [[ "$LOAD_HYBRID_BACKEND" -eq 1 ]]; then
    if [ -z "${HYBRID_BACKEND_JARS}" ]; then
        echo "Error: Environment HYBRID_BACKEND_JARS is not set."
        exit 1
    fi
    export PYSP_TEST_spark_jars="${PYSP_TEST_spark_jars},${HYBRID_BACKEND_JARS//:/,}"
    export PYSP_TEST_spark_rapids_sql_parquet_useHybridReader=true
    export PYSP_TEST_spark_rapids_sql_hybrid_loadBackend=true
    export PYSP_TEST_spark_memory_offHeap_enabled=true
    export PYSP_TEST_spark_memory_offHeap_size=512M
    export PYSP_TEST_spark_rapids_sql_hybrid_load=true
    export PYSP_TEST_spark_gluten_loadLibFromJar=true
fi

SPARK_SHELL_SMOKE_TEST="${SPARK_SHELL_SMOKE_TEST:-0}"
if [[ "${SPARK_SHELL_SMOKE_TEST}" != "0" ]]; then
echo "Running spark-shell smoke test..."
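For context, the run script translates each PYSP_TEST_spark_* variable above into the corresponding Spark config (underscores become dots). Below is a minimal sketch of an equivalent standalone PySpark setup; the jar paths and the com.nvidia.spark.SQLPlugin class are assumptions based on a typical spark-rapids deployment, not something this diff prescribes:

from pyspark.sql import SparkSession

# Sketch only: paths are placeholders; the config keys mirror the block added above.
spark = (
    SparkSession.builder
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.jars", "/path/to/rapids-4-spark.jar,/path/to/rapids-4-spark-hybrid.jar")
    # Load the hybrid backend as an extra plugin at launch time (startup-only).
    .config("spark.rapids.sql.hybrid.loadBackend", "true")
    # Route Parquet scans through the CPU (Velox/Gluten) hybrid reader.
    .config("spark.rapids.sql.parquet.useHybridReader", "true")
    # The native backend works out of off-heap memory.
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "512M")
    .config("spark.gluten.loadLibFromJar", "true")
    .getOrCreate()
)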
1 change: 1 addition & 0 deletions integration_tests/src/main/python/marks.py
@@ -35,3 +35,4 @@
pyarrow_test = pytest.mark.pyarrow_test
datagen_overrides = pytest.mark.datagen_overrides
tz_sensitive_test = pytest.mark.tz_sensitive_test
hybrid_test = pytest.mark.hybrid_test
79 changes: 79 additions & 0 deletions integration_tests/src/main/python/parquet_test.py
@@ -1650,3 +1650,82 @@ def setup_table(spark):
with_cpu_session(lambda spark: setup_table(spark))
assert_gpu_and_cpu_are_equal_collect(lambda spark: spark.read.parquet(data_path).select("p"),
conf={"spark.rapids.sql.columnSizeBytes": "100"})

"""
VeloxScan:
1. DecimalType is NOT fully supported
revans2 marked this conversation as resolved.
Show resolved Hide resolved
2. TimestampType can NOT be the KeyType of MapType
3. NestedMap is disabled because it may produce incorrect result (usually occurring when table is very small)
"""
velox_gens = [
    [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
     string_gen, boolean_gen, date_gen,
     TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc)),
     ArrayGen(byte_gen),
     ArrayGen(long_gen), ArrayGen(string_gen), ArrayGen(date_gen),
     ArrayGen(TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc))),
     ArrayGen(ArrayGen(byte_gen)),
     StructGen([['child0', ArrayGen(byte_gen)], ['child1', byte_gen], ['child2', float_gen],
                ['child3', decimal_gen_64bit]]),
     ArrayGen(StructGen([['child0', string_gen], ['child1', double_gen], ['child2', int_gen]]))
     ],
    [MapGen(f(nullable=False), f()) for f in [
        BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, DateGen]
     ],
    [simple_string_to_string_map_gen,
     MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen),
            max_length=10),
     MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10),
     # TODO: It seems that Velox Parquet Scan can NOT handle nested Map correctly
     # MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)
     ],
]

Collaborator (review comment):

Can we add some tests to validate that predicate push down and filtering is working correctly? It would be nice to have

  1. simple filters
  2. complex filters that are not supported by normal parquet predicate push down (like ORs at the top level instead of ANDs)
  3. filters that have operators in them that Velox does not support, but spark-rapids does.

res-life (Collaborator, Author) replied on Dec 16, 2024:

Discussed internally before; the decision is to put this into a follow-up PR.

res-life (Collaborator, Author):

Follow-up issue filed: #11892
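As a rough illustration of item 1 in the comment above (not part of this PR, and superseded by whatever lands in #11892), a simple-filter round-trip test could reuse the helpers already used in this file; the test name and filter expression are made up:

@pytest.mark.skipif(not is_hybrid_backend_loaded(), reason="HybridScan specialized tests")
@hybrid_test
def test_parquet_read_simple_filter_hybrid(spark_tmp_path):
    # Hypothetical sketch: verify that a filtered read returns the same rows whether the scan
    # runs on the CPU hybrid reader or on the regular path.
    gen_list = [('_c0', int_gen), ('_c1', string_gen)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list, length=4096).write.parquet(data_path),
        conf=rebase_write_corrected_conf)

    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path).filter('_c0 > 0 or _c1 is null'),
        conf={
            'spark.sql.sources.useV1SourceList': 'parquet',
            'spark.rapids.sql.parquet.useHybridReader': 'true',
        })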

@pytest.mark.skipif(not is_hybrid_backend_loaded(), reason="HybridScan specialized tests")
@pytest.mark.parametrize('parquet_gens', velox_gens, ids=idfn)
@pytest.mark.parametrize('gen_rows', [20, 100, 512, 1024, 4096], ids=idfn)
@hybrid_test
def test_parquet_read_round_trip_hybrid(spark_tmp_path, parquet_gens, gen_rows):
    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list, length=gen_rows).write.parquet(data_path),
        conf=rebase_write_corrected_conf)

    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path),
        conf={
            'spark.sql.sources.useV1SourceList': 'parquet',
            'spark.rapids.sql.parquet.useHybridReader': 'true',
        })

# Create scenarios in which the CoalesceConverter must coalesce several input batches by
# adjusting reader_batch_size and coalesced_batch_size, verifying that the converter works
# correctly when coalescing is needed.
@pytest.mark.skipif(not is_hybrid_backend_loaded(), reason="HybridScan specialized tests")
@pytest.mark.parametrize('reader_batch_size', [512, 1024, 2048], ids=idfn)
@pytest.mark.parametrize('coalesced_batch_size', [1 << 25, 1 << 27], ids=idfn)
@pytest.mark.parametrize('gen_rows', [8192, 10000], ids=idfn)
@hybrid_test
def test_parquet_read_round_trip_hybrid_multiple_batches(spark_tmp_path,
                                                         reader_batch_size,
                                                         coalesced_batch_size,
                                                         gen_rows):
    gens = []
    for g in velox_gens:
        gens.extend(g)

    gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)]
    data_path = spark_tmp_path + '/PARQUET_DATA'
    with_cpu_session(
        lambda spark: gen_df(spark, gen_list, length=gen_rows).write.parquet(data_path),
        conf=rebase_write_corrected_conf)

    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: spark.read.parquet(data_path),
        conf={
            'spark.sql.sources.useV1SourceList': 'parquet',
            'spark.rapids.sql.parquet.useHybridReader': 'true',
            'spark.gluten.sql.columnar.maxBatchSize': reader_batch_size,
            'spark.rapids.sql.batchSizeBytes': coalesced_batch_size,
        })
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/spark_session.py
@@ -327,3 +327,6 @@ def is_hive_available():
    if is_at_least_precommit_run():
        return True
    return _spark.conf.get("spark.sql.catalogImplementation") == "hive"

def is_hybrid_backend_loaded():
    return _spark.conf.get("spark.rapids.sql.hybrid.load") == "true"
6 changes: 6 additions & 0 deletions sql-plugin/pom.xml
@@ -102,6 +102,12 @@
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-hybrid_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<!-- #if scala-2.13 --><!--
<profiles>
@@ -2723,6 +2723,12 @@ case class ParquetTableReader(
}

override def close(): Unit = {
debugDumpPrefix.foreach { prefix =>
if (debugDumpAlways) {
val p = DumpUtils.dumpBuffer(conf, buffer, offset, len, prefix, ".parquet")
logWarning(s"Wrote data for $splitsString to $p")
}
}
reader.close()
buffer.close()
}
@@ -34,7 +34,7 @@ import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.types._
import org.apache.spark.sql.vectorized.ColumnarBatch

private class GpuRowToColumnConverter(schema: StructType) extends Serializable {
class GpuRowToColumnConverter(schema: StructType) extends Serializable {
private val converters = schema.fields.map {
f => GpuRowToColumnConverter.getConverterForType(f.dataType, f.nullable)
}
@@ -594,7 +594,8 @@ class RowToColumnarIterator(
numOutputRows: GpuMetric = NoopMetric,
numOutputBatches: GpuMetric = NoopMetric,
streamTime: GpuMetric = NoopMetric,
opTime: GpuMetric = NoopMetric) extends Iterator[ColumnarBatch] {
opTime: GpuMetric = NoopMetric,
acquireGpuTime: GpuMetric = NoopMetric) extends Iterator[ColumnarBatch] {

private val targetSizeBytes = localGoal.targetSizeBytes
private var targetRows = 0
@@ -650,7 +651,11 @@ class RowToColumnarIterator(
// note that TaskContext.get() can return null during unit testing so we wrap it in an
// option here
Option(TaskContext.get())
.foreach(ctx => GpuSemaphore.acquireIfNecessary(ctx))
.foreach { ctx =>
val acquireGpuStart = System.nanoTime()
GpuSemaphore.acquireIfNecessary(ctx)
acquireGpuTime += System.nanoTime() - acquireGpuStart
}

val ret = withResource(new NvtxWithMetrics("RowToColumnar", NvtxColor.GREEN,
opTime)) { _ =>
@@ -31,6 +31,7 @@ import com.nvidia.spark.DFUDFPlugin
import com.nvidia.spark.rapids.RapidsConf.AllowMultipleJars
import com.nvidia.spark.rapids.RapidsPluginUtils.buildInfoEvent
import com.nvidia.spark.rapids.filecache.{FileCache, FileCacheLocalityManager, FileCacheLocalityMsg}
import com.nvidia.spark.rapids.hybrid.HybridPluginWrapper
import com.nvidia.spark.rapids.jni.GpuTimeZoneDB
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
import org.apache.commons.lang3.exception.ExceptionUtils
@@ -98,7 +99,8 @@ object RapidsPluginUtils extends Logging {
s"private revision ${privateRev}")
}

val extraPlugins = getExtraPlugins
val extraPlugins = getExtraPlugins ++
Seq(new HybridPluginWrapper().asInstanceOf[SparkPlugin])

def logPluginMode(conf: RapidsConf): Unit = {
if (conf.isSqlEnabled && conf.isSqlExecuteOnGPU) {
18 changes: 18 additions & 0 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -28,6 +28,7 @@ import com.nvidia.spark.rapids.lore.{LoreId, OutputLoreId}
import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.network.util.{ByteUnit, JavaUtils}
import org.apache.spark.rapids.hybrid.HybridBackend
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.rapids.RapidsPrivateUtil
@@ -1688,6 +1689,19 @@ val GPU_COREDUMP_PIPE_PATTERN = conf("spark.rapids.gpu.coreDump.pipePattern")
.booleanConf
.createWithDefault(false)

val HYBRID_PARQUET_READER = conf("spark.rapids.sql.parquet.useHybridReader")
.doc("Use HybridScan to read Parquet data via CPUs")
.internal()
.booleanConf
.createWithDefault(false)

// spark.rapids.sql.hybrid.loadBackend defined at HybridPluginWrapper of spark-rapids-private
val LOAD_HYBRID_BACKEND = conf(HybridBackend.LOAD_BACKEND_KEY)
.doc("Load hybrid backend as an extra plugin of spark-rapids during launch time")
.startupOnly()
.booleanConf
.createWithDefault(false)

val HASH_AGG_REPLACE_MODE = conf("spark.rapids.sql.hashAgg.replaceMode")
.doc("Only when hash aggregate exec has these modes (\"all\" by default): " +
"\"all\" (try to replace all aggregates, default), " +
@@ -2829,6 +2843,10 @@ class RapidsConf(conf: Map[String, String]) extends Logging {

lazy val avroDebugDumpAlways: Boolean = get(AVRO_DEBUG_DUMP_ALWAYS)

lazy val useHybridParquetReader: Boolean = get(HYBRID_PARQUET_READER)

lazy val loadHybridBackend: Boolean = get(LOAD_HYBRID_BACKEND)

lazy val hashAggReplaceMode: String = get(HASH_AGG_REPLACE_MODE)

lazy val partialMergeDistinctEnabled: Boolean = get(PARTIAL_MERGE_DISTINCT_ENABLED)
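A note on how the two new configs are intended to interact: spark.rapids.sql.hybrid.loadBackend is declared startupOnly, while spark.rapids.sql.parquet.useHybridReader is an ordinary (internal) SQL conf. Below is a hedged PySpark sketch, assuming the usual RapidsConf behavior that non-startup confs can be changed on a live session; the table path is illustrative:

# The backend must be loaded at launch time, e.g.
#   spark-submit --conf spark.rapids.sql.hybrid.loadBackend=true ...
# whereas the reader flag can be toggled on an existing session:
spark.conf.set("spark.rapids.sql.parquet.useHybridReader", "true")
spark.read.parquet("/path/to/table").count()    # this scan goes through the CPU hybrid reader
spark.conf.set("spark.rapids.sql.parquet.useHybridReader", "false")
spark.read.parquet("/path/to/table").count()    # this scan uses the regular GPU Parquet reader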
@@ -0,0 +1,154 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.rapids.hybrid

import ai.rapids.cudf.NvtxColor
import com.nvidia.spark.rapids.{AcquireFailed, GpuColumnVector, GpuMetric, GpuSemaphore, NvtxWithMetrics}
import com.nvidia.spark.rapids.Arm._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.hybrid.{CoalesceBatchConverter => NativeConverter}
import com.nvidia.spark.rapids.hybrid.RapidsHostColumn

import org.apache.spark.TaskContext
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

class CoalesceConvertIterator(veloxIter: Iterator[ColumnarBatch],
targetBatchSizeInBytes: Int,
schema: StructType,
metrics: Map[String, GpuMetric])
extends Iterator[Array[RapidsHostColumn]] with Logging {

private var converterImpl: Option[NativeConverter] = None

private var srcExhausted = false

private val converterMetrics = Map(
"C2COutputSize" -> GpuMetric.unwrap(metrics("C2COutputSize")))

override def hasNext(): Boolean = {
// either the converter still holds data or the upstream iterator has more data
val ret = withResource(new NvtxWithMetrics("VeloxC2CHasNext", NvtxColor.WHITE,
metrics("C2CStreamTime"))) { _ =>
converterImpl.exists(c => c.isDeckFilled || c.hasProceedingBuilders) ||
(!srcExhausted && veloxIter.hasNext)
}
if (!ret) {
if (!srcExhausted) {
srcExhausted = true
}
converterImpl.foreach { c =>
// VeloxBatchConverter collects the eclipsedTime of C2C_Conversion by itself.
// Here we fetch the final value before closing it.
metrics("C2CTime") += c.eclipsedNanoSecond
// release the native instance once the upstream iterator has been exhausted
val detailedMetrics = c.close()
val tID = TaskContext.get().taskAttemptId()
logError(s"task[$tID] CoalesceNativeConverter finished:\n$detailedMetrics")
converterImpl = None
}
}
ret
}

override def next(): Array[RapidsHostColumn] = {
val ntvx = new NvtxWithMetrics("VeloxC2CNext", NvtxColor.YELLOW, metrics("C2CStreamTime"))
withResource(ntvx) { _ =>
while (true) {
converterImpl.foreach { impl =>
val needFlush = if (veloxIter.hasNext) {
// The only condition leading to a non-empty deck is that the target buffers were left
// unset after the previous flush
if (impl.isDeckFilled) {
impl.setupTargetVectors()
}
// try to append the batch; if the append fails, the batch is placed on the deck
metrics("CpuReaderBatches") += 1
!impl.tryAppendBatch(veloxIter.next())
} else {
srcExhausted = true
true
}
if (needFlush) {
metrics("CoalescedBatches") += 1
val rapidsHostBatch = impl.flush()
// It is essential to check and tidy up the deck right after flushing, because the next
// call of veloxIter.hasNext may release the batch whose reference the deck still holds.
if (impl.isDeckFilled) {
impl.setupTargetVectors()
}
return rapidsHostBatch
}
}
if (converterImpl.isEmpty) {
val converter = NativeConverter(
veloxIter.next(),
targetBatchSizeInBytes, schema, converterMetrics
)
converterImpl = Some(converter)
}
}

throw new RuntimeException("should NOT reach this line")
}
}

}

object CoalesceConvertIterator extends Logging {

def hostToDevice(hostIter: Iterator[Array[RapidsHostColumn]],
outputAttr: Seq[Attribute],
metrics: Map[String, GpuMetric]): Iterator[ColumnarBatch] = {
val dataTypes = outputAttr.map(_.dataType).toArray

hostIter.map { hostVectors =>
Option(TaskContext.get()).foreach { ctx =>
GpuSemaphore.tryAcquire(ctx) match {
case AcquireFailed(_) =>
withResource(new NvtxWithMetrics("gpuAcquireC2C", NvtxColor.GREEN,
metrics("GpuAcquireTime"))) { _ =>
GpuSemaphore.acquireIfNecessary(ctx)
}
case _ =>
}
}

val deviceVectors: Array[ColumnVector] = hostVectors.zip(dataTypes).safeMap {
case (RapidsHostColumn(hcv, isPinned, totalBytes), dt) =>
val nvtxMetric = if (isPinned) {
metrics("PinnedH2DSize") += totalBytes
new NvtxWithMetrics("pinnedH2D", NvtxColor.DARK_GREEN, metrics("PinnedH2DTime"))
} else {
metrics("PageableH2DSize") += totalBytes
new NvtxWithMetrics("PageableH2D", NvtxColor.GREEN, metrics("PageableH2DTime"))
}
withResource(hcv) { _ =>
withResource(nvtxMetric) { _ =>
GpuColumnVector.from(hcv.copyToDevice(), dt)
}
}
}

new ColumnarBatch(deviceVectors, hostVectors.head.vector.getRowCount.toInt)
}
}

}