diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index c209a7109cd..f049bad0a25 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-24.02 + - branch-24.04 types: [closed] jobs: @@ -29,13 +29,13 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: branch-24.02 # force to fetch from latest upstream instead of PR ref + ref: branch-24.04 # force to fetch from latest upstream instead of PR ref - name: auto-merge job uses: ./.github/workflows/auto-merge env: OWNER: NVIDIA REPO_NAME: spark-rapids - HEAD: branch-24.02 - BASE: branch-24.04 + HEAD: branch-24.04 + BASE: branch-24.06 AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 9ef932d92f5..8cd7a2fe6a6 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -56,7 +56,6 @@ jobs: wbo4958,\ wjxiz1992,\ sperlingxx,\ - pxLi,\ hyperbolic2346,\ gerashegalov,\ ttnghia,\ @@ -72,6 +71,7 @@ jobs: winningsix,\ viadea,\ yinqingh,\ + parthosa,\ ', format('{0},', github.actor)) && github.event.comment.body == 'build' steps: - name: Check if comment is issued by authorized person diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 7d00dca36c5..ad5c3647398 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -176,7 +176,7 @@ jobs: max_retry=3; delay=30; i=1 while true; do mvn package \ - -pl integration_tests,tests -am -P 'individual,pre-merge' \ + -pl integration_tests,tests,tools -am -P 'individual,pre-merge' \ -Dbuildver=${{ matrix.spark-version }} -Dmaven.scalastyle.skip=true \ -Drat.skip=true ${{ env.COMMON_MVN_FLAGS }} && break || { if [[ $i -le $max_retry ]]; then @@ -235,7 +235,7 @@ jobs: max_retry=3; delay=30; i=1 while true; do mvn package \ - -pl integration_tests,tests -am -P 'individual,pre-merge' \ + -pl integration_tests,tests,tools -am -P 'individual,pre-merge' \ -Dbuildver=${{ matrix.spark-version }} -Dmaven.scalastyle.skip=true \ -Drat.skip=true ${{ env.COMMON_MVN_FLAGS }} && break || { if [[ $i -le $max_retry ]]; then diff --git a/CHANGELOG.md b/CHANGELOG.md index 42324d41aa8..35e8d00ebe2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,173 @@ # Change log -Generated on 2024-03-06 +Generated on 2024-04-15 + +## Release 24.04 + +### Features +||| +|:---|:---| +|[#10263](https://github.com/NVIDIA/spark-rapids/issues/10263)|[FEA] Add support for reading JSON containing structs where rows are not consistent| +|[#10436](https://github.com/NVIDIA/spark-rapids/issues/10436)|[FEA] Move Spark 3.5.1 out of snapshot once released| +|[#10430](https://github.com/NVIDIA/spark-rapids/issues/10430)|[FEA] Error out when running on an unsupported GPU architecture| +|[#9750](https://github.com/NVIDIA/spark-rapids/issues/9750)|[FEA] Review `JsonToStruct` and `JsonScan` and consolidate some testing and implementation| +|[#8680](https://github.com/NVIDIA/spark-rapids/issues/8680)|[AUDIT][SPARK-42779][SQL] Allow V2 writes to indicate advisory shuffle partition size| 
+|[#10429](https://github.com/NVIDIA/spark-rapids/issues/10429)|[FEA] Drop support for Databricks 10.4 ML LTS| +|[#10334](https://github.com/NVIDIA/spark-rapids/issues/10334)|[FEA] Turn on memory limits for parquet reader| +|[#10344](https://github.com/NVIDIA/spark-rapids/issues/10344)|[FEA] support barrier mode for mapInPandas/mapInArrow| + +### Performance +||| +|:---|:---| +|[#10578](https://github.com/NVIDIA/spark-rapids/issues/10578)|[FEA] Support project expression rewrite for the case ```stringinstr(str_col, substr) > 0``` to ```contains(str_col, substr)```| +|[#10570](https://github.com/NVIDIA/spark-rapids/issues/10570)|[FEA] See if we can optimize sort for a single batch| +|[#10531](https://github.com/NVIDIA/spark-rapids/issues/10531)|[FEA] Support "WindowGroupLimit" optimization on GPU for Databricks 13.3 ML LTS+| +|[#5553](https://github.com/NVIDIA/spark-rapids/issues/5553)|[FEA][Audit] - Push down StringEndsWith/Contains to Parquet | +|[#8208](https://github.com/NVIDIA/spark-rapids/issues/8208)|[FEA][AUDIT][SPARK-37099][SQL] Introduce the group limit of Window for rank-based filter to optimize top-k computation| +|[#10249](https://github.com/NVIDIA/spark-rapids/issues/10249)|[FEA] Support common subexpression elimination for expand operator| +|[#10301](https://github.com/NVIDIA/spark-rapids/issues/10301)|[FEA] Improve performance of from_json| + +### Bugs Fixed +||| +|:---|:---| +|[#10665](https://github.com/NVIDIA/spark-rapids/issues/10665)|[BUG] Need to update private jar's version to v24.04.1 for spark-rapids v24.04.0 release| +|[#10589](https://github.com/NVIDIA/spark-rapids/issues/10589)|[BUG] ZSTD version mismatch in integration tests| +|[#10255](https://github.com/NVIDIA/spark-rapids/issues/10255)|[BUG] parquet_tests are skipped on Dataproc CI| +|[#10624](https://github.com/NVIDIA/spark-rapids/issues/10624)|[BUG] Deploy script "gpg:sign-and-deploy-file failed: 401 Unauthorized| +|[#10631](https://github.com/NVIDIA/spark-rapids/issues/10631)|[BUG] pending `BlockState` leaks blocks if the shuffle read doesn't finish successfully| +|[#10349](https://github.com/NVIDIA/spark-rapids/issues/10349)|[BUG]Test in json_test.py failed: test_from_json_struct_decimal| +|[#9033](https://github.com/NVIDIA/spark-rapids/issues/9033)|[BUG] GpuGetJsonObject does not expand escaped characters| +|[#10216](https://github.com/NVIDIA/spark-rapids/issues/10216)|[BUG] GetJsonObject fails at spark unit test $.store.book[*].reader| +|[#10217](https://github.com/NVIDIA/spark-rapids/issues/10217)|[BUG] GetJsonObject fails at spark unit test $.store.basket[0][*].b| +|[#10537](https://github.com/NVIDIA/spark-rapids/issues/10537)|[BUG] GetJsonObject throws exception when json path contains a name starting with `'`| +|[#10194](https://github.com/NVIDIA/spark-rapids/issues/10194)|[BUG] GetJsonObject does not validate the input is JSON in the same way as Spark| +|[#10196](https://github.com/NVIDIA/spark-rapids/issues/10196)|[BUG] GetJsonObject does not process escape sequences in returned strings or queries| +|[#10212](https://github.com/NVIDIA/spark-rapids/issues/10212)|[BUG] GetJsonObject should return null for invalid query instead of throwing an exception| +|[#10218](https://github.com/NVIDIA/spark-rapids/issues/10218)|[BUG] GetJsonObject does not normalize non-string output| +|[#10591](https://github.com/NVIDIA/spark-rapids/issues/10591)|[BUG] `test_column_add_after_partition` failed on EGX Standalone cluster| +|[#10277](https://github.com/NVIDIA/spark-rapids/issues/10277)|Add monitoring for GH action 
deprecations| +|[#10627](https://github.com/NVIDIA/spark-rapids/issues/10627)|[BUG] Integration tests FAILED on: "nvCOMP 2.3/2.4 or newer is required for Zstandard compression"| +|[#10585](https://github.com/NVIDIA/spark-rapids/issues/10585)|[BUG]Test simple pinned blocking alloc Failed nightly tests| +|[#10586](https://github.com/NVIDIA/spark-rapids/issues/10586)|[BUG] YARN EGX IT build failing parquet_testing_test can't find file| +|[#10133](https://github.com/NVIDIA/spark-rapids/issues/10133)|[BUG] test_hash_reduction_collect_set_on_nested_array_type failed in a distributed environment| +|[#10378](https://github.com/NVIDIA/spark-rapids/issues/10378)|[BUG] `test_range_running_window_float_decimal_sum_runs_batched` fails intermittently| +|[#10486](https://github.com/NVIDIA/spark-rapids/issues/10486)|[BUG] StructsToJson does not fall back to the CPU for unsupported timeZone options| +|[#10484](https://github.com/NVIDIA/spark-rapids/issues/10484)|[BUG] JsonToStructs does not fallback when columnNameOfCorruptRecord is set| +|[#10460](https://github.com/NVIDIA/spark-rapids/issues/10460)|[BUG] JsonToStructs should reject float numbers for integer types| +|[#10468](https://github.com/NVIDIA/spark-rapids/issues/10468)|[BUG] JsonToStructs and ScanJson should not treat quoted strings as valid integers| +|[#10470](https://github.com/NVIDIA/spark-rapids/issues/10470)|[BUG] ScanJson and JsonToStructs should support parsing quoted decimal strings that are formatted by local (at least for en-US)| +|[#10494](https://github.com/NVIDIA/spark-rapids/issues/10494)|[BUG] JsonToStructs parses INF wrong when nonNumericNumbers is enabled| +|[#10456](https://github.com/NVIDIA/spark-rapids/issues/10456)|[BUG] allowNonNumericNumbers OFF supported for JSON Scan, but not JsonToStructs| +|[#10467](https://github.com/NVIDIA/spark-rapids/issues/10467)|[BUG] JsonToStructs should reject 1. as a valid number| +|[#10469](https://github.com/NVIDIA/spark-rapids/issues/10469)|[BUG] ScanJson should accept "1." 
as a valid Decimal| +|[#10559](https://github.com/NVIDIA/spark-rapids/issues/10559)|[BUG] test_spark_from_json_date_with_format FAILED on : Part of the plan is not columnar class org.apache.spark.sql.execution.ProjectExec| +|[#10209](https://github.com/NVIDIA/spark-rapids/issues/10209)|[BUG] Test failure hash_aggregate_test.py::test_hash_reduction_collect_set_on_nested_array_type DATAGEN_SEED=1705515231| +|[#10319](https://github.com/NVIDIA/spark-rapids/issues/10319)|[BUG] Shuffled join OOM with 4GB of GPU memory| +|[#10507](https://github.com/NVIDIA/spark-rapids/issues/10507)|[BUG] regexp_test.py FAILED test_regexp_extract_all_idx_positive[DATAGEN_SEED=1709054829, INJECT_OOM]| +|[#10527](https://github.com/NVIDIA/spark-rapids/issues/10527)|[BUG] Build on Databricks failed with GpuGetJsonObject.scala:19: object parsing is not a member of package util| +|[#10509](https://github.com/NVIDIA/spark-rapids/issues/10509)|[BUG] scalar leaks when running nds query51| +|[#10214](https://github.com/NVIDIA/spark-rapids/issues/10214)|[BUG] GetJsonObject does not support unquoted array like notation| +|[#10215](https://github.com/NVIDIA/spark-rapids/issues/10215)|[BUG] GetJsonObject removes leading space characters| +|[#10213](https://github.com/NVIDIA/spark-rapids/issues/10213)|[BUG] GetJsonObject supports array index notation without a root| +|[#10452](https://github.com/NVIDIA/spark-rapids/issues/10452)|[BUG] JsonScan and from_json share fallback checks, but have hard coded names in the results| +|[#10455](https://github.com/NVIDIA/spark-rapids/issues/10455)|[BUG] JsonToStructs and ScanJson do not fall back/support it properly if single quotes are disabled| +|[#10219](https://github.com/NVIDIA/spark-rapids/issues/10219)|[BUG] GetJsonObject sees a double quote in a single quoted string as invalid| +|[#10431](https://github.com/NVIDIA/spark-rapids/issues/10431)|[BUG] test_casting_from_overflow_double_to_timestamp `DID NOT RAISE `| +|[#10499](https://github.com/NVIDIA/spark-rapids/issues/10499)|[BUG] Unit tests core dump as below| +|[#9325](https://github.com/NVIDIA/spark-rapids/issues/9325)|[BUG] test_csv_infer_schema_timestamp_ntz fails| +|[#10422](https://github.com/NVIDIA/spark-rapids/issues/10422)|[BUG] test_get_json_object_single_quotes failure| +|[#10411](https://github.com/NVIDIA/spark-rapids/issues/10411)|[BUG] Some fast parquet tests fail if the time zone is not UTC| +|[#10410](https://github.com/NVIDIA/spark-rapids/issues/10410)|[BUG]delta_lake_update_test.py::test_delta_update_partitions[['a', 'b']-False] failed by DATAGEN_SEED=1707683137| +|[#10404](https://github.com/NVIDIA/spark-rapids/issues/10404)|[BUG] GpuJsonTuple memory leak| +|[#10382](https://github.com/NVIDIA/spark-rapids/issues/10382)|[BUG] Complile failed on branch-24.04 : literals.scala:32: object codec is not a member of package org.apache.commons| + +### PRs +||| +|:---|:---| +|[#10681](https://github.com/NVIDIA/spark-rapids/pull/10681)|Update rapids JNI dependency to 24.04.0, private to 24.04.1| +|[#10660](https://github.com/NVIDIA/spark-rapids/pull/10660)|Ensure an executor broadcast is in a single batch| +|[#10676](https://github.com/NVIDIA/spark-rapids/pull/10676)|[DOC] Update docs for 24.04.0 release [skip ci]| +|[#10654](https://github.com/NVIDIA/spark-rapids/pull/10654)|Add a config to switch back to old impl for getJsonObject| +|[#10667](https://github.com/NVIDIA/spark-rapids/pull/10667)|Update rapids private dependency to 24.04.1| +|[#10664](https://github.com/NVIDIA/spark-rapids/pull/10664)|Remove build link from 
the premerge-CI workflow| +|[#10657](https://github.com/NVIDIA/spark-rapids/pull/10657)|Revert "Host Memory OOM handling for RowToColumnarIterator (#10617)"| +|[#10625](https://github.com/NVIDIA/spark-rapids/pull/10625)|Pin to 3.1.0 maven-gpg-plugin in deploy script [skip ci]| +|[#10637](https://github.com/NVIDIA/spark-rapids/pull/10637)|Cleanup async state when multi-threaded shuffle readers fail| +|[#10617](https://github.com/NVIDIA/spark-rapids/pull/10617)|Host Memory OOM handling for RowToColumnarIterator| +|[#10614](https://github.com/NVIDIA/spark-rapids/pull/10614)|Use random seed for `test_from_json_struct_decimal`| +|[#10581](https://github.com/NVIDIA/spark-rapids/pull/10581)|Use new jni kernel for getJsonObject| +|[#10630](https://github.com/NVIDIA/spark-rapids/pull/10630)|Fix removal of internal metadata information in 350 shim| +|[#10623](https://github.com/NVIDIA/spark-rapids/pull/10623)|Auto merge PRs to branch-24.06 from branch-24.04 [skip ci]| +|[#10616](https://github.com/NVIDIA/spark-rapids/pull/10616)|Pass metadata extractors to FileScanRDD| +|[#10620](https://github.com/NVIDIA/spark-rapids/pull/10620)|Remove unused shared lib in Jenkins files| +|[#10615](https://github.com/NVIDIA/spark-rapids/pull/10615)|Turn off state logging in HostAllocSuite| +|[#10610](https://github.com/NVIDIA/spark-rapids/pull/10610)|Do not replace TableCacheQueryStageExec| +|[#10599](https://github.com/NVIDIA/spark-rapids/pull/10599)|Call globStatus directly via PY4J in hdfs_glob to avoid calling hadoop command| +|[#10602](https://github.com/NVIDIA/spark-rapids/pull/10602)|Remove InMemoryTableScanExec support for Spark 3.5+| +|[#10608](https://github.com/NVIDIA/spark-rapids/pull/10608)|Update perfio.s3.enabled doc to fix build failure [skip ci]| +|[#10598](https://github.com/NVIDIA/spark-rapids/pull/10598)|Update CI script to build and deploy using the same CUDA classifier[skip ci]| +|[#10575](https://github.com/NVIDIA/spark-rapids/pull/10575)|Update JsonToStructs and ScanJson to have white space normalization| +|[#10597](https://github.com/NVIDIA/spark-rapids/pull/10597)|add guardword to hide cloud info| +|[#10540](https://github.com/NVIDIA/spark-rapids/pull/10540)|Handle minimum GPU architecture supported| +|[#10584](https://github.com/NVIDIA/spark-rapids/pull/10584)|Add in small optimization for instr comparison| +|[#10590](https://github.com/NVIDIA/spark-rapids/pull/10590)|Turn on transition logging in HostAllocSuite| +|[#10572](https://github.com/NVIDIA/spark-rapids/pull/10572)|Improve performance of Sort for the common single batch use case| +|[#10568](https://github.com/NVIDIA/spark-rapids/pull/10568)|Add configuration to share JNI pinned pool with cuIO| +|[#10550](https://github.com/NVIDIA/spark-rapids/pull/10550)|Enable window-group-limit optimization on| +|[#10542](https://github.com/NVIDIA/spark-rapids/pull/10542)|Make JSON parsing common between JsonToStructs and ScanJson| +|[#10562](https://github.com/NVIDIA/spark-rapids/pull/10562)|Fix test_spark_from_json_date_with_format when run in a non-UTC TZ| +|[#10564](https://github.com/NVIDIA/spark-rapids/pull/10564)|Enable specifying specific integration test methods via TESTS environment| +|[#10563](https://github.com/NVIDIA/spark-rapids/pull/10563)|Append new authorized user to blossom-ci safelist [skip ci]| +|[#10520](https://github.com/NVIDIA/spark-rapids/pull/10520)|Distinct left join| +|[#10538](https://github.com/NVIDIA/spark-rapids/pull/10538)|Move K8s cloud name into common lib for Jenkins CI| 
+|[#10552](https://github.com/NVIDIA/spark-rapids/pull/10552)|Fix issues when no value can be extracted from a regular expression| +|[#10522](https://github.com/NVIDIA/spark-rapids/pull/10522)|Fix missing scala-parser-combinators dependency on Databricks| +|[#10549](https://github.com/NVIDIA/spark-rapids/pull/10549)|Update to latest branch-24.02 [skip ci]| +|[#10544](https://github.com/NVIDIA/spark-rapids/pull/10544)|Fix merge conflict from branch-24.02| +|[#10503](https://github.com/NVIDIA/spark-rapids/pull/10503)|Distinct inner join| +|[#10512](https://github.com/NVIDIA/spark-rapids/pull/10512)|Move to parsing from_json input preserving quoted strings.| +|[#10528](https://github.com/NVIDIA/spark-rapids/pull/10528)|Fix auto merge conflict 10523| +|[#10519](https://github.com/NVIDIA/spark-rapids/pull/10519)|Replicate HostColumnVector.ColumnBuilder in plugin to enable host memory oom work| +|[#10521](https://github.com/NVIDIA/spark-rapids/pull/10521)|Fix Spark 3.5.1 build| +|[#10516](https://github.com/NVIDIA/spark-rapids/pull/10516)|One more metric for expand| +|[#10500](https://github.com/NVIDIA/spark-rapids/pull/10500)|Support "WindowGroupLimit" optimization on GPU| +|[#10508](https://github.com/NVIDIA/spark-rapids/pull/10508)|Move 351 shims into noSnapshot buildvers| +|[#10510](https://github.com/NVIDIA/spark-rapids/pull/10510)|Fix scalar leak in SumBinaryFixer| +|[#10466](https://github.com/NVIDIA/spark-rapids/pull/10466)|Use parser from spark to normalize json path in GetJsonObject| +|[#10490](https://github.com/NVIDIA/spark-rapids/pull/10490)|Start working on a more complete json test matrix json| +|[#10497](https://github.com/NVIDIA/spark-rapids/pull/10497)|Add minValue overflow check in ORC double-to-timestamp cast| +|[#10501](https://github.com/NVIDIA/spark-rapids/pull/10501)|Fix scalar leak in WindowRetrySuite| +|[#10474](https://github.com/NVIDIA/spark-rapids/pull/10474)|Remove Support for Databricks 10.4| +|[#10418](https://github.com/NVIDIA/spark-rapids/pull/10418)|Enable GpuShuffledSymmetricHashJoin by default| +|[#10450](https://github.com/NVIDIA/spark-rapids/pull/10450)|Improve internal row to columnar host memory by using a combined spillable buffer| +|[#10440](https://github.com/NVIDIA/spark-rapids/pull/10440)|Generate CSV data per Spark version for tools| +|[#10449](https://github.com/NVIDIA/spark-rapids/pull/10449)|[DOC] Fix table rendering issue in github.io download UI page [skip ci]| +|[#10438](https://github.com/NVIDIA/spark-rapids/pull/10438)|Integrate perfio.s3 reader| +|[#10423](https://github.com/NVIDIA/spark-rapids/pull/10423)|Disable Integration Test:`test_get_json_object_single_quotes` on DB 10.4| +|[#10419](https://github.com/NVIDIA/spark-rapids/pull/10419)|Export TZ in tests when default TZ is used| +|[#10426](https://github.com/NVIDIA/spark-rapids/pull/10426)|Fix auto merge conflict 10425 [skip ci]| +|[#10427](https://github.com/NVIDIA/spark-rapids/pull/10427)|Update test doc for 24.04 [skip ci]| +|[#10396](https://github.com/NVIDIA/spark-rapids/pull/10396)|Remove inactive user from github workflow [skip ci]| +|[#10421](https://github.com/NVIDIA/spark-rapids/pull/10421)|Use withRetry when manifesting spillable batch in GpuShuffledHashJoinExec| +|[#10420](https://github.com/NVIDIA/spark-rapids/pull/10420)|Disable JsonTuple by default| +|[#10407](https://github.com/NVIDIA/spark-rapids/pull/10407)|Enable Single Quote Support in getJSONObject API with GetJsonObjectOptions| +|[#10415](https://github.com/NVIDIA/spark-rapids/pull/10415)|Avoid comparing Delta logs 
when writing partitioned tables| +|[#10247](https://github.com/NVIDIA/spark-rapids/pull/10247)|Improve `GpuExpand` by pre-projecting some columns| +|[#10248](https://github.com/NVIDIA/spark-rapids/pull/10248)|Group-by aggregation based optimization for UNBOUNDED `collect_set` window function| +|[#10406](https://github.com/NVIDIA/spark-rapids/pull/10406)|Enabled subPage chunking by default| +|[#10361](https://github.com/NVIDIA/spark-rapids/pull/10361)|Add in basic support for JSON generation in BigDataGen and improve performance of from_json| +|[#10158](https://github.com/NVIDIA/spark-rapids/pull/10158)|Add in framework for unbounded to unbounded window agg optimization| +|[#10394](https://github.com/NVIDIA/spark-rapids/pull/10394)|Fix auto merge conflict 10393 [skip ci]| +|[#10375](https://github.com/NVIDIA/spark-rapids/pull/10375)|Support barrier mode for mapInPandas/mapInArrow| +|[#10356](https://github.com/NVIDIA/spark-rapids/pull/10356)|Update locate_parquet_testing_files function to support hdfs input path for dataproc CI| +|[#10369](https://github.com/NVIDIA/spark-rapids/pull/10369)|Revert "Support barrier mode for mapInPandas/mapInArrow (#10364)"| +|[#10358](https://github.com/NVIDIA/spark-rapids/pull/10358)|Disable Spark UI by default for integration tests| +|[#10360](https://github.com/NVIDIA/spark-rapids/pull/10360)|Fix a memory leak in json tuple| +|[#10364](https://github.com/NVIDIA/spark-rapids/pull/10364)|Support barrier mode for mapInPandas/mapInArrow| +|[#10348](https://github.com/NVIDIA/spark-rapids/pull/10348)|Remove redundant joinOutputRows metric| +|[#10321](https://github.com/NVIDIA/spark-rapids/pull/10321)|Bump up dependency version to 24.04.0-SNAPSHOT| +|[#10330](https://github.com/NVIDIA/spark-rapids/pull/10330)|Add tryAcquire to GpuSemaphore| +|[#10331](https://github.com/NVIDIA/spark-rapids/pull/10331)|Revert "Update to libcudf unsigned sum aggregation types change (#10267)"| +|[#10258](https://github.com/NVIDIA/spark-rapids/pull/10258)|Init project version 24.04.0-SNAPSHOT| ## Release 24.02 @@ -124,6 +292,7 @@ Generated on 2024-03-06 ### PRs ||| |:---|:---| +|[#10555](https://github.com/NVIDIA/spark-rapids/pull/10555)|Update change log [skip ci]| |[#10551](https://github.com/NVIDIA/spark-rapids/pull/10551)|Try to make degenerative joins here impossible for these tests| |[#10546](https://github.com/NVIDIA/spark-rapids/pull/10546)|Update changelog [skip ci]| |[#10541](https://github.com/NVIDIA/spark-rapids/pull/10541)|Fix Delta log cache size settings during integration tests| @@ -143,6 +312,7 @@ Generated on 2024-03-06 |[#10387](https://github.com/NVIDIA/spark-rapids/pull/10387)|[DOC] Update docs for 24.02.0 release [skip ci]| |[#10399](https://github.com/NVIDIA/spark-rapids/pull/10399)|Update NOTICE-binary| |[#10389](https://github.com/NVIDIA/spark-rapids/pull/10389)|Change version and branch to 24.02 in docs [skip ci]| +|[#10384](https://github.com/NVIDIA/spark-rapids/pull/10384)|[DOC] Update docs for 23.12.2 release [skip ci] | |[#10309](https://github.com/NVIDIA/spark-rapids/pull/10309)|[DOC] add custom 404 page and fix some document issue [skip ci]| |[#10352](https://github.com/NVIDIA/spark-rapids/pull/10352)|xfail mixed type test| |[#10355](https://github.com/NVIDIA/spark-rapids/pull/10355)|Revert "Support barrier mode for mapInPandas/mapInArrow (#10343)"| @@ -241,6 +411,7 @@ Generated on 2024-03-06 |[#9996](https://github.com/NVIDIA/spark-rapids/pull/9996)|Test full timestamp output range in PySpark| 
|[#10081](https://github.com/NVIDIA/spark-rapids/pull/10081)|Add a fallback Cloudera Maven repo URL [skip ci]| |[#10065](https://github.com/NVIDIA/spark-rapids/pull/10065)|Improve host memory spill interfaces| +|[#10069](https://github.com/NVIDIA/spark-rapids/pull/10069)|Revert "Support split broadcast join condition into ast and non-ast […| |[#10070](https://github.com/NVIDIA/spark-rapids/pull/10070)|Fix 332db build failure| |[#10060](https://github.com/NVIDIA/spark-rapids/pull/10060)|Fix failed cases for non-utc time zone| |[#10038](https://github.com/NVIDIA/spark-rapids/pull/10038)|Remove spark.rapids.sql.nonUTC.enabled configuration option| @@ -250,6 +421,7 @@ Generated on 2024-03-06 |[#10053](https://github.com/NVIDIA/spark-rapids/pull/10053)|Remove invalid user from CODEOWNER file [skip ci]| |[#10049](https://github.com/NVIDIA/spark-rapids/pull/10049)|Fix out of range error from pySpark in test_timestamp_millis and other two integration test cases| |[#9721](https://github.com/NVIDIA/spark-rapids/pull/9721)|Support date_format via Gpu for non-UTC time zone| +|[#9470](https://github.com/NVIDIA/spark-rapids/pull/9470)|Use float to string kernel| |[#9845](https://github.com/NVIDIA/spark-rapids/pull/9845)|Use parse_url kernel for HOST parsing| |[#10024](https://github.com/NVIDIA/spark-rapids/pull/10024)|Support hour minute second for non-UTC time zone| |[#9973](https://github.com/NVIDIA/spark-rapids/pull/9973)|Batching support for row-based bounded window functions | @@ -259,6 +431,7 @@ Generated on 2024-03-06 |[#10023](https://github.com/NVIDIA/spark-rapids/pull/10023)|GPU supports `yyyyMMdd` format by post process for the `from_unixtime` function| |[#10033](https://github.com/NVIDIA/spark-rapids/pull/10033)|Remove GpuToTimestampImproved and spark.rapids.sql.improvedTimeOps.enabled| |[#10016](https://github.com/NVIDIA/spark-rapids/pull/10016)|Fix infinite loop in test_str_to_map_expr_random_delimiters| +|[#9481](https://github.com/NVIDIA/spark-rapids/pull/9481)|Use parse_url kernel for PROTOCOL parsing| |[#10030](https://github.com/NVIDIA/spark-rapids/pull/10030)|Update links in shims.md| |[#10015](https://github.com/NVIDIA/spark-rapids/pull/10015)|Fix array_transform to not recompute the argument| |[#10011](https://github.com/NVIDIA/spark-rapids/pull/10011)|Add cpu oom retry split handling to InternalRowToColumnarBatchIterator| @@ -286,316 +459,14 @@ Generated on 2024-03-06 |[#9852](https://github.com/NVIDIA/spark-rapids/pull/9852)|Avoid generating duplicate nan keys with MapGen(FloatGen)| |[#9674](https://github.com/NVIDIA/spark-rapids/pull/9674)|Add cache action to speed up mvn workflow [skip ci]| |[#9900](https://github.com/NVIDIA/spark-rapids/pull/9900)|Revert "Remove Databricks 13.3 from release 23.12 (#9890)"| +|[#9889](https://github.com/NVIDIA/spark-rapids/pull/9889)|Fix test_cast_string_ts_valid_format test| |[#9888](https://github.com/NVIDIA/spark-rapids/pull/9888)|Update nightly build and deploy script for arm artifacts [skip ci]| +|[#9833](https://github.com/NVIDIA/spark-rapids/pull/9833)|Fix a hang for Pandas UDFs on DB 13.3| |[#9656](https://github.com/NVIDIA/spark-rapids/pull/9656)|Update for new retry state machine JNI APIs| |[#9654](https://github.com/NVIDIA/spark-rapids/pull/9654)|Detect multiple jars on the classpath when init plugin| |[#9857](https://github.com/NVIDIA/spark-rapids/pull/9857)|Skip redundant steps in nightly build [skip ci]| |[#9812](https://github.com/NVIDIA/spark-rapids/pull/9812)|Update JNI and private dep version to 24.02.0-SNAPSHOT| - -## Release 
23.12 - -### Features -||| -|:---|:---| -|[#6832](https://github.com/NVIDIA/spark-rapids/issues/6832)|[FEA] Convert Timestamp/Timezone tests/checks to be per operator instead of generic | -|[#9805](https://github.com/NVIDIA/spark-rapids/issues/9805)|[FEA] Support ```current_date``` expression function with CST (UTC + 8) timezone support| -|[#9515](https://github.com/NVIDIA/spark-rapids/issues/9515)|[FEA] Support temporal types in to_json| -|[#9872](https://github.com/NVIDIA/spark-rapids/issues/9872)|[FEA][JSON] Support Decimal type in `to_json`| -|[#9802](https://github.com/NVIDIA/spark-rapids/issues/9802)|[FEA] Support FromUTCTimestamp on the GPU with a non-UTC time zone| -|[#6831](https://github.com/NVIDIA/spark-rapids/issues/6831)|[FEA] Support timestamp transitions to and from UTC for single time zones with no repeating rules| -|[#9590](https://github.com/NVIDIA/spark-rapids/issues/9590)|[FEA][JSON] Support temporal types in `from_json`| -|[#9804](https://github.com/NVIDIA/spark-rapids/issues/9804)|[FEA] Support CPU path for from_utc_timestamp function with timezone| -|[#9461](https://github.com/NVIDIA/spark-rapids/issues/9461)|[FEA] Validate nvcomp-3.0 with spark rapids plugin| -|[#8832](https://github.com/NVIDIA/spark-rapids/issues/8832)|[FEA] rewrite join conditions where only part of it can fit on the AST| -|[#9059](https://github.com/NVIDIA/spark-rapids/issues/9059)|[FEA] Support spark.sql.parquet.datetimeRebaseModeInRead=LEGACY| -|[#9037](https://github.com/NVIDIA/spark-rapids/issues/9037)|[FEA] Support spark.sql.parquet.int96RebaseModeInWrite= LEGACY| -|[#9632](https://github.com/NVIDIA/spark-rapids/issues/9632)|[FEA] Take into account `org.apache.spark.timeZone` in Parquet/Avro from Spark 3.2| -|[#8770](https://github.com/NVIDIA/spark-rapids/issues/8770)|[FEA] add more metrics to Eventlogs or Executor logs| -|[#9597](https://github.com/NVIDIA/spark-rapids/issues/9597)|[FEA][JSON] Support boolean type in `from_json`| -|[#9516](https://github.com/NVIDIA/spark-rapids/issues/9516)|[FEA] Add support for JSON data source option `ignoreNullFields=false` in `to_json`| -|[#9520](https://github.com/NVIDIA/spark-rapids/issues/9520)|[FEA] Add support for `LAST()` as running window function| -|[#9518](https://github.com/NVIDIA/spark-rapids/issues/9518)|[FEA] Add support for relevant JSON data source options in `to_json`| -|[#9218](https://github.com/NVIDIA/spark-rapids/issues/9218)|[FEA] Support stack function| -|[#9532](https://github.com/NVIDIA/spark-rapids/issues/9532)|[FEA] Support Delta Lake 2.3.0| -|[#1525](https://github.com/NVIDIA/spark-rapids/issues/1525)|[FEA] Support Scala 2.13| -|[#7279](https://github.com/NVIDIA/spark-rapids/issues/7279)|[FEA] Support OverwriteByExpressionExecV1 for Delta Lake| -|[#9326](https://github.com/NVIDIA/spark-rapids/issues/9326)|[FEA] Specify `recover_with_null` when reading JSON files| -|[#8780](https://github.com/NVIDIA/spark-rapids/issues/8780)|[FEA] Support to_json function| -|[#7278](https://github.com/NVIDIA/spark-rapids/issues/7278)|[FEA] Support AppendDataExecV1 for Delta Lake| -|[#6266](https://github.com/NVIDIA/spark-rapids/issues/6266)|[FEA] Support Percentile| -|[#7277](https://github.com/NVIDIA/spark-rapids/issues/7277)|[FEA] Support AtomicReplaceTableAsSelect for Delta Lake| -|[#7276](https://github.com/NVIDIA/spark-rapids/issues/7276)|[FEA] Support AtomicCreateTableAsSelect for Delta Lake| - -### Performance -||| -|:---|:---| -|[#8137](https://github.com/NVIDIA/spark-rapids/issues/8137)|[FEA] Upgrade to UCX 1.15| 
-|[#8157](https://github.com/NVIDIA/spark-rapids/issues/8157)|[FEA] Add string comparison to AST expressions| -|[#9398](https://github.com/NVIDIA/spark-rapids/issues/9398)|[FEA] Compress/encrypt spill to disk| - -### Bugs Fixed -||| -|:---|:---| -|[#9687](https://github.com/NVIDIA/spark-rapids/issues/9687)|[BUG] `test_in_set` fails when DATAGEN_SEED=1698940723| -|[#9659](https://github.com/NVIDIA/spark-rapids/issues/9659)|[BUG] executor crash intermittantly in scala2.13-built spark332 integration tests| -|[#9923](https://github.com/NVIDIA/spark-rapids/issues/9923)|[BUG] Failed case about ```test_timestamp_seconds_rounding_necessary[Decimal(20,7)][DATAGEN_SEED=1701412018] – src.main.python.date_time_test```| -|[#9982](https://github.com/NVIDIA/spark-rapids/issues/9982)|[BUG] test "convert large InternalRow iterator to cached batch single col" failed with arena pool| -|[#9683](https://github.com/NVIDIA/spark-rapids/issues/9683)|[BUG] test_map_scalars_supported_key_types fails with DATAGEN_SEED=1698940723| -|[#9976](https://github.com/NVIDIA/spark-rapids/issues/9976)|[BUG] test_part_write_round_trip[Float] Failed on -0.0 partition| -|[#9948](https://github.com/NVIDIA/spark-rapids/issues/9948)|[BUG] parquet reader data corruption in nested schema after https://github.com/rapidsai/cudf/pull/13302| -|[#9867](https://github.com/NVIDIA/spark-rapids/issues/9867)|[BUG] Unable to use Spark Rapids with Spark Thrift Server| -|[#9934](https://github.com/NVIDIA/spark-rapids/issues/9934)|[BUG] test_delta_multi_part_write_round_trip_unmanaged and test_delta_part_write_round_trip_unmanaged failed DATA_SEED=1701608331 | -|[#9933](https://github.com/NVIDIA/spark-rapids/issues/9933)|[BUG] collection_ops_test.py::test_sequence_too_long_sequence[Long(not_null)][DATAGEN_SEED=1701553915, INJECT_OOM]| -|[#9837](https://github.com/NVIDIA/spark-rapids/issues/9837)|[BUG] test_part_write_round_trip failed| -|[#9932](https://github.com/NVIDIA/spark-rapids/issues/9932)|[BUG] Failed test_multi_tier_ast[DATAGEN_SEED=1701445668] on CI| -|[#9829](https://github.com/NVIDIA/spark-rapids/issues/9829)|[BUG] Java OOM when testing non-UTC time zone with lots of cases fallback.| -|[#9403](https://github.com/NVIDIA/spark-rapids/issues/9403)|[BUG] test_cogroup_apply_udf[Short(not_null)] failed with pandas 2.1.X| -|[#9684](https://github.com/NVIDIA/spark-rapids/issues/9684)|[BUG] test_coalesce fails with DATAGEN_SEED=1698940723| -|[#9685](https://github.com/NVIDIA/spark-rapids/issues/9685)|[BUG] test_case_when fails with DATAGEN_SEED=1698940723| -|[#9776](https://github.com/NVIDIA/spark-rapids/issues/9776)|[BUG] fastparquet compatibility tests fail with data mismatch if TZ is not set and system timezone is not UTC| -|[#9733](https://github.com/NVIDIA/spark-rapids/issues/9733)|[BUG] Complex AST expressions can crash with non-matching operand type error| -|[#9877](https://github.com/NVIDIA/spark-rapids/issues/9877)|[BUG] Fix resource leak in to_json| -|[#9722](https://github.com/NVIDIA/spark-rapids/issues/9722)|[BUG] test_floor_scale_zero fails with DATAGEN_SEED=1700009407| -|[#9846](https://github.com/NVIDIA/spark-rapids/issues/9846)|[BUG] test_ceil_scale_zero may fail with different datagen_seed| -|[#9781](https://github.com/NVIDIA/spark-rapids/issues/9781)|[BUG] test_cast_string_date_valid_format fails on DATAGEN_SEED=1700250017| -|[#9714](https://github.com/NVIDIA/spark-rapids/issues/9714)|Scala Map class not found when executing the benchmark on Spark 3.5.0 with Scala 2.13| 
-|[#9856](https://github.com/NVIDIA/spark-rapids/issues/9856)|collection_ops_test.py failed on Dataproc-2.1 with: Column 'None' does not exist| -|[#9397](https://github.com/NVIDIA/spark-rapids/issues/9397)|[BUG] RapidsShuffleManager MULTITHREADED on Databricks, we see loss of executors due to Rpc issues| -|[#9738](https://github.com/NVIDIA/spark-rapids/issues/9738)|[BUG] `test_delta_part_write_round_trip_unmanaged` and `test_delta_multi_part_write_round_trip_unmanaged` fail with `DATAGEN_SEED=1700105176`| -|[#9771](https://github.com/NVIDIA/spark-rapids/issues/9771)|[BUG] ast_test.py::test_X[(String, True)][DATAGEN_SEED=1700205785] failed| -|[#9782](https://github.com/NVIDIA/spark-rapids/issues/9782)|[BUG] Error messages appear in a clean build| -|[#9798](https://github.com/NVIDIA/spark-rapids/issues/9798)|[BUG] GpuCheckOverflowInTableInsert should be added to databricks shim| -|[#9820](https://github.com/NVIDIA/spark-rapids/issues/9820)|[BUG] test_parquet_write_roundtrip_datetime_with_legacy_rebase fails with "year 0 is out of range"| -|[#9817](https://github.com/NVIDIA/spark-rapids/issues/9817)|[BUG] FAILED dpp_test.py::test_dpp_reuse_broadcast_exchange[false-0-parquet][DATAGEN_SEED=1700572856, IGNORE_ORDER]| -|[#9768](https://github.com/NVIDIA/spark-rapids/issues/9768)|[BUG] `cast decimal to string` ScalaTest relies on a side effects | -|[#9711](https://github.com/NVIDIA/spark-rapids/issues/9711)|[BUG] test_lte fails with DATAGEN_SEED=1699987762| -|[#9751](https://github.com/NVIDIA/spark-rapids/issues/9751)|[BUG] cmp_test test_gte failed with DATAGEN_SEED=1700149611| -|[#9469](https://github.com/NVIDIA/spark-rapids/issues/9469)|[BUG] [main] ERROR com.nvidia.spark.rapids.GpuOverrideUtil - Encountered an exception applying GPU overrides java.lang.IllegalStateException: the broadcast must be on the GPU too| -|[#9648](https://github.com/NVIDIA/spark-rapids/issues/9648)|[BUG] Existence default values in schema are not being honored| -|[#9676](https://github.com/NVIDIA/spark-rapids/issues/9676)|Fix Delta Lake Integration tests; `test_delta_atomic_create_table_as_select` and `test_delta_atomic_replace_table_as_select`| -|[#9701](https://github.com/NVIDIA/spark-rapids/issues/9701)|[BUG] test_ts_formats_round_trip and test_datetime_roundtrip_with_legacy_rebase fail with DATAGEN_SEED=1699915317| -|[#9691](https://github.com/NVIDIA/spark-rapids/issues/9691)|[BUG] Repeated Maven invocations w/o changes recompile too many Scala sources despite recompileMode=incremental | -|[#9547](https://github.com/NVIDIA/spark-rapids/issues/9547)|Update buildall and doc to generate bloop projects for test debugging| -|[#9697](https://github.com/NVIDIA/spark-rapids/issues/9697)|[BUG] Iceberg multiple file readers can not read files if the file paths contain encoded URL unsafe chars| -|[#9681](https://github.com/NVIDIA/spark-rapids/issues/9681)|Databricks Build Failing For 330db+| -|[#9521](https://github.com/NVIDIA/spark-rapids/issues/9521)|[BUG] Multi Threaded Shuffle Writer needs flow control| -|[#9675](https://github.com/NVIDIA/spark-rapids/issues/9675)|Failing Delta Lake Tests for Databricks 13.3 Due to WriteIntoDeltaCommand| -|[#9669](https://github.com/NVIDIA/spark-rapids/issues/9669)|[BUG] Rebase exception states not in UTC but timezone is Etc/UTC| -|[#7940](https://github.com/NVIDIA/spark-rapids/issues/7940)|[BUG] UCX peer connection issue in multi-nic single node cluster| -|[#9650](https://github.com/NVIDIA/spark-rapids/issues/9650)|[BUG] Github workflow for missing scala2.13 updates fails to detect when 
pom is new| -|[#9621](https://github.com/NVIDIA/spark-rapids/issues/9621)|[BUG] Scala 2.13 with-classifier profile is picking up Scala2.12 spark.version| -|[#9636](https://github.com/NVIDIA/spark-rapids/issues/9636)|[BUG] All parquet integration tests failed "Part of the plan is not columnar class" in databricks runtimes| -|[#9108](https://github.com/NVIDIA/spark-rapids/issues/9108)|[BUG] nullability on some decimal operations is wrong| -|[#9625](https://github.com/NVIDIA/spark-rapids/issues/9625)|[BUG] Typo in github Maven check install-modules | -|[#9603](https://github.com/NVIDIA/spark-rapids/issues/9603)|[BUG] fastparquet_compatibility_test fails on dataproc| -|[#8729](https://github.com/NVIDIA/spark-rapids/issues/8729)|[BUG] nightly integration test failed OOM kill in JDK11 ENV| -|[#9589](https://github.com/NVIDIA/spark-rapids/issues/9589)|[BUG] Scala 2.13 build hard-codes Java 8 target | -|[#9581](https://github.com/NVIDIA/spark-rapids/issues/9581)|Delta Lake 2.4 missing equals/hashCode override for file format and some metrics for merge| -|[#9507](https://github.com/NVIDIA/spark-rapids/issues/9507)|[BUG] Spark 3.2+/ParquetFilterSuite/Parquet filter pushdown - timestamp/ FAILED | -|[#9540](https://github.com/NVIDIA/spark-rapids/issues/9540)|[BUG] Job failed with SparkUpgradeException no matter which value are set for spark.sql.parquet.datetimeRebaseModeInRead| -|[#9545](https://github.com/NVIDIA/spark-rapids/issues/9545)|[BUG] Dataproc 2.0 test_reading_file_rewritten_with_fastparquet tests failing| -|[#9552](https://github.com/NVIDIA/spark-rapids/issues/9552)|[BUG] Inconsistent CDH dependency overrides across submodules| -|[#9571](https://github.com/NVIDIA/spark-rapids/issues/9571)|[BUG] non-deterministic compiled SQLExecPlugin.class with scala 2.13 deployment| -|[#9569](https://github.com/NVIDIA/spark-rapids/issues/9569)|[BUG] test_window_running failed in 3.1.2+3.1.3| -|[#9480](https://github.com/NVIDIA/spark-rapids/issues/9480)|[BUG] mapInPandas doesn't invoke udf on empty partitions| -|[#8644](https://github.com/NVIDIA/spark-rapids/issues/8644)|[BUG] Parquet file with malformed dictionary does not error when loaded| -|[#9310](https://github.com/NVIDIA/spark-rapids/issues/9310)|[BUG] Improve support for reading JSON files with malformed rows| -|[#9457](https://github.com/NVIDIA/spark-rapids/issues/9457)|[BUG] CDH 332 unit tests failing| -|[#9404](https://github.com/NVIDIA/spark-rapids/issues/9404)|[BUG] Spark reports a decimal error when create lit scalar when generate Decimal(34, -5) data.| -|[#9110](https://github.com/NVIDIA/spark-rapids/issues/9110)|[BUG] GPU Reader fails due to partition column creating column larger then cudf column size limit| -|[#8631](https://github.com/NVIDIA/spark-rapids/issues/8631)|[BUG] Parquet load failure on repeated_no_annotation.parquet| -|[#9364](https://github.com/NVIDIA/spark-rapids/issues/9364)|[BUG] CUDA illegal access error is triggering split and retry logic| - -### PRs -||| -|:---|:---| -|[#10384](https://github.com/NVIDIA/spark-rapids/pull/10384)|[DOC] Update docs for 23.12.2 release [skip ci] | -|[#10341](https://github.com/NVIDIA/spark-rapids/pull/10341)|Update changelog for v23.12.2 [skip ci]| -|[#10340](https://github.com/NVIDIA/spark-rapids/pull/10340)|Copyright to 2024 [skip ci]| -|[#10323](https://github.com/NVIDIA/spark-rapids/pull/10323)|Upgrade version to 23.12.2-SNAPSHOT| -|[#10329](https://github.com/NVIDIA/spark-rapids/pull/10329)|update download page for v23.12.2 release [skip ci]| 
-|[#10274](https://github.com/NVIDIA/spark-rapids/pull/10274)|PythonRunner Changes| -|[#10124](https://github.com/NVIDIA/spark-rapids/pull/10124)|Update changelog for v23.12.1 [skip ci]| -|[#10123](https://github.com/NVIDIA/spark-rapids/pull/10123)|Change version to v23.12.1 [skip ci]| -|[#10122](https://github.com/NVIDIA/spark-rapids/pull/10122)|Init changelog for v23.12.1 [skip ci]| -|[#10121](https://github.com/NVIDIA/spark-rapids/pull/10121)|[DOC] update download page for db hot fix [skip ci]| -|[#10116](https://github.com/NVIDIA/spark-rapids/pull/10116)|Upgrade to 23.12.1-SNAPSHOT| -|[#10069](https://github.com/NVIDIA/spark-rapids/pull/10069)|Revert "Support split broadcast join condition into ast and non-ast […| -|[#9470](https://github.com/NVIDIA/spark-rapids/pull/9470)|Use float to string kernel| -|[#9481](https://github.com/NVIDIA/spark-rapids/pull/9481)|Use parse_url kernel for PROTOCOL parsing| -|[#9935](https://github.com/NVIDIA/spark-rapids/pull/9935)|Init 23.12 changelog [skip ci]| -|[#9943](https://github.com/NVIDIA/spark-rapids/pull/9943)|[DOC] Update docs for 23.12.0 release [skip ci]| -|[#10014](https://github.com/NVIDIA/spark-rapids/pull/10014)|Add documentation for how to run tests with a fixed datagen seed [skip ci]| -|[#9954](https://github.com/NVIDIA/spark-rapids/pull/9954)|Update private and JNI version to released 23.12.0| -|[#10009](https://github.com/NVIDIA/spark-rapids/pull/10009)|Using fix seed to unblock 23.12 release; Move the blocked issues to 24.02| -|[#10007](https://github.com/NVIDIA/spark-rapids/pull/10007)|Fix Java OOM in non-UTC case with lots of xfail (#9944)| -|[#9985](https://github.com/NVIDIA/spark-rapids/pull/9985)|Avoid allocating GPU memory out of RMM managed pool in test| -|[#9970](https://github.com/NVIDIA/spark-rapids/pull/9970)|Avoid leading and trailing zeros in test_timestamp_seconds_rounding_necessary| -|[#9978](https://github.com/NVIDIA/spark-rapids/pull/9978)|Avoid using floating point values as partition values in tests| -|[#9979](https://github.com/NVIDIA/spark-rapids/pull/9979)|Add compatibility notes for writing ORC with lost Gregorian days [skip ci]| -|[#9949](https://github.com/NVIDIA/spark-rapids/pull/9949)|Override the seed for `test_map_scalars_supported_key_types ` for version of Spark before 3.4.0 [Databricks]| -|[#9961](https://github.com/NVIDIA/spark-rapids/pull/9961)|Avoid using floating point for partition values in Delta Lake tests| -|[#9960](https://github.com/NVIDIA/spark-rapids/pull/9960)|Fix LongGen accidentally using special cases when none are desired| -|[#9950](https://github.com/NVIDIA/spark-rapids/pull/9950)|Avoid generating NaNs as partition values in test_part_write_round_trip| -|[#9940](https://github.com/NVIDIA/spark-rapids/pull/9940)|Fix 'year 0 is out of range' by setting a fix seed| -|[#9946](https://github.com/NVIDIA/spark-rapids/pull/9946)|Fix test_multi_tier_ast to ignore ordering of output rows| -|[#9928](https://github.com/NVIDIA/spark-rapids/pull/9928)|Test `inset` with `NaN` only for Spark from 3.1.3| -|[#9906](https://github.com/NVIDIA/spark-rapids/pull/9906)|Fix test_initcap to use the intended limited character set| -|[#9831](https://github.com/NVIDIA/spark-rapids/pull/9831)|Skip fastparquet timestamp tests when plugin cannot read/write timestamps| -|[#9893](https://github.com/NVIDIA/spark-rapids/pull/9893)|Add multiple expression tier regression test for AST| -|[#9889](https://github.com/NVIDIA/spark-rapids/pull/9889)|Fix test_cast_string_ts_valid_format test| 
-|[#9833](https://github.com/NVIDIA/spark-rapids/pull/9833)|Fix a hang for Pandas UDFs on DB 13.3| -|[#9873](https://github.com/NVIDIA/spark-rapids/pull/9873)|Add support for decimal in `to_json`| -|[#9890](https://github.com/NVIDIA/spark-rapids/pull/9890)|Remove Databricks 13.3 from release 23.12| -|[#9874](https://github.com/NVIDIA/spark-rapids/pull/9874)|Fix zero-scale floor and ceil tests| -|[#9879](https://github.com/NVIDIA/spark-rapids/pull/9879)|Fix resource leak in to_json| -|[#9600](https://github.com/NVIDIA/spark-rapids/pull/9600)|Add date and timestamp support to to_json| -|[#9871](https://github.com/NVIDIA/spark-rapids/pull/9871)|Fix test_cast_string_date_valid_format generating year 0| -|[#9885](https://github.com/NVIDIA/spark-rapids/pull/9885)|Preparation for non-UTC nightly CI [skip ci]| -|[#9810](https://github.com/NVIDIA/spark-rapids/pull/9810)|Support from_utc_timestamp on the GPU for non-UTC timezones (non-DST)| -|[#9865](https://github.com/NVIDIA/spark-rapids/pull/9865)|Fix problems with nulls in sequence tests| -|[#9864](https://github.com/NVIDIA/spark-rapids/pull/9864)|Add compatibility documentation with respect to decimal overflow detection [skip ci]| -|[#9860](https://github.com/NVIDIA/spark-rapids/pull/9860)|Fixing FAQ deadlink in plugin code [skip ci]| -|[#9840](https://github.com/NVIDIA/spark-rapids/pull/9840)|Avoid using NaNs as Delta Lake partition values| -|[#9773](https://github.com/NVIDIA/spark-rapids/pull/9773)|xfail all the impacted cases when using non-UTC time zone| -|[#9849](https://github.com/NVIDIA/spark-rapids/pull/9849)|Instantly Delete pre-merge content of stage workspace if success| -|[#9848](https://github.com/NVIDIA/spark-rapids/pull/9848)|Force datagen_seed for test_ceil_scale_zero and test_decimal_round| -|[#9677](https://github.com/NVIDIA/spark-rapids/pull/9677)|Enable build for Databricks 13.3| -|[#9809](https://github.com/NVIDIA/spark-rapids/pull/9809)|Re-enable AST string integration cases| -|[#9835](https://github.com/NVIDIA/spark-rapids/pull/9835)|Avoid pre-Gregorian dates in schema_evolution_test| -|[#9786](https://github.com/NVIDIA/spark-rapids/pull/9786)|Check paths for existence to prevent ignorable error messages during build| -|[#9824](https://github.com/NVIDIA/spark-rapids/pull/9824)|UCX 1.15 upgrade| -|[#9800](https://github.com/NVIDIA/spark-rapids/pull/9800)|Add GpuCheckOverflowInTableInsert to Databricks 11.3+| -|[#9821](https://github.com/NVIDIA/spark-rapids/pull/9821)|Update timestamp gens to avoid "year 0 is out of range" errors| -|[#9826](https://github.com/NVIDIA/spark-rapids/pull/9826)|Set seed to 0 for test_hash_reduction_sum| -|[#9720](https://github.com/NVIDIA/spark-rapids/pull/9720)|Support timestamp in `from_json`| -|[#9818](https://github.com/NVIDIA/spark-rapids/pull/9818)|Specify nullable=False when generating filter values in dpp tests| -|[#9689](https://github.com/NVIDIA/spark-rapids/pull/9689)|Support CPU path for from_utc_timestamp function with timezone | -|[#9769](https://github.com/NVIDIA/spark-rapids/pull/9769)|Use withGpuSparkSession to customize SparkConf| -|[#9780](https://github.com/NVIDIA/spark-rapids/pull/9780)|Fix NaN handling in GpuLessThanOrEqual and GpuGreaterThanOrEqual| -|[#9795](https://github.com/NVIDIA/spark-rapids/pull/9795)|xfail AST string tests| -|[#9666](https://github.com/NVIDIA/spark-rapids/pull/9666)|Add support for parsing strings as dates in `from_json`| -|[#9673](https://github.com/NVIDIA/spark-rapids/pull/9673)|Fix the broadcast joins issues caused by InputFileBlockRule| 
-|[#9785](https://github.com/NVIDIA/spark-rapids/pull/9785)|Force datagen_seed for 9781 and 9784 [skip ci]| -|[#9765](https://github.com/NVIDIA/spark-rapids/pull/9765)|Let GPU scans fall back when default values exist in schema| -|[#9729](https://github.com/NVIDIA/spark-rapids/pull/9729)|Fix Delta Lake atomic table operations on spark341db| -|[#9770](https://github.com/NVIDIA/spark-rapids/pull/9770)|[BUG] Fix the doc for Maven and Scala 2.13 test example [skip ci]| -|[#9761](https://github.com/NVIDIA/spark-rapids/pull/9761)|Fix bug in tagging of JsonToStructs| -|[#9758](https://github.com/NVIDIA/spark-rapids/pull/9758)|Remove forced seed from Delta Lake part_write_round_trip_unmanaged tests| -|[#9652](https://github.com/NVIDIA/spark-rapids/pull/9652)|Add time zone config to set non-UTC| -|[#9736](https://github.com/NVIDIA/spark-rapids/pull/9736)|Fix `TimestampGen` to generate value not too close to the minimum allowed timestamp| -|[#9698](https://github.com/NVIDIA/spark-rapids/pull/9698)|Speed up build: unnecessary invalidation in the incremental recompile mode| -|[#9748](https://github.com/NVIDIA/spark-rapids/pull/9748)|Fix Delta Lake part_write_round_trip_unmanaged tests with floating point| -|[#9702](https://github.com/NVIDIA/spark-rapids/pull/9702)|Support split BroadcastNestedLoopJoin condition for AST and non-AST| -|[#9746](https://github.com/NVIDIA/spark-rapids/pull/9746)|Force test_hypot to be single seed for now| -|[#9745](https://github.com/NVIDIA/spark-rapids/pull/9745)|Avoid generating null filter values in test_delta_dfp_reuse_broadcast_exchange| -|[#9741](https://github.com/NVIDIA/spark-rapids/pull/9741)|Set seed=0 for the delta lake part roundtrip tests| -|[#9660](https://github.com/NVIDIA/spark-rapids/pull/9660)|Fully support date/time legacy rebase for nested input| -|[#9672](https://github.com/NVIDIA/spark-rapids/pull/9672)|Support String type for AST| |[#9716](https://github.com/NVIDIA/spark-rapids/pull/9716)|Initiate project version 24.02.0-SNAPSHOT| -|[#9732](https://github.com/NVIDIA/spark-rapids/pull/9732)|Temporarily force `datagen_seed=0` for `test_re_replace_all` to unblock CI| -|[#9726](https://github.com/NVIDIA/spark-rapids/pull/9726)|Fix leak in BatchWithPartitionData| -|[#9717](https://github.com/NVIDIA/spark-rapids/pull/9717)|Encode the file path from Iceberg when converting to a PartitionedFile| -|[#9441](https://github.com/NVIDIA/spark-rapids/pull/9441)|Add a random seed specific to datagen cases| -|[#9649](https://github.com/NVIDIA/spark-rapids/pull/9649)|Support `spark.sql.parquet.datetimeRebaseModeInRead=LEGACY` and `spark.sql.parquet.int96RebaseModeInRead=LEGACY`| -|[#9612](https://github.com/NVIDIA/spark-rapids/pull/9612)|Escape quotes and newlines when converting strings to json format in to_json| -|[#9644](https://github.com/NVIDIA/spark-rapids/pull/9644)|Add Partial Delta Lake Support for Databricks 13.3| -|[#9690](https://github.com/NVIDIA/spark-rapids/pull/9690)|Changed `extractExecutedPlan` to consider ResultQueryStageExec for Databricks 13.3| -|[#9686](https://github.com/NVIDIA/spark-rapids/pull/9686)|Removed Maven Profiles From `tests/pom.xml`| -|[#9509](https://github.com/NVIDIA/spark-rapids/pull/9509)|Fine-grained spill metrics| -|[#9658](https://github.com/NVIDIA/spark-rapids/pull/9658)|Support `spark.sql.parquet.int96RebaseModeInWrite=LEGACY`| -|[#9695](https://github.com/NVIDIA/spark-rapids/pull/9695)|Revert "Support split non-AST-able join condition for BroadcastNested…| -|[#9693](https://github.com/NVIDIA/spark-rapids/pull/9693)|Enable 
automerge from 23.12 to 24.02 [skip ci]|
-|[#9679](https://github.com/NVIDIA/spark-rapids/pull/9679)|[Doc] update the dead link in download page [skip ci]|
-|[#9678](https://github.com/NVIDIA/spark-rapids/pull/9678)|Add flow control for multithreaded shuffle writer|
-|[#9635](https://github.com/NVIDIA/spark-rapids/pull/9635)|Support split non-AST-able join condition for BroadcastNestedLoopJoin|
-|[#9646](https://github.com/NVIDIA/spark-rapids/pull/9646)|Fix Integration Test Failures for Databricks 13.3 Support|
-|[#9670](https://github.com/NVIDIA/spark-rapids/pull/9670)|Normalize file timezone and handle missing file timezone in datetimeRebaseUtils|
-|[#9657](https://github.com/NVIDIA/spark-rapids/pull/9657)|Update verify check to handle new pom files [skip ci]|
-|[#9663](https://github.com/NVIDIA/spark-rapids/pull/9663)|Making User Guide info in bold and adding it as top right link in github.io [skip ci]|
-|[#9609](https://github.com/NVIDIA/spark-rapids/pull/9609)|Add valid retry solution to mvn-verify [skip ci]|
-|[#9655](https://github.com/NVIDIA/spark-rapids/pull/9655)|Document problem with handling of invalid characters in CSV reader|
-|[#9620](https://github.com/NVIDIA/spark-rapids/pull/9620)|Add support for parsing boolean values in `from_json`|
-|[#9615](https://github.com/NVIDIA/spark-rapids/pull/9615)|Bloop updates - require JDK11 in buildall + docs, build bloop for all targets.|
-|[#9631](https://github.com/NVIDIA/spark-rapids/pull/9631)|Refactor Parquet readers|
-|[#9637](https://github.com/NVIDIA/spark-rapids/pull/9637)|Added Support For Various Execs for Databricks 13.3 |
-|[#9640](https://github.com/NVIDIA/spark-rapids/pull/9640)|Add support for `ignoreNullFields=false` in `to_json`|
-|[#9623](https://github.com/NVIDIA/spark-rapids/pull/9623)|Running window optimization for `LAST()`|
-|[#9641](https://github.com/NVIDIA/spark-rapids/pull/9641)|Revert "Support rebase checking for nested dates and timestamps (#9617)"|
-|[#9423](https://github.com/NVIDIA/spark-rapids/pull/9423)|Re-enable `from_json` / `JsonToStructs`|
-|[#9624](https://github.com/NVIDIA/spark-rapids/pull/9624)|Add jenkins-level retry for pre-merge build in databricks runtimes|
-|[#9608](https://github.com/NVIDIA/spark-rapids/pull/9608)|Fix nullability issues for some decimal operations|
-|[#9617](https://github.com/NVIDIA/spark-rapids/pull/9617)|Support rebase checking for nested dates and timestamps|
-|[#9611](https://github.com/NVIDIA/spark-rapids/pull/9611)|Move simple classes after refactoring to sql-plugin-api|
-|[#9618](https://github.com/NVIDIA/spark-rapids/pull/9618)|Remove unused dataTypes argument from HostShuffleCoalesceIterator|
-|[#9626](https://github.com/NVIDIA/spark-rapids/pull/9626)|Fix ENV typo in pre-merge github actions [skip ci]|
-|[#9593](https://github.com/NVIDIA/spark-rapids/pull/9593)|PythonRunner and RapidsErrorUtils Changes For Databricks 13.3|
-|[#9607](https://github.com/NVIDIA/spark-rapids/pull/9607)|Integration tests: Install specific fastparquet version.|
-|[#9610](https://github.com/NVIDIA/spark-rapids/pull/9610)|Propagate local properties to broadcast execs|
-|[#9544](https://github.com/NVIDIA/spark-rapids/pull/9544)|Support batching for `RANGE` running window aggregations. Including on|
-|[#9601](https://github.com/NVIDIA/spark-rapids/pull/9601)|Remove usage of deprecated scala.Proxy|
-|[#9591](https://github.com/NVIDIA/spark-rapids/pull/9591)|Enable implicit JDK profile activation|
-|[#9586](https://github.com/NVIDIA/spark-rapids/pull/9586)|Merge metrics and file format fixes to Delta 2.4 support|
-|[#9594](https://github.com/NVIDIA/spark-rapids/pull/9594)|Revert "Ignore failing Parquet filter test to unblock CI (#9519)"|
-|[#9454](https://github.com/NVIDIA/spark-rapids/pull/9454)|Support encryption and compression in disk store|
-|[#9439](https://github.com/NVIDIA/spark-rapids/pull/9439)|Support stack function|
-|[#9583](https://github.com/NVIDIA/spark-rapids/pull/9583)|Fix fastparquet tests to work with HDFS|
-|[#9508](https://github.com/NVIDIA/spark-rapids/pull/9508)|Consolidate deps switching in an intermediate pom|
-|[#9562](https://github.com/NVIDIA/spark-rapids/pull/9562)|Delta Lake 2.3.0 support|
-|[#9576](https://github.com/NVIDIA/spark-rapids/pull/9576)|Move Stack classes to wrapper classes to fix non-deterministic build issue|
-|[#9572](https://github.com/NVIDIA/spark-rapids/pull/9572)|Add retry for CrossJoinIterator and ConditionalNestedLoopJoinIterator|
-|[#9575](https://github.com/NVIDIA/spark-rapids/pull/9575)|Fix `test_window_running*()` for `NTH_VALUE IGNORE NULLS`.|
-|[#9574](https://github.com/NVIDIA/spark-rapids/pull/9574)|Fix broken #endif scala comments [skip ci]|
-|[#9568](https://github.com/NVIDIA/spark-rapids/pull/9568)|Enforce Apache 3.3.0+ for Scala 2.13|
-|[#9557](https://github.com/NVIDIA/spark-rapids/pull/9557)|Support launching Map Pandas UDF on empty partitions|
-|[#9489](https://github.com/NVIDIA/spark-rapids/pull/9489)|Batching support for ROW-based `FIRST()` window function|
-|[#9510](https://github.com/NVIDIA/spark-rapids/pull/9510)|Add Databricks 13.3 shim boilerplate code and refactor Databricks 12.2 shim|
-|[#9554](https://github.com/NVIDIA/spark-rapids/pull/9554)|Fix fastparquet installation for|
-|[#9536](https://github.com/NVIDIA/spark-rapids/pull/9536)|Add CPU POC of TimeZoneDB; Test some time zones by comparing CPU POC and Spark|
-|[#9558](https://github.com/NVIDIA/spark-rapids/pull/9558)|Support integration test against scala2.13 spark binaries[skip ci]|
-|[#8592](https://github.com/NVIDIA/spark-rapids/pull/8592)|Scala 2.13 Support|
-|[#9551](https://github.com/NVIDIA/spark-rapids/pull/9551)|Enable malformed Parquet failure test|
-|[#9546](https://github.com/NVIDIA/spark-rapids/pull/9546)|Support OverwriteByExpressionExecV1 for Delta Lake tables|
-|[#9527](https://github.com/NVIDIA/spark-rapids/pull/9527)|Support Split And Retry for GpuProjectAstExec|
-|[#9541](https://github.com/NVIDIA/spark-rapids/pull/9541)|Move simple classes to API|
-|[#9548](https://github.com/NVIDIA/spark-rapids/pull/9548)|Append new authorized user to blossom-ci whitelist [skip ci]|
-|[#9418](https://github.com/NVIDIA/spark-rapids/pull/9418)|Fix STRUCT comparison between Pandas and Spark dataframes in fastparquet tests|
-|[#9468](https://github.com/NVIDIA/spark-rapids/pull/9468)|Add SplitAndRetry to GpuRunningWindowIterator|
-|[#9486](https://github.com/NVIDIA/spark-rapids/pull/9486)|Add partial support for `to_json`|
-|[#9538](https://github.com/NVIDIA/spark-rapids/pull/9538)|Fix tiered project breaking higher order functions|
-|[#9539](https://github.com/NVIDIA/spark-rapids/pull/9539)|Add delta-24x to delta-lake/README.md [skip ci]|
-|[#9534](https://github.com/NVIDIA/spark-rapids/pull/9534)|Add pyarrow tests for Databricks runtime|
-|[#9444](https://github.com/NVIDIA/spark-rapids/pull/9444)|Remove redundant pass-through shuffle manager classes|
-|[#9531](https://github.com/NVIDIA/spark-rapids/pull/9531)|Fix relative path for spark-shell nightly test [skip ci]|
-|[#9525](https://github.com/NVIDIA/spark-rapids/pull/9525)|Follow-up to dbdeps consolidation|
-|[#9506](https://github.com/NVIDIA/spark-rapids/pull/9506)|Move ProxyShuffleInternalManagerBase to api|
-|[#9504](https://github.com/NVIDIA/spark-rapids/pull/9504)|Add a spark-shell smoke test to premerge and nightly|
-|[#9519](https://github.com/NVIDIA/spark-rapids/pull/9519)|Ignore failing Parquet filter test to unblock CI|
-|[#9478](https://github.com/NVIDIA/spark-rapids/pull/9478)|Support AppendDataExecV1 for Delta Lake tables|
-|[#9366](https://github.com/NVIDIA/spark-rapids/pull/9366)|Add tests to check compatibility with `fastparquet`|
-|[#9419](https://github.com/NVIDIA/spark-rapids/pull/9419)|Add retry to RoundRobin Partitioner and Range Partitioner|
-|[#9502](https://github.com/NVIDIA/spark-rapids/pull/9502)|Install Dependencies Needed For Databricks 13.3|
-|[#9296](https://github.com/NVIDIA/spark-rapids/pull/9296)|Implement `percentile` aggregation|
-|[#9488](https://github.com/NVIDIA/spark-rapids/pull/9488)|Add Shim JSON Headers for Databricks 13.3|
-|[#9443](https://github.com/NVIDIA/spark-rapids/pull/9443)|Add AtomicReplaceTableAsSelectExec support for Delta Lake|
-|[#9476](https://github.com/NVIDIA/spark-rapids/pull/9476)|Refactor common Delta Lake test code|
-|[#9463](https://github.com/NVIDIA/spark-rapids/pull/9463)|Fix Cloudera 3.3.2 shim for handling CheckOverflowInTableInsert and orc zstd support|
-|[#9460](https://github.com/NVIDIA/spark-rapids/pull/9460)|Update links in old release notes to new doc locations [skip ci]|
-|[#9405](https://github.com/NVIDIA/spark-rapids/pull/9405)|Wrap scalar generation into spark session in integration test|
-|[#9459](https://github.com/NVIDIA/spark-rapids/pull/9459)|Fix 332cdh build [skip ci]|
-|[#9425](https://github.com/NVIDIA/spark-rapids/pull/9425)|Add support for AtomicCreateTableAsSelect with Delta Lake|
-|[#9434](https://github.com/NVIDIA/spark-rapids/pull/9434)|Add retry support to `HostToGpuCoalesceIterator.concatAllAndPutOnGPU`|
-|[#9453](https://github.com/NVIDIA/spark-rapids/pull/9453)|Update codeowner and blossom-ci ACL [skip ci]|
-|[#9396](https://github.com/NVIDIA/spark-rapids/pull/9396)|Add support for Cloudera CDS-3.3.2|
-|[#9380](https://github.com/NVIDIA/spark-rapids/pull/9380)|Fix parsing of Parquet legacy list-of-struct format|
-|[#9438](https://github.com/NVIDIA/spark-rapids/pull/9438)|Fix auto merge conflict 9437 [skip ci]|
-|[#9424](https://github.com/NVIDIA/spark-rapids/pull/9424)|Refactor aggregate functions|
-|[#9414](https://github.com/NVIDIA/spark-rapids/pull/9414)|Add retry to GpuHashJoin.filterNulls|
-|[#9388](https://github.com/NVIDIA/spark-rapids/pull/9388)|Add developer documentation about working with data sources [skip ci]|
-|[#9369](https://github.com/NVIDIA/spark-rapids/pull/9369)|Improve JSON empty row fix to use less memory|
-|[#9373](https://github.com/NVIDIA/spark-rapids/pull/9373)|Fix auto merge conflict 9372|
-|[#9308](https://github.com/NVIDIA/spark-rapids/pull/9308)|Initiate arm64 CI support [skip ci]|
-|[#9292](https://github.com/NVIDIA/spark-rapids/pull/9292)|Init project version 23.12.0-SNAPSHOT|
 ## Older Releases
 Changelog of older releases can be found at [docs/archives](/docs/archives)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a32e3ffa5f4..77bf860f40d
100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -130,15 +130,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests Verify that shim-specific classes are hidden from a conventional classloader. ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-24.04.0-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl ``` However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 -Warning: File dist/target/rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl +$ javap -cp dist/target/rapids-4-spark_2.12-24.04.0-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +Warning: File dist/target/rapids-4-spark_2.12-24.04.0-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` @@ -181,7 +181,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable: ```bash -$ javap -cp dist/target/rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 +$ javap -cp dist/target/rapids-4-spark_2.12-24.04.0-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2 Compiled from "SparkShims.scala" public final class com.nvidia.spark.rapids.shims.SparkShimImpl { ``` diff --git a/README.md b/README.md index a5ec2d19828..96e4802d0d7 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 24.02.0 + 24.04.0 provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index 4844b6b1e9f..d21a93a7562 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.02.0 + 24.04.0 ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.02.0 + 24.04.0 aggregator @@ -369,23 +369,6 @@ - - release321db - - - buildver - 321db - - - - - com.nvidia - rapids-4-spark-delta-spark321db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - release322 @@ -762,5 +745,22 @@ + + release351 + + + buildver + 351 + + + + + com.nvidia + rapids-4-spark-delta-stub_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + diff --git a/api_validation/pom.xml b/api_validation/pom.xml index 173ff248947..0feb6ce23ef 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.02.0 + 24.04.0 ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.02.0 + 24.04.0 api_validation diff --git a/build/buildall b/build/buildall index 356efa2d46d..e8c0610deb7 100755 --- a/build/buildall +++ b/build/buildall @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -264,7 +264,7 @@ function build_single_shim() { -Drat.skip="$SKIP_CHECKS" \ -Dmaven.scaladoc.skip \ -Dmaven.scalastyle.skip="$SKIP_CHECKS" \ - -pl aggregator -am > "$LOG_FILE" 2>&1 || { + -pl tools -am > "$LOG_FILE" 2>&1 || { [[ "$LOG_FILE" != "/dev/tty" ]] && echo "$LOG_FILE:" && tail -20 "$LOG_FILE" || true exit 255 } diff --git a/datagen/README.md b/datagen/README.md index 5fc3aa06de3..983f2de5e2c 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.02.0 release would be -`target/datagen_2.12-24.02.0-spark330.jar` +for example a Spark 3.3.0 jar for the 24.04.0 release would be +`target/datagen_2.12-24.04.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.02.0-spark330.jar +spark-shell --jars target/datagen_2.12-24.04.0-spark330.jar ``` After that you should be good to go. @@ -544,4 +544,4 @@ flexible. # Scale Test Data Generation Entry In order to generate large scale dataset to test the query engine, we use the data generation library above to create a test suite. For more details like the data schema, -how to use the test suite etc, please refer to [ScaleTest.md](./ScaleTest.md). \ No newline at end of file +how to use the test suite etc, please refer to [ScaleTest.md](./ScaleTest.md). diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index 7e8e7ad1b66..b892da20c1d 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.02.0-spark332.jar \ +./target/datagen_2.12-24.04.0-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index 87a798d8d3f..1ab95091566 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -1,6 +1,6 @@ - - 4.0.0 - - - com.nvidia - rapids-4-spark-jdk-profiles_2.12 - 24.02.0 - ../../jdk-profiles/pom.xml - - - rapids-4-spark-delta-spark321db_2.12 - RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support - Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.02.0 - - - ../delta-lake/delta-spark321db - false - **/* - package - - - - - com.nvidia - rapids-4-spark-sql_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - provided - - - com.nvidia - rapids-4-spark-db-bom - ${project.version} - pom - provided - - - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-common-sources - generate-sources - - add-source - - - - - ${project.basedir}/../common/src/main/scala - ${project.basedir}/../common/src/main/databricks/scala - - - - - - - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.rat - apache-rat-plugin - - - - diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCreateDeltaTableCommand.scala 
b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCreateDeltaTableCommand.scala deleted file mode 100644 index bb8738c950f..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCreateDeltaTableCommand.scala +++ /dev/null @@ -1,455 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from DeltaDataSource.scala in the - * Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.databricks.sql.transaction.tahoe.rapids - -import com.databricks.sql.transaction.tahoe._ -import com.databricks.sql.transaction.tahoe.actions.Metadata -import com.databricks.sql.transaction.tahoe.commands.{TableCreationModes, WriteIntoDelta} -import com.databricks.sql.transaction.tahoe.metering.DeltaLogging -import com.databricks.sql.transaction.tahoe.schema.SchemaUtils -import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf -import com.nvidia.spark.rapids.RapidsConf -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} - -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.connector.catalog.Identifier -import org.apache.spark.sql.execution.command.{LeafRunnableCommand, RunnableCommand} -import org.apache.spark.sql.types.StructType - -/** - * Single entry point for all write or declaration operations for Delta tables accessed through - * the table name. - * - * @param table The table identifier for the Delta table - * @param existingTableOpt The existing table for the same identifier if exists - * @param mode The save mode when writing data. Relevant when the query is empty or set to Ignore - * with `CREATE TABLE IF NOT EXISTS`. - * @param query The query to commit into the Delta table if it exist. This can come from - * - CTAS - * - saveAsTable - */ -case class GpuCreateDeltaTableCommand( - table: CatalogTable, - existingTableOpt: Option[CatalogTable], - mode: SaveMode, - query: Option[LogicalPlan], - operation: TableCreationModes.CreationMode = TableCreationModes.Create, - tableByPath: Boolean = false, - override val output: Seq[Attribute] = Nil)(@transient rapidsConf: RapidsConf) - extends LeafRunnableCommand - with DeltaLogging { - - override def run(sparkSession: SparkSession): Seq[Row] = { - val table = this.table - - assert(table.tableType != CatalogTableType.VIEW) - assert(table.identifier.database.isDefined, "Database should've been fixed at analysis") - // There is a subtle race condition here, where the table can be created by someone else - // while this command is running. 
Nothing we can do about that though :( - val tableExists = existingTableOpt.isDefined - if (mode == SaveMode.Ignore && tableExists) { - // Early exit on ignore - return Nil - } else if (mode == SaveMode.ErrorIfExists && tableExists) { - throw new AnalysisException(s"DTable ${table.identifier.quotedString} already exists.") - } - - val tableWithLocation = if (tableExists) { - val existingTable = existingTableOpt.get - table.storage.locationUri match { - case Some(location) if location.getPath != existingTable.location.getPath => - val tableName = table.identifier.quotedString - throw new AnalysisException( - s"The location of the existing table $tableName is " + - s"`${existingTable.location}`. It doesn't match the specified location " + - s"`${table.location}`.") - case _ => - } - table.copy( - storage = existingTable.storage, - tableType = existingTable.tableType) - } else if (table.storage.locationUri.isEmpty) { - // We are defining a new managed table - assert(table.tableType == CatalogTableType.MANAGED) - val loc = sparkSession.sessionState.catalog.defaultTablePath(table.identifier) - table.copy(storage = table.storage.copy(locationUri = Some(loc))) - } else { - // 1. We are defining a new external table - // 2. It's a managed table which already has the location populated. This can happen in DSV2 - // CTAS flow. - table - } - - - val isManagedTable = tableWithLocation.tableType == CatalogTableType.MANAGED - val tableLocation = new Path(tableWithLocation.location) - val gpuDeltaLog = GpuDeltaLog.forTable(sparkSession, tableLocation, rapidsConf) - val hadoopConf = gpuDeltaLog.deltaLog.newDeltaHadoopConf() - val fs = tableLocation.getFileSystem(hadoopConf) - val options = new DeltaOptions(table.storage.properties, sparkSession.sessionState.conf) - var result: Seq[Row] = Nil - - recordDeltaOperation(gpuDeltaLog.deltaLog, "delta.ddl.createTable") { - val txn = gpuDeltaLog.startTransaction() - if (query.isDefined) { - // If the mode is Ignore or ErrorIfExists, the table must not exist, or we would return - // earlier. And the data should not exist either, to match the behavior of - // Ignore/ErrorIfExists mode. This means the table path should not exist or is empty. - if (mode == SaveMode.Ignore || mode == SaveMode.ErrorIfExists) { - assert(!tableExists) - // We may have failed a previous write. The retry should still succeed even if we have - // garbage data - if (txn.readVersion > -1 || !fs.exists(gpuDeltaLog.deltaLog.logPath)) { - assertPathEmpty(hadoopConf, tableWithLocation) - } - } - // We are either appending/overwriting with saveAsTable or creating a new table with CTAS or - // we are creating a table as part of a RunnableCommand - query.get match { - case writer: WriteIntoDelta => - // In the V2 Writer, methods like "replace" and "createOrReplace" implicitly mean that - // the metadata should be changed. This wasn't the behavior for DataFrameWriterV1. 
- if (!isV1Writer) { - replaceMetadataIfNecessary( - txn, tableWithLocation, options, writer.data.schema.asNullable) - } - val actions = writer.write(txn, sparkSession) - val op = getOperation(txn.metadata, isManagedTable, Some(options)) - txn.commit(actions, op) - case cmd: RunnableCommand => - result = cmd.run(sparkSession) - case other => - // When using V1 APIs, the `other` plan is not yet optimized, therefore, it is safe - // to once again go through analysis - val data = Dataset.ofRows(sparkSession, other) - - // In the V2 Writer, methods like "replace" and "createOrReplace" implicitly mean that - // the metadata should be changed. This wasn't the behavior for DataFrameWriterV1. - if (!isV1Writer) { - replaceMetadataIfNecessary( - txn, tableWithLocation, options, other.schema.asNullable) - } - - val actions = WriteIntoDelta( - deltaLog = gpuDeltaLog.deltaLog, - mode = mode, - options, - partitionColumns = table.partitionColumnNames, - configuration = tableWithLocation.properties + ("comment" -> table.comment.orNull), - data = data).write(txn, sparkSession) - - val op = getOperation(txn.metadata, isManagedTable, Some(options)) - txn.commit(actions, op) - } - } else { - def createTransactionLogOrVerify(): Unit = { - if (isManagedTable) { - // When creating a managed table, the table path should not exist or is empty, or - // users would be surprised to see the data, or see the data directory being dropped - // after the table is dropped. - assertPathEmpty(hadoopConf, tableWithLocation) - } - - // This is either a new table, or, we never defined the schema of the table. While it is - // unexpected that `txn.metadata.schema` to be empty when txn.readVersion >= 0, we still - // guard against it, in case of checkpoint corruption bugs. - val noExistingMetadata = txn.readVersion == -1 || txn.metadata.schema.isEmpty - if (noExistingMetadata) { - assertTableSchemaDefined(fs, tableLocation, tableWithLocation, txn, sparkSession) - assertPathEmpty(hadoopConf, tableWithLocation) - // This is a user provided schema. - // Doesn't come from a query, Follow nullability invariants. - val newMetadata = getProvidedMetadata(tableWithLocation, table.schema.json) - txn.updateMetadataForNewTable(newMetadata) - - val op = getOperation(newMetadata, isManagedTable, None) - txn.commit(Nil, op) - } else { - verifyTableMetadata(txn, tableWithLocation) - } - } - // We are defining a table using the Create or Replace Table statements. 
- operation match { - case TableCreationModes.Create => - require(!tableExists, "Can't recreate a table when it exists") - createTransactionLogOrVerify() - - case TableCreationModes.CreateOrReplace if !tableExists => - // If the table doesn't exist, CREATE OR REPLACE must provide a schema - if (tableWithLocation.schema.isEmpty) { - throw DeltaErrors.schemaNotProvidedException - } - createTransactionLogOrVerify() - case _ => - // When the operation is a REPLACE or CREATE OR REPLACE, then the schema shouldn't be - // empty, since we'll use the entry to replace the schema - if (tableWithLocation.schema.isEmpty) { - throw DeltaErrors.schemaNotProvidedException - } - // We need to replace - replaceMetadataIfNecessary(txn, tableWithLocation, options, tableWithLocation.schema) - // Truncate the table - val operationTimestamp = System.currentTimeMillis() - val removes = txn.filterFiles().map(_.removeWithTimestamp(operationTimestamp)) - val op = getOperation(txn.metadata, isManagedTable, None) - txn.commit(removes, op) - } - } - - // We would have failed earlier on if we couldn't ignore the existence of the table - // In addition, we just might using saveAsTable to append to the table, so ignore the creation - // if it already exists. - // Note that someone may have dropped and recreated the table in a separate location in the - // meantime... Unfortunately we can't do anything there at the moment, because Hive sucks. - logInfo(s"Table is path-based table: $tableByPath. Update catalog with mode: $operation") - updateCatalog(sparkSession, tableWithLocation, gpuDeltaLog.deltaLog.snapshot, txn) - - result - } - } - - - private def getProvidedMetadata(table: CatalogTable, schemaString: String): Metadata = { - Metadata( - description = table.comment.orNull, - schemaString = schemaString, - partitionColumns = table.partitionColumnNames, - configuration = table.properties, - createdTime = Some(System.currentTimeMillis())) - } - - private def assertPathEmpty( - hadoopConf: Configuration, - tableWithLocation: CatalogTable): Unit = { - val path = new Path(tableWithLocation.location) - val fs = path.getFileSystem(hadoopConf) - // Verify that the table location associated with CREATE TABLE doesn't have any data. Note that - // we intentionally diverge from this behavior w.r.t regular datasource tables (that silently - // overwrite any previous data) - if (fs.exists(path) && fs.listStatus(path).nonEmpty) { - throw new AnalysisException(s"Cannot create table ('${tableWithLocation.identifier}')." + - s" The associated location ('${tableWithLocation.location}') is not empty but " + - s"it's not a Delta table") - } - } - - private def assertTableSchemaDefined( - fs: FileSystem, - path: Path, - table: CatalogTable, - txn: OptimisticTransaction, - sparkSession: SparkSession): Unit = { - // Users did not specify the schema. We expect the schema exists in Delta. - if (table.schema.isEmpty) { - if (table.tableType == CatalogTableType.EXTERNAL) { - if (fs.exists(path) && fs.listStatus(path).nonEmpty) { - throw DeltaErrors.createExternalTableWithoutLogException( - path, table.identifier.quotedString, sparkSession) - } else { - throw DeltaErrors.createExternalTableWithoutSchemaException( - path, table.identifier.quotedString, sparkSession) - } - } else { - throw DeltaErrors.createManagedTableWithoutSchemaException( - table.identifier.quotedString, sparkSession) - } - } - } - - /** - * Verify against our transaction metadata that the user specified the right metadata for the - * table. 
- */ - private def verifyTableMetadata( - txn: OptimisticTransaction, - tableDesc: CatalogTable): Unit = { - val existingMetadata = txn.metadata - val path = new Path(tableDesc.location) - - // The delta log already exists. If they give any configuration, we'll make sure it all matches. - // Otherwise we'll just go with the metadata already present in the log. - // The schema compatibility checks will be made in `WriteIntoDelta` for CreateTable - // with a query - if (txn.readVersion > -1) { - if (tableDesc.schema.nonEmpty) { - // We check exact alignment on create table if everything is provided - // However, if in column mapping mode, we can safely ignore the related metadata fields in - // existing metadata because new table desc will not have related metadata assigned yet - val differences = SchemaUtils.reportDifferences( - DeltaColumnMapping.dropColumnMappingMetadata(existingMetadata.schema), - tableDesc.schema) - if (differences.nonEmpty) { - throw DeltaErrors.createTableWithDifferentSchemaException( - path, tableDesc.schema, existingMetadata.schema, differences) - } - } - - // If schema is specified, we must make sure the partitioning matches, even the partitioning - // is not specified. - if (tableDesc.schema.nonEmpty && - tableDesc.partitionColumnNames != existingMetadata.partitionColumns) { - throw DeltaErrors.createTableWithDifferentPartitioningException( - path, tableDesc.partitionColumnNames, existingMetadata.partitionColumns) - } - - if (tableDesc.properties.nonEmpty && tableDesc.properties != existingMetadata.configuration) { - throw DeltaErrors.createTableWithDifferentPropertiesException( - path, tableDesc.properties, existingMetadata.configuration) - } - } - } - - /** - * Based on the table creation operation, and parameters, we can resolve to different operations. - * A lot of this is needed for legacy reasons in Databricks Runtime. 
- * @param metadata The table metadata, which we are creating or replacing - * @param isManagedTable Whether we are creating or replacing a managed table - * @param options Write options, if this was a CTAS/RTAS - */ - private def getOperation( - metadata: Metadata, - isManagedTable: Boolean, - options: Option[DeltaOptions]): DeltaOperations.Operation = operation match { - // This is legacy saveAsTable behavior in Databricks Runtime - case TableCreationModes.Create if existingTableOpt.isDefined && query.isDefined => - DeltaOperations.Write(mode, Option(table.partitionColumnNames), options.get.replaceWhere, - options.flatMap(_.userMetadata)) - - // DataSourceV2 table creation - // CREATE TABLE (non-DataFrameWriter API) doesn't have options syntax - // (userMetadata uses SQLConf in this case) - case TableCreationModes.Create => - DeltaOperations.CreateTable(metadata, isManagedTable, query.isDefined) - - // DataSourceV2 table replace - // REPLACE TABLE (non-DataFrameWriter API) doesn't have options syntax - // (userMetadata uses SQLConf in this case) - case TableCreationModes.Replace => - DeltaOperations.ReplaceTable(metadata, isManagedTable, orCreate = false, query.isDefined) - - // Legacy saveAsTable with Overwrite mode - case TableCreationModes.CreateOrReplace if options.exists(_.replaceWhere.isDefined) => - DeltaOperations.Write(mode, Option(table.partitionColumnNames), options.get.replaceWhere, - options.flatMap(_.userMetadata)) - - // New DataSourceV2 saveAsTable with overwrite mode behavior - case TableCreationModes.CreateOrReplace => - DeltaOperations.ReplaceTable(metadata, isManagedTable, orCreate = true, query.isDefined, - options.flatMap(_.userMetadata)) - } - - /** - * Similar to getOperation, here we disambiguate the catalog alterations we need to do based - * on the table operation, and whether we have reached here through legacy code or DataSourceV2 - * code paths. - */ - private def updateCatalog( - spark: SparkSession, - table: CatalogTable, - snapshot: Snapshot, - txn: OptimisticTransaction): Unit = { - val cleaned = cleanupTableDefinition(table, snapshot) - operation match { - case _ if tableByPath => // do nothing with the metastore if this is by path - case TableCreationModes.Create => - spark.sessionState.catalog.createTable( - cleaned, - ignoreIfExists = existingTableOpt.isDefined, - validateLocation = false) - case TableCreationModes.Replace | TableCreationModes.CreateOrReplace - if existingTableOpt.isDefined => - spark.sessionState.catalog.alterTable(table) - case TableCreationModes.Replace => - val ident = Identifier.of(table.identifier.database.toArray, table.identifier.table) - throw new CannotReplaceMissingTableException(ident) - case TableCreationModes.CreateOrReplace => - spark.sessionState.catalog.createTable( - cleaned, - ignoreIfExists = false, - validateLocation = false) - } - } - - /** Clean up the information we pass on to store in the catalog. 
*/ - private def cleanupTableDefinition(table: CatalogTable, snapshot: Snapshot): CatalogTable = { - // These actually have no effect on the usability of Delta, but feature flagging legacy - // behavior for now - val storageProps = if (conf.getConf(DeltaSQLConf.DELTA_LEGACY_STORE_WRITER_OPTIONS_AS_PROPS)) { - // Legacy behavior - table.storage - } else { - table.storage.copy(properties = Map.empty) - } - - table.copy( - schema = new StructType(), - properties = Map.empty, - partitionColumnNames = Nil, - // Remove write specific options when updating the catalog - storage = storageProps, - tracksPartitionsInCatalog = true) - } - - /** - * With DataFrameWriterV2, methods like `replace()` or `createOrReplace()` mean that the - * metadata of the table should be replaced. If overwriteSchema=false is provided with these - * methods, then we will verify that the metadata match exactly. - */ - private def replaceMetadataIfNecessary( - txn: OptimisticTransaction, - tableDesc: CatalogTable, - options: DeltaOptions, - schema: StructType): Unit = { - val isReplace = (operation == TableCreationModes.CreateOrReplace || - operation == TableCreationModes.Replace) - // If a user explicitly specifies not to overwrite the schema, during a replace, we should - // tell them that it's not supported - val dontOverwriteSchema = options.options.contains(DeltaOptions.OVERWRITE_SCHEMA_OPTION) && - !options.canOverwriteSchema - if (isReplace && dontOverwriteSchema) { - throw DeltaErrors.illegalUsageException(DeltaOptions.OVERWRITE_SCHEMA_OPTION, "replacing") - } - if (txn.readVersion > -1L && isReplace && !dontOverwriteSchema) { - // When a table already exists, and we're using the DataFrameWriterV2 API to replace - // or createOrReplace a table, we blindly overwrite the metadata. - txn.updateMetadataForNewTable(getProvidedMetadata(table, schema.json)) - } - } - - /** - * Horrible hack to differentiate between DataFrameWriterV1 and V2 so that we can decide - * what to do with table metadata. In DataFrameWriterV1, mode("overwrite").saveAsTable, - * behaves as a CreateOrReplace table, but we have asked for "overwriteSchema" as an - * explicit option to overwrite partitioning or schema information. With DataFrameWriterV2, - * the behavior asked for by the user is clearer: .createOrReplace(), which means that we - * should overwrite schema and/or partitioning. Therefore we have this hack. - */ - private def isV1Writer: Boolean = { - Thread.currentThread().getStackTrace.exists(_.toString.contains( - classOf[DataFrameWriter[_]].getCanonicalName + ".")) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeleteCommand.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeleteCommand.scala deleted file mode 100644 index ad09c775b5c..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeleteCommand.scala +++ /dev/null @@ -1,386 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from DeleteCommand.scala - * in the Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.databricks.sql.transaction.tahoe.rapids - -import com.databricks.sql.transaction.tahoe.{DeltaConfigs, DeltaLog, DeltaOperations, DeltaTableUtils, OptimisticTransaction} -import com.databricks.sql.transaction.tahoe.actions.{Action, AddCDCFile, FileAction} -import com.databricks.sql.transaction.tahoe.commands.{DeleteMetric, DeltaCommand} -import com.databricks.sql.transaction.tahoe.commands.MergeIntoCommand.totalBytesAndDistinctPartitionValues -import com.databricks.sql.transaction.tahoe.files.TahoeBatchFileIndex -import com.databricks.sql.transaction.tahoe.rapids.GpuDeleteCommand.{rewritingFilesMsg, FINDING_TOUCHED_FILES_MSG} -import com.nvidia.spark.rapids.delta.GpuDeltaMetricUpdateUDF - -import org.apache.spark.SparkContext -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, EqualNullSafe, Expression, If, Literal, Not} -import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.command.LeafRunnableCommand -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric} -import org.apache.spark.sql.functions.{input_file_name, lit, typedLit, udf} -import org.apache.spark.sql.types.LongType - -trait DeleteCommandMetrics { self: LeafRunnableCommand => - @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() - - def createMetrics: Map[String, SQLMetric] = Map[String, SQLMetric]( - "numRemovedFiles" -> createMetric(sc, "number of files removed."), - "numAddedFiles" -> createMetric(sc, "number of files added."), - "numDeletedRows" -> createMetric(sc, "number of rows deleted."), - "numFilesBeforeSkipping" -> createMetric(sc, "number of files before skipping"), - "numBytesBeforeSkipping" -> createMetric(sc, "number of bytes before skipping"), - "numFilesAfterSkipping" -> createMetric(sc, "number of files after skipping"), - "numBytesAfterSkipping" -> createMetric(sc, "number of bytes after skipping"), - "numPartitionsAfterSkipping" -> createMetric(sc, "number of partitions after skipping"), - "numPartitionsAddedTo" -> createMetric(sc, "number of partitions added"), - "numPartitionsRemovedFrom" -> createMetric(sc, "number of partitions removed"), - "numCopiedRows" -> createMetric(sc, "number of rows copied"), - "numBytesAdded" -> createMetric(sc, "number of bytes added"), - "numBytesRemoved" -> createMetric(sc, "number of bytes removed"), - "executionTimeMs" -> - createTimingMetric(sc, "time taken to execute the entire operation"), - "scanTimeMs" -> - createTimingMetric(sc, "time taken to scan the files for matches"), - "rewriteTimeMs" -> - createTimingMetric(sc, "time taken to rewrite the matched files"), - "numAddedChangeFiles" -> createMetric(sc, "number of change data capture files generated"), - "changeFileBytes" -> createMetric(sc, "total size of change data capture files generated"), - 
"numTouchedRows" -> createMetric(sc, "number of rows touched") - ) -} - -/** - * GPU version of Delta Lake DeleteCommand. - * - * Performs a Delete based on the search condition - * - * Algorithm: - * 1) Scan all the files and determine which files have - * the rows that need to be deleted. - * 2) Traverse the affected files and rebuild the touched files. - * 3) Use the Delta protocol to atomically write the remaining rows to new files and remove - * the affected files that are identified in step 1. - */ -case class GpuDeleteCommand( - gpuDeltaLog: GpuDeltaLog, - target: LogicalPlan, - condition: Option[Expression]) - extends LeafRunnableCommand with DeltaCommand with DeleteCommandMetrics { - - override def innerChildren: Seq[QueryPlan[_]] = Seq(target) - - override val output: Seq[Attribute] = Seq(AttributeReference("num_affected_rows", LongType)()) - - override lazy val metrics = createMetrics - - final override def run(sparkSession: SparkSession): Seq[Row] = { - val deltaLog = gpuDeltaLog.deltaLog - recordDeltaOperation(gpuDeltaLog.deltaLog, "delta.dml.delete") { - deltaLog.assertRemovable() - gpuDeltaLog.withNewTransaction { txn => - val deleteActions = performDelete(sparkSession, deltaLog, txn) - if (deleteActions.nonEmpty) { - txn.commit(deleteActions, DeltaOperations.Delete(condition.toSeq)) - } - } - // Re-cache all cached plans(including this relation itself, if it's cached) that refer to - // this data source relation. - sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) - } - - // Adjust for deletes at partition boundaries. Deletes at partition boundaries is a metadata - // operation, therefore we don't actually have any information around how many rows were deleted - // While this info may exist in the file statistics, it's not guaranteed that we have these - // statistics. 
To avoid any performance regressions, we currently just return a -1 in such cases - if (metrics("numRemovedFiles").value > 0 && metrics("numDeletedRows").value == 0) { - Seq(Row(-1L)) - } else { - Seq(Row(metrics("numDeletedRows").value)) - } - } - - def performDelete( - sparkSession: SparkSession, - deltaLog: DeltaLog, - txn: OptimisticTransaction): Seq[Action] = { - import sparkSession.implicits._ - - var numRemovedFiles: Long = 0 - var numAddedFiles: Long = 0 - var numAddedChangeFiles: Long = 0 - var scanTimeMs: Long = 0 - var rewriteTimeMs: Long = 0 - var numBytesAdded: Long = 0 - var changeFileBytes: Long = 0 - var numBytesRemoved: Long = 0 - var numFilesBeforeSkipping: Long = 0 - var numBytesBeforeSkipping: Long = 0 - var numFilesAfterSkipping: Long = 0 - var numBytesAfterSkipping: Long = 0 - var numPartitionsAfterSkipping: Option[Long] = None - var numPartitionsRemovedFrom: Option[Long] = None - var numPartitionsAddedTo: Option[Long] = None - var numDeletedRows: Option[Long] = None - var numCopiedRows: Option[Long] = None - - val startTime = System.nanoTime() - val numFilesTotal = txn.snapshot.numOfFiles - - val deleteActions: Seq[Action] = condition match { - case None => - // Case 1: Delete the whole table if the condition is true - val allFiles = txn.filterFiles(Nil) - - numRemovedFiles = allFiles.size - scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - val (numBytes, numPartitions) = totalBytesAndDistinctPartitionValues(allFiles) - numBytesRemoved = numBytes - numFilesBeforeSkipping = numRemovedFiles - numBytesBeforeSkipping = numBytes - numFilesAfterSkipping = numRemovedFiles - numBytesAfterSkipping = numBytes - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsAfterSkipping = Some(numPartitions) - numPartitionsRemovedFrom = Some(numPartitions) - numPartitionsAddedTo = Some(0) - } - val operationTimestamp = System.currentTimeMillis() - allFiles.map(_.removeWithTimestamp(operationTimestamp)) - case Some(cond) => - val (metadataPredicates, otherPredicates) = - DeltaTableUtils.splitMetadataAndDataPredicates( - cond, txn.metadata.partitionColumns, sparkSession) - - numFilesBeforeSkipping = txn.snapshot.numOfFiles - numBytesBeforeSkipping = txn.snapshot.sizeInBytes - - if (otherPredicates.isEmpty) { - // Case 2: The condition can be evaluated using metadata only. - // Delete a set of files without the need of scanning any data files. - val operationTimestamp = System.currentTimeMillis() - val candidateFiles = txn.filterFiles(metadataPredicates) - - scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - numRemovedFiles = candidateFiles.size - numBytesRemoved = candidateFiles.map(_.size).sum - numFilesAfterSkipping = candidateFiles.size - val (numCandidateBytes, numCandidatePartitions) = - totalBytesAndDistinctPartitionValues(candidateFiles) - numBytesAfterSkipping = numCandidateBytes - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsAfterSkipping = Some(numCandidatePartitions) - numPartitionsRemovedFrom = Some(numCandidatePartitions) - numPartitionsAddedTo = Some(0) - } - candidateFiles.map(_.removeWithTimestamp(operationTimestamp)) - } else { - // Case 3: Delete the rows based on the condition. 
- val candidateFiles = txn.filterFiles(metadataPredicates ++ otherPredicates) - - numFilesAfterSkipping = candidateFiles.size - val (numCandidateBytes, numCandidatePartitions) = - totalBytesAndDistinctPartitionValues(candidateFiles) - numBytesAfterSkipping = numCandidateBytes - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsAfterSkipping = Some(numCandidatePartitions) - } - - val nameToAddFileMap = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) - - val fileIndex = new TahoeBatchFileIndex( - sparkSession, "delete", candidateFiles, deltaLog, deltaLog.dataPath, txn.snapshot) - // Keep everything from the resolved target except a new TahoeFileIndex - // that only involves the affected files instead of all files. - val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) - val data = Dataset.ofRows(sparkSession, newTarget) - val deletedRowCount = metrics("numDeletedRows") - val deletedRowUdf = udf { - new GpuDeltaMetricUpdateUDF(deletedRowCount) - }.asNondeterministic() - val filesToRewrite = - withStatusCode("DELTA", FINDING_TOUCHED_FILES_MSG) { - if (candidateFiles.isEmpty) { - Array.empty[String] - } else { - data.filter(new Column(cond)) - .select(input_file_name()) - .filter(deletedRowUdf()) - .distinct() - .as[String] - .collect() - } - } - - numRemovedFiles = filesToRewrite.length - scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - if (filesToRewrite.isEmpty) { - // Case 3.1: no row matches and no delete will be triggered - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsRemovedFrom = Some(0) - numPartitionsAddedTo = Some(0) - } - Nil - } else { - // Case 3.2: some files need an update to remove the deleted files - // Do the second pass and just read the affected files - val baseRelation = buildBaseRelation( - sparkSession, txn, "delete", deltaLog.dataPath, filesToRewrite, nameToAddFileMap) - // Keep everything from the resolved target except a new TahoeFileIndex - // that only involves the affected files instead of all files. 
- val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) - val targetDF = Dataset.ofRows(sparkSession, newTarget) - val filterCond = Not(EqualNullSafe(cond, Literal.TrueLiteral)) - val rewrittenActions = rewriteFiles(txn, targetDF, filterCond, filesToRewrite.length) - val (changeFiles, rewrittenFiles) = rewrittenActions - .partition(_.isInstanceOf[AddCDCFile]) - numAddedFiles = rewrittenFiles.size - val removedFiles = filesToRewrite.map(f => - getTouchedFile(deltaLog.dataPath, f, nameToAddFileMap)) - val (removedBytes, removedPartitions) = - totalBytesAndDistinctPartitionValues(removedFiles) - numBytesRemoved = removedBytes - val (rewrittenBytes, rewrittenPartitions) = - totalBytesAndDistinctPartitionValues(rewrittenFiles) - numBytesAdded = rewrittenBytes - if (txn.metadata.partitionColumns.nonEmpty) { - numPartitionsRemovedFrom = Some(removedPartitions) - numPartitionsAddedTo = Some(rewrittenPartitions) - } - numAddedChangeFiles = changeFiles.size - changeFileBytes = changeFiles.collect { case f: AddCDCFile => f.size }.sum - rewriteTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - scanTimeMs - numDeletedRows = Some(metrics("numDeletedRows").value) - numCopiedRows = Some(metrics("numTouchedRows").value - metrics("numDeletedRows").value) - - val operationTimestamp = System.currentTimeMillis() - removeFilesFromPaths(deltaLog, nameToAddFileMap, filesToRewrite, operationTimestamp) ++ - rewrittenActions - } - } - } - metrics("numRemovedFiles").set(numRemovedFiles) - metrics("numAddedFiles").set(numAddedFiles) - val executionTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - metrics("executionTimeMs").set(executionTimeMs) - metrics("scanTimeMs").set(scanTimeMs) - metrics("rewriteTimeMs").set(rewriteTimeMs) - metrics("numAddedChangeFiles").set(numAddedChangeFiles) - metrics("changeFileBytes").set(changeFileBytes) - metrics("numBytesAdded").set(numBytesAdded) - metrics("numBytesRemoved").set(numBytesRemoved) - metrics("numFilesBeforeSkipping").set(numFilesBeforeSkipping) - metrics("numBytesBeforeSkipping").set(numBytesBeforeSkipping) - metrics("numFilesAfterSkipping").set(numFilesAfterSkipping) - metrics("numBytesAfterSkipping").set(numBytesAfterSkipping) - numPartitionsAfterSkipping.foreach(metrics("numPartitionsAfterSkipping").set) - numPartitionsAddedTo.foreach(metrics("numPartitionsAddedTo").set) - numPartitionsRemovedFrom.foreach(metrics("numPartitionsRemovedFrom").set) - numCopiedRows.foreach(metrics("numCopiedRows").set) - txn.registerSQLMetrics(sparkSession, metrics) - // This is needed to make the SQL metrics visible in the Spark UI - val executionId = sparkSession.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates( - sparkSession.sparkContext, executionId, metrics.values.toSeq) - - recordDeltaEvent( - deltaLog, - "delta.dml.delete.stats", - data = DeleteMetric( - condition = condition.map(_.sql).getOrElse("true"), - numFilesTotal, - numFilesAfterSkipping, - numAddedFiles, - numRemovedFiles, - numAddedFiles, - numAddedChangeFiles = numAddedChangeFiles, - numFilesBeforeSkipping, - numBytesBeforeSkipping, - numFilesAfterSkipping, - numBytesAfterSkipping, - numPartitionsAfterSkipping, - numPartitionsAddedTo, - numPartitionsRemovedFrom, - numCopiedRows, - numDeletedRows, - numBytesAdded, - numBytesRemoved, - changeFileBytes = changeFileBytes, - scanTimeMs, - rewriteTimeMs) - ) - - deleteActions - } - - /** - * Returns the list of `AddFile`s and `AddCDCFile`s that have been re-written. 
- */ - private def rewriteFiles( - txn: OptimisticTransaction, - baseData: DataFrame, - filterCondition: Expression, - numFilesToRewrite: Long): Seq[FileAction] = { - val shouldWriteCdc = DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(txn.metadata) - - // number of total rows that we have seen / are either copying or deleting (sum of both). - val numTouchedRows = metrics("numTouchedRows") - val numTouchedRowsUdf = udf { - new GpuDeltaMetricUpdateUDF(numTouchedRows) - }.asNondeterministic() - - withStatusCode( - "DELTA", rewritingFilesMsg(numFilesToRewrite)) { - val dfToWrite = if (shouldWriteCdc) { - import com.databricks.sql.transaction.tahoe.commands.cdc.CDCReader._ - // The logic here ends up being surprisingly elegant, with all source rows ending up in - // the output. Recall that we flipped the user-provided delete condition earlier, before the - // call to `rewriteFiles`. All rows which match this latest `filterCondition` are retained - // as table data, while all rows which don't match are removed from the rewritten table data - // but do get included in the output as CDC events. - baseData - .filter(numTouchedRowsUdf()) - .withColumn( - CDC_TYPE_COLUMN_NAME, - new Column( - If(filterCondition, typedLit[String](CDC_TYPE_NOT_CDC).expr, - lit(CDC_TYPE_DELETE).expr) - ) - ) - } else { - baseData - .filter(numTouchedRowsUdf()) - .filter(new Column(filterCondition)) - } - - txn.writeFiles(dfToWrite) - } - } -} - -object GpuDeleteCommand { - val FINDING_TOUCHED_FILES_MSG: String = "Finding files to rewrite for DELETE operation" - - def rewritingFilesMsg(numFilesToRewrite: Long): String = - s"Rewriting $numFilesToRewrite files for DELETE operation" -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaCatalog.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaCatalog.scala deleted file mode 100644 index 44b34a7ad18..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaCatalog.scala +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from DeltaDataSource.scala in the - * Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.databricks.sql.transaction.tahoe.rapids - -import com.databricks.sql.transaction.tahoe.{DeltaConfigs, DeltaErrors} -import com.databricks.sql.transaction.tahoe.commands.TableCreationModes -import com.databricks.sql.transaction.tahoe.sources.DeltaSourceUtils -import com.nvidia.spark.rapids.RapidsConf - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, SaveMode} -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.connector.catalog.StagingTableCatalog -import org.apache.spark.sql.execution.command.LeafRunnableCommand -import org.apache.spark.sql.execution.datasources.PartitioningUtils - -class GpuDeltaCatalog( - override val cpuCatalog: StagingTableCatalog, - override val rapidsConf: RapidsConf) - extends GpuDeltaCatalogBase with SupportsPathIdentifier with Logging { - - override protected def buildGpuCreateDeltaTableCommand( - rapidsConf: RapidsConf, - table: CatalogTable, - existingTableOpt: Option[CatalogTable], - mode: SaveMode, - query: Option[LogicalPlan], - operation: TableCreationModes.CreationMode, - tableByPath: Boolean): LeafRunnableCommand = { - GpuCreateDeltaTableCommand( - table, - existingTableOpt, - mode, - query, - operation, - tableByPath = tableByPath - )(rapidsConf) - } - - override protected def getExistingTableIfExists(table: TableIdentifier): Option[CatalogTable] = { - // If this is a path identifier, we cannot return an existing CatalogTable. The Create command - // will check the file system itself - if (isPathIdentifier(table)) return None - val tableExists = catalog.tableExists(table) - if (tableExists) { - val oldTable = catalog.getTableMetadata(table) - if (oldTable.tableType == CatalogTableType.VIEW) { - throw new AnalysisException( - s"$table is a view. You may not write data into a view.") - } - if (!DeltaSourceUtils.isDeltaTable(oldTable.provider)) { - throw new AnalysisException(s"$table is not a Delta table. 
Please drop this " + - "table first if you would like to recreate it with Delta Lake.") - } - Some(oldTable) - } else { - None - } - } - - override protected def verifyTableAndSolidify( - tableDesc: CatalogTable, - query: Option[LogicalPlan]): CatalogTable = { - - if (tableDesc.bucketSpec.isDefined) { - throw DeltaErrors.operationNotSupportedException("Bucketing", tableDesc.identifier) - } - - val schema = query.map { plan => - assert(tableDesc.schema.isEmpty, "Can't specify table schema in CTAS.") - plan.schema.asNullable - }.getOrElse(tableDesc.schema) - - PartitioningUtils.validatePartitionColumn( - schema, - tableDesc.partitionColumnNames, - caseSensitive = false) // Delta is case insensitive - - val validatedConfigurations = DeltaConfigs.validateConfigurations(tableDesc.properties) - - val db = tableDesc.identifier.database.getOrElse(catalog.getCurrentDatabase) - val tableIdentWithDB = tableDesc.identifier.copy(database = Some(db)) - tableDesc.copy( - identifier = tableIdentWithDB, - schema = schema, - properties = validatedConfigurations) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDoAutoCompaction.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDoAutoCompaction.scala deleted file mode 100644 index 84a86807c15..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDoAutoCompaction.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from DoAutoCompaction.scala - * from https://github.com/delta-io/delta/pull/1156 - * in the Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.databricks.sql.transaction.tahoe.rapids - -import com.databricks.sql.transaction.tahoe._ -import com.databricks.sql.transaction.tahoe.actions.Action -import com.databricks.sql.transaction.tahoe.hooks.PostCommitHook -import com.databricks.sql.transaction.tahoe.metering.DeltaLogging - -import org.apache.spark.sql.SparkSession - -object GpuDoAutoCompaction extends PostCommitHook - with DeltaLogging - with Serializable { - override val name: String = "Triggers compaction if necessary" - - override def run(spark: SparkSession, - txn: OptimisticTransactionImpl, - committedActions: Seq[Action]): Unit = { - val gpuTxn = txn.asInstanceOf[GpuOptimisticTransaction] - val newTxn = new GpuDeltaLog(gpuTxn.deltaLog, gpuTxn.rapidsConf).startTransaction() - // Note: The Databricks AutoCompact PostCommitHook cannot be used here - // (with a GpuOptimisticTransaction). It appears that AutoCompact creates a new transaction, - // thereby circumventing GpuOptimisticTransaction (which intercepts Parquet writes - // to go through the GPU). 
- new GpuOptimizeExecutor(spark, newTxn, Seq.empty, Seq.empty, committedActions).optimize() - } - - override def handleError(error: Throwable, version: Long): Unit = - throw DeltaErrors.postCommitHookFailedException(this, version, name, error) -} \ No newline at end of file diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuMergeIntoCommand.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuMergeIntoCommand.scala deleted file mode 100644 index bed8a457ceb..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuMergeIntoCommand.scala +++ /dev/null @@ -1,1170 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from MergeIntoCommand.scala - * in the Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.databricks.sql.transaction.tahoe.rapids - -import java.util.concurrent.TimeUnit - -import scala.collection.JavaConverters._ -import scala.collection.mutable - -import com.databricks.sql.transaction.tahoe._ -import com.databricks.sql.transaction.tahoe.actions.{AddCDCFile, AddFile, FileAction} -import com.databricks.sql.transaction.tahoe.commands.DeltaCommand -import com.databricks.sql.transaction.tahoe.schema.ImplicitMetadataOperation -import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf -import com.databricks.sql.transaction.tahoe.util.{AnalysisHelper, SetAccumulator} -import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import com.nvidia.spark.rapids.{BaseExprMeta, GpuOverrides, RapidsConf} -import com.nvidia.spark.rapids.delta._ - -import org.apache.spark.SparkContext -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BasePredicate, Expression, Literal, NamedExpression, PredicateHelper, UnsafeProjection} -import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate -import org.apache.spark.sql.catalyst.plans.logical.{DeltaMergeIntoClause, DeltaMergeIntoDeleteClause, DeltaMergeIntoInsertClause, DeltaMergeIntoMatchedClause, DeltaMergeIntoUpdateClause, LogicalPlan, Project} -import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.command.LeafRunnableCommand -import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{DataTypes, LongType, StringType, StructType} - -case class GpuMergeDataSizes( - @JsonDeserialize(contentAs = 
classOf[java.lang.Long]) - rows: Option[Long] = None, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - files: Option[Long] = None, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - bytes: Option[Long] = None, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - partitions: Option[Long] = None) - -/** - * Represents the state of a single merge clause: - * - merge clause's (optional) predicate - * - action type (insert, update, delete) - * - action's expressions - */ -case class GpuMergeClauseStats( - condition: Option[String], - actionType: String, - actionExpr: Seq[String]) - -object GpuMergeClauseStats { - def apply(mergeClause: DeltaMergeIntoClause): GpuMergeClauseStats = { - GpuMergeClauseStats( - condition = mergeClause.condition.map(_.sql), - mergeClause.clauseType.toLowerCase(), - actionExpr = mergeClause.actions.map(_.sql)) - } -} - -/** State for a GPU merge operation */ -case class GpuMergeStats( - // Merge condition expression - conditionExpr: String, - - // Expressions used in old MERGE stats, now always Null - updateConditionExpr: String, - updateExprs: Seq[String], - insertConditionExpr: String, - insertExprs: Seq[String], - deleteConditionExpr: String, - - // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED - matchedStats: Seq[GpuMergeClauseStats], - notMatchedStats: Seq[GpuMergeClauseStats], - - // Data sizes of source and target at different stages of processing - source: GpuMergeDataSizes, - targetBeforeSkipping: GpuMergeDataSizes, - targetAfterSkipping: GpuMergeDataSizes, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - sourceRowsInSecondScan: Option[Long], - - // Data change sizes - targetFilesRemoved: Long, - targetFilesAdded: Long, - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - targetChangeFilesAdded: Option[Long], - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - targetChangeFileBytes: Option[Long], - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - targetBytesRemoved: Option[Long], - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - targetBytesAdded: Option[Long], - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - targetPartitionsRemovedFrom: Option[Long], - @JsonDeserialize(contentAs = classOf[java.lang.Long]) - targetPartitionsAddedTo: Option[Long], - targetRowsCopied: Long, - targetRowsUpdated: Long, - targetRowsInserted: Long, - targetRowsDeleted: Long -) - -object GpuMergeStats { - - def fromMergeSQLMetrics( - metrics: Map[String, SQLMetric], - condition: Expression, - matchedClauses: Seq[DeltaMergeIntoMatchedClause], - notMatchedClauses: Seq[DeltaMergeIntoInsertClause], - isPartitioned: Boolean): GpuMergeStats = { - - def metricValueIfPartitioned(metricName: String): Option[Long] = { - if (isPartitioned) Some(metrics(metricName).value) else None - } - - GpuMergeStats( - // Merge condition expression - conditionExpr = condition.sql, - - // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED - matchedStats = matchedClauses.map(GpuMergeClauseStats(_)), - notMatchedStats = notMatchedClauses.map(GpuMergeClauseStats(_)), - - // Data sizes of source and target at different stages of processing - source = GpuMergeDataSizes(rows = Some(metrics("numSourceRows").value)), - targetBeforeSkipping = - GpuMergeDataSizes( - files = Some(metrics("numTargetFilesBeforeSkipping").value), - bytes = Some(metrics("numTargetBytesBeforeSkipping").value)), - targetAfterSkipping = - GpuMergeDataSizes( - files = Some(metrics("numTargetFilesAfterSkipping").value), - bytes = 
Some(metrics("numTargetBytesAfterSkipping").value), - partitions = metricValueIfPartitioned("numTargetPartitionsAfterSkipping")), - sourceRowsInSecondScan = - metrics.get("numSourceRowsInSecondScan").map(_.value).filter(_ >= 0), - - // Data change sizes - targetFilesAdded = metrics("numTargetFilesAdded").value, - targetChangeFilesAdded = metrics.get("numTargetChangeFilesAdded").map(_.value), - targetChangeFileBytes = metrics.get("numTargetChangeFileBytes").map(_.value), - targetFilesRemoved = metrics("numTargetFilesRemoved").value, - targetBytesAdded = Some(metrics("numTargetBytesAdded").value), - targetBytesRemoved = Some(metrics("numTargetBytesRemoved").value), - targetPartitionsRemovedFrom = metricValueIfPartitioned("numTargetPartitionsRemovedFrom"), - targetPartitionsAddedTo = metricValueIfPartitioned("numTargetPartitionsAddedTo"), - targetRowsCopied = metrics("numTargetRowsCopied").value, - targetRowsUpdated = metrics("numTargetRowsUpdated").value, - targetRowsInserted = metrics("numTargetRowsInserted").value, - targetRowsDeleted = metrics("numTargetRowsDeleted").value, - - // Deprecated fields - updateConditionExpr = null, - updateExprs = null, - insertConditionExpr = null, - insertExprs = null, - deleteConditionExpr = null) - } -} - -/** - * GPU version of Delta Lake's MergeIntoCommand. - * - * Performs a merge of a source query/table into a Delta table. - * - * Issues an error message when the ON search_condition of the MERGE statement can match - * a single row from the target table with multiple rows of the source table-reference. - * - * Algorithm: - * - * Phase 1: Find the input files in target that are touched by the rows that satisfy - * the condition and verify that no two source rows match with the same target row. - * This is implemented as an inner-join using the given condition. See [[findTouchedFiles]] - * for more details. - * - * Phase 2: Read the touched files again and write new files with updated and/or inserted rows. - * - * Phase 3: Use the Delta protocol to atomically remove the touched files and add the new files. - * - * @param source Source data to merge from - * @param target Target table to merge into - * @param gpuDeltaLog Delta log to use - * @param condition Condition for a source row to match with a target row - * @param matchedClauses All info related to matched clauses. - * @param notMatchedClauses All info related to not matched clause. - * @param migratedSchema The final schema of the target - may be changed by schema evolution. 
- */ -case class GpuMergeIntoCommand( - @transient source: LogicalPlan, - @transient target: LogicalPlan, - @transient gpuDeltaLog: GpuDeltaLog, - condition: Expression, - matchedClauses: Seq[DeltaMergeIntoMatchedClause], - notMatchedClauses: Seq[DeltaMergeIntoInsertClause], - migratedSchema: Option[StructType])( - @transient val rapidsConf: RapidsConf) - extends LeafRunnableCommand - with DeltaCommand with PredicateHelper with AnalysisHelper with ImplicitMetadataOperation { - - import GpuMergeIntoCommand._ - - import SQLMetrics._ - import com.databricks.sql.transaction.tahoe.commands.cdc.CDCReader._ - - override val otherCopyArgs: Seq[AnyRef] = Seq(rapidsConf) - - override val canMergeSchema: Boolean = conf.getConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE) - override val canOverwriteSchema: Boolean = false - - override val output: Seq[Attribute] = Seq( - AttributeReference("num_affected_rows", LongType)(), - AttributeReference("num_updated_rows", LongType)(), - AttributeReference("num_deleted_rows", LongType)(), - AttributeReference("num_inserted_rows", LongType)()) - - @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() - @transient private lazy val targetDeltaLog: DeltaLog = gpuDeltaLog.deltaLog - /** - * Map to get target output attributes by name. - * The case sensitivity of the map is set accordingly to Spark configuration. - */ - @transient private lazy val targetOutputAttributesMap: Map[String, Attribute] = { - val attrMap: Map[String, Attribute] = target - .outputSet.view - .map(attr => attr.name -> attr).toMap - if (conf.caseSensitiveAnalysis) { - attrMap - } else { - CaseInsensitiveMap(attrMap) - } - } - - /** Whether this merge statement has only a single insert (NOT MATCHED) clause. */ - private def isSingleInsertOnly: Boolean = matchedClauses.isEmpty && notMatchedClauses.length == 1 - /** Whether this merge statement has only MATCHED clauses. */ - private def isMatchedOnly: Boolean = notMatchedClauses.isEmpty && matchedClauses.nonEmpty - - // We over-count numTargetRowsDeleted when there are multiple matches; - // this is the amount of the overcount, so we can subtract it to get a correct final metric. 
- private var multipleMatchDeleteOnlyOvercount: Option[Long] = None - - override lazy val metrics = Map[String, SQLMetric]( - "numSourceRows" -> createMetric(sc, "number of source rows"), - "numSourceRowsInSecondScan" -> - createMetric(sc, "number of source rows (during repeated scan)"), - "numTargetRowsCopied" -> createMetric(sc, "number of target rows rewritten unmodified"), - "numTargetRowsInserted" -> createMetric(sc, "number of inserted rows"), - "numTargetRowsUpdated" -> createMetric(sc, "number of updated rows"), - "numTargetRowsDeleted" -> createMetric(sc, "number of deleted rows"), - "numTargetFilesBeforeSkipping" -> createMetric(sc, "number of target files before skipping"), - "numTargetFilesAfterSkipping" -> createMetric(sc, "number of target files after skipping"), - "numTargetFilesRemoved" -> createMetric(sc, "number of files removed to target"), - "numTargetFilesAdded" -> createMetric(sc, "number of files added to target"), - "numTargetChangeFilesAdded" -> - createMetric(sc, "number of change data capture files generated"), - "numTargetChangeFileBytes" -> - createMetric(sc, "total size of change data capture files generated"), - "numTargetBytesBeforeSkipping" -> createMetric(sc, "number of target bytes before skipping"), - "numTargetBytesAfterSkipping" -> createMetric(sc, "number of target bytes after skipping"), - "numTargetBytesRemoved" -> createMetric(sc, "number of target bytes removed"), - "numTargetBytesAdded" -> createMetric(sc, "number of target bytes added"), - "numTargetPartitionsAfterSkipping" -> - createMetric(sc, "number of target partitions after skipping"), - "numTargetPartitionsRemovedFrom" -> - createMetric(sc, "number of target partitions from which files were removed"), - "numTargetPartitionsAddedTo" -> - createMetric(sc, "number of target partitions to which files were added"), - "executionTimeMs" -> - createMetric(sc, "time taken to execute the entire operation"), - "scanTimeMs" -> - createMetric(sc, "time taken to scan the files for matches"), - "rewriteTimeMs" -> - createMetric(sc, "time taken to rewrite the matched files")) - - override def run(spark: SparkSession): Seq[Row] = { - recordDeltaOperation(targetDeltaLog, "delta.dml.merge") { - val startTime = System.nanoTime() - gpuDeltaLog.withNewTransaction { deltaTxn => - if (target.schema.size != deltaTxn.metadata.schema.size) { - throw DeltaErrors.schemaChangedSinceAnalysis( - atAnalysis = target.schema, latestSchema = deltaTxn.metadata.schema) - } - - if (canMergeSchema) { - updateMetadata( - spark, deltaTxn, migratedSchema.getOrElse(target.schema), - deltaTxn.metadata.partitionColumns, deltaTxn.metadata.configuration, - isOverwriteMode = false, rearrangeOnly = false) - } - - val deltaActions = { - if (isSingleInsertOnly && spark.conf.get(DeltaSQLConf.MERGE_INSERT_ONLY_ENABLED)) { - writeInsertsOnlyWhenNoMatchedClauses(spark, deltaTxn) - } else { - val filesToRewrite = findTouchedFiles(spark, deltaTxn) - val newWrittenFiles = withStatusCode("DELTA", "Writing merged data") { - writeAllChanges(spark, deltaTxn, filesToRewrite) - } - filesToRewrite.map(_.remove) ++ newWrittenFiles - } - } - - // Metrics should be recorded before commit (where they are written to delta logs). - metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) - deltaTxn.registerSQLMetrics(spark, metrics) - - // This is a best-effort sanity check. 
- if (metrics("numSourceRowsInSecondScan").value >= 0 && - metrics("numSourceRows").value != metrics("numSourceRowsInSecondScan").value) { - log.warn(s"Merge source has ${metrics("numSourceRows").value} rows in initial scan but " + - s"${metrics("numSourceRowsInSecondScan").value} rows in second scan") - if (conf.getConf(DeltaSQLConf.MERGE_FAIL_IF_SOURCE_CHANGED)) { - throw DeltaErrors.sourceNotDeterministicInMergeException(spark) - } - } - - deltaTxn.commit( - deltaActions, - DeltaOperations.Merge( - Option(condition), - matchedClauses.map(DeltaOperations.MergePredicate(_)), - notMatchedClauses.map(DeltaOperations.MergePredicate(_)))) - - // Record metrics - val stats = GpuMergeStats.fromMergeSQLMetrics( - metrics, condition, matchedClauses, notMatchedClauses, - deltaTxn.metadata.partitionColumns.nonEmpty) - recordDeltaEvent(targetDeltaLog, "delta.dml.merge.stats", data = stats) - - } - spark.sharedState.cacheManager.recacheByPlan(spark, target) - } - // This is needed to make the SQL metrics visible in the Spark UI. Also this needs - // to be outside the recordMergeOperation because this method will update some metric. - val executionId = spark.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates(spark.sparkContext, executionId, metrics.values.toSeq) - Seq(Row(metrics("numTargetRowsUpdated").value + metrics("numTargetRowsDeleted").value + - metrics("numTargetRowsInserted").value, metrics("numTargetRowsUpdated").value, - metrics("numTargetRowsDeleted").value, metrics("numTargetRowsInserted").value)) - } - - /** - * Find the target table files that contain the rows that satisfy the merge condition. This is - * implemented as an inner-join between the source query/table and the target table using - * the merge condition. 
- */ - private def findTouchedFiles( - spark: SparkSession, - deltaTxn: OptimisticTransaction - ): Seq[AddFile] = recordMergeOperation(sqlMetricName = "scanTimeMs") { - - // Accumulator to collect all the distinct touched files - val touchedFilesAccum = new SetAccumulator[String]() - spark.sparkContext.register(touchedFilesAccum, TOUCHED_FILES_ACCUM_NAME) - - // UDFs to records touched files names and add them to the accumulator - val recordTouchedFileName = udf(new GpuDeltaRecordTouchedFileNameUDF(touchedFilesAccum)) - .asNondeterministic() - - // Skip data based on the merge condition - val targetOnlyPredicates = - splitConjunctivePredicates(condition).filter(_.references.subsetOf(target.outputSet)) - val dataSkippedFiles = deltaTxn.filterFiles(targetOnlyPredicates) - - // UDF to increment metrics - val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRows") - val sourceDF = Dataset.ofRows(spark, source) - .filter(new Column(incrSourceRowCountExpr)) - - // Apply inner join to between source and target using the merge condition to find matches - // In addition, we attach two columns - // - a monotonically increasing row id for target rows to later identify whether the same - // target row is modified by multiple user or not - // - the target file name the row is from to later identify the files touched by matched rows - val targetDF = Dataset.ofRows(spark, buildTargetPlanWithFiles(deltaTxn, dataSkippedFiles)) - .withColumn(ROW_ID_COL, monotonically_increasing_id()) - .withColumn(FILE_NAME_COL, input_file_name()) - val joinToFindTouchedFiles = sourceDF.join(targetDF, new Column(condition), "inner") - - // Process the matches from the inner join to record touched files and find multiple matches - val collectTouchedFiles = joinToFindTouchedFiles - .select(col(ROW_ID_COL), recordTouchedFileName(col(FILE_NAME_COL)).as("one")) - - // Calculate frequency of matches per source row - val matchedRowCounts = collectTouchedFiles.groupBy(ROW_ID_COL).agg(sum("one").as("count")) - - // Get multiple matches and simultaneously collect (using touchedFilesAccum) the file names - // multipleMatchCount = # of target rows with more than 1 matching source row (duplicate match) - // multipleMatchSum = total # of duplicate matched rows - import spark.implicits._ - val (multipleMatchCount, multipleMatchSum) = matchedRowCounts - .filter("count > 1") - .select(coalesce(count("*"), lit(0)), coalesce(sum("count"), lit(0))) - .as[(Long, Long)] - .collect() - .head - - val hasMultipleMatches = multipleMatchCount > 0 - - // Throw error if multiple matches are ambiguous or cannot be computed correctly. - val canBeComputedUnambiguously = { - // Multiple matches are not ambiguous when there is only one unconditional delete as - // all the matched row pairs in the 2nd join in `writeAllChanges` will get deleted. - val isUnconditionalDelete = matchedClauses.headOption match { - case Some(DeltaMergeIntoDeleteClause(None)) => true - case _ => false - } - matchedClauses.size == 1 && isUnconditionalDelete - } - - if (hasMultipleMatches && !canBeComputedUnambiguously) { - throw DeltaErrors.multipleSourceRowMatchingTargetRowInMergeException(spark) - } - - if (hasMultipleMatches) { - // This is only allowed for delete-only queries. - // This query will count the duplicates for numTargetRowsDeleted in Job 2, - // because we count matches after the join and not just the target rows. - // We have to compensate for this by subtracting the duplicates later, - // so we need to record them here. 
- val duplicateCount = multipleMatchSum - multipleMatchCount - multipleMatchDeleteOnlyOvercount = Some(duplicateCount) - } - - // Get the AddFiles using the touched file names. - val touchedFileNames = touchedFilesAccum.value.iterator().asScala.toSeq - logTrace(s"findTouchedFiles: matched files:\n\t${touchedFileNames.mkString("\n\t")}") - - val nameToAddFileMap = generateCandidateFileMap(targetDeltaLog.dataPath, dataSkippedFiles) - val touchedAddFiles = touchedFileNames.map(f => - getTouchedFile(targetDeltaLog.dataPath, f, nameToAddFileMap)) - - // When the target table is empty, and the optimizer optimized away the join entirely - // numSourceRows will be incorrectly 0. We need to scan the source table once to get the correct - // metric here. - if (metrics("numSourceRows").value == 0 && - (dataSkippedFiles.isEmpty || targetDF.take(1).isEmpty)) { - val numSourceRows = sourceDF.count() - metrics("numSourceRows").set(numSourceRows) - } - - // Update metrics - metrics("numTargetFilesBeforeSkipping") += deltaTxn.snapshot.numOfFiles - metrics("numTargetBytesBeforeSkipping") += deltaTxn.snapshot.sizeInBytes - val (afterSkippingBytes, afterSkippingPartitions) = - totalBytesAndDistinctPartitionValues(dataSkippedFiles) - metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size - metrics("numTargetBytesAfterSkipping") += afterSkippingBytes - metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions - val (removedBytes, removedPartitions) = totalBytesAndDistinctPartitionValues(touchedAddFiles) - metrics("numTargetFilesRemoved") += touchedAddFiles.size - metrics("numTargetBytesRemoved") += removedBytes - metrics("numTargetPartitionsRemovedFrom") += removedPartitions - touchedAddFiles - } - - /** - * This is an optimization of the case when there is no update clause for the merge. - * We perform an left anti join on the source data to find the rows to be inserted. - * - * This will currently only optimize for the case when there is a _single_ notMatchedClause. 
- */ - private def writeInsertsOnlyWhenNoMatchedClauses( - spark: SparkSession, - deltaTxn: OptimisticTransaction - ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { - - // UDFs to update metrics - val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRows") - val incrInsertedCountExpr = makeMetricUpdateUDF("numTargetRowsInserted") - - val outputColNames = getTargetOutputCols(deltaTxn).map(_.name) - // we use head here since we know there is only a single notMatchedClause - val outputExprs = notMatchedClauses.head.resolvedActions.map(_.expr) - val outputCols = outputExprs.zip(outputColNames).map { case (expr, name) => - new Column(Alias(expr, name)()) - } - - // source DataFrame - val sourceDF = Dataset.ofRows(spark, source) - .filter(new Column(incrSourceRowCountExpr)) - .filter(new Column(notMatchedClauses.head.condition.getOrElse(Literal.TrueLiteral))) - - // Skip data based on the merge condition - val conjunctivePredicates = splitConjunctivePredicates(condition) - val targetOnlyPredicates = - conjunctivePredicates.filter(_.references.subsetOf(target.outputSet)) - val dataSkippedFiles = deltaTxn.filterFiles(targetOnlyPredicates) - - // target DataFrame - val targetDF = Dataset.ofRows( - spark, buildTargetPlanWithFiles(deltaTxn, dataSkippedFiles)) - - val insertDf = sourceDF.join(targetDF, new Column(condition), "leftanti") - .select(outputCols: _*) - .filter(new Column(incrInsertedCountExpr)) - - val newFiles = deltaTxn - .writeFiles(repartitionIfNeeded(spark, insertDf, deltaTxn.metadata.partitionColumns)) - - // Update metrics - metrics("numTargetFilesBeforeSkipping") += deltaTxn.snapshot.numOfFiles - metrics("numTargetBytesBeforeSkipping") += deltaTxn.snapshot.sizeInBytes - val (afterSkippingBytes, afterSkippingPartitions) = - totalBytesAndDistinctPartitionValues(dataSkippedFiles) - metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size - metrics("numTargetBytesAfterSkipping") += afterSkippingBytes - metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions - metrics("numTargetFilesRemoved") += 0 - metrics("numTargetBytesRemoved") += 0 - metrics("numTargetPartitionsRemovedFrom") += 0 - val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) - metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) - metrics("numTargetBytesAdded") += addedBytes - metrics("numTargetPartitionsAddedTo") += addedPartitions - newFiles - } - - /** - * Write new files by reading the touched files and updating/inserting data using the source - * query/table. This is implemented using a full|right-outer-join using the merge condition. - * - * Note that unlike the insert-only code paths with just one control column INCR_ROW_COUNT_COL, - * this method has two additional control columns ROW_DROPPED_COL for dropping deleted rows and - * CDC_TYPE_COL_NAME used for handling CDC when enabled. 
- */ - private def writeAllChanges( - spark: SparkSession, - deltaTxn: OptimisticTransaction, - filesToRewrite: Seq[AddFile] - ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { - import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} - - val cdcEnabled = DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(deltaTxn.metadata) - - var targetOutputCols = getTargetOutputCols(deltaTxn) - var outputRowSchema = deltaTxn.metadata.schema - - // When we have duplicate matches (only allowed when the whenMatchedCondition is a delete with - // no match condition) we will incorrectly generate duplicate CDC rows. - // Duplicate matches can be due to: - // - Duplicate rows in the source w.r.t. the merge condition - // - A target-only or source-only merge condition, which essentially turns our join into a cross - // join with the target/source satisfiying the merge condition. - // These duplicate matches are dropped from the main data output since this is a delete - // operation, but the duplicate CDC rows are not removed by default. - // See https://github.com/delta-io/delta/issues/1274 - - // We address this specific scenario by adding row ids to the target before performing our join. - // There should only be one CDC delete row per target row so we can use these row ids to dedupe - // the duplicate CDC delete rows. - - // We also need to address the scenario when there are duplicate matches with delete and we - // insert duplicate rows. Here we need to additionally add row ids to the source before the - // join to avoid dropping these valid duplicate inserted rows and their corresponding cdc rows. - - // When there is an insert clause, we set SOURCE_ROW_ID_COL=null for all delete rows because we - // need to drop the duplicate matches. - val isDeleteWithDuplicateMatchesAndCdc = multipleMatchDeleteOnlyOvercount.nonEmpty && cdcEnabled - - // Generate a new logical plan that has same output attributes exprIds as the target plan. - // This allows us to apply the existing resolved update/insert expressions. - val newTarget = buildTargetPlanWithFiles(deltaTxn, filesToRewrite) - val joinType = if (isMatchedOnly && - spark.conf.get(DeltaSQLConf.MERGE_MATCHED_ONLY_ENABLED)) { - "rightOuter" - } else { - "fullOuter" - } - - logDebug(s"""writeAllChanges using $joinType join: - | source.output: ${source.outputSet} - | target.output: ${target.outputSet} - | condition: $condition - | newTarget.output: ${newTarget.outputSet} - """.stripMargin) - - // UDFs to update metrics - // Make UDFs that appear in the custom join processor node deterministic, as they always - // return true and update a metric. Catalyst precludes non-deterministic UDFs that are not - // allowed outside a very specific set of Catalyst nodes (Project, Filter, Window, Aggregate). - val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRowsInSecondScan") - val incrUpdatedCountExpr = makeMetricUpdateUDF("numTargetRowsUpdated", deterministic = true) - val incrInsertedCountExpr = makeMetricUpdateUDF("numTargetRowsInserted", deterministic = true) - val incrNoopCountExpr = makeMetricUpdateUDF("numTargetRowsCopied", deterministic = true) - val incrDeletedCountExpr = makeMetricUpdateUDF("numTargetRowsDeleted", deterministic = true) - - // Apply an outer join to find both, matches and non-matches. We are adding two boolean fields - // with value `true`, one to each side of the join. 
Whether this field is null or not after - // the outer join, will allow us to identify whether the resultant joined row was a - // matched inner result or an unmatched result with null on one side. - // We add row IDs to the targetDF if we have a delete-when-matched clause with duplicate - // matches and CDC is enabled, and additionally add row IDs to the source if we also have an - // insert clause. See above at isDeleteWithDuplicateMatchesAndCdc definition for more details. - var sourceDF = Dataset.ofRows(spark, source) - .withColumn(SOURCE_ROW_PRESENT_COL, new Column(incrSourceRowCountExpr)) - var targetDF = Dataset.ofRows(spark, newTarget) - .withColumn(TARGET_ROW_PRESENT_COL, lit(true)) - if (isDeleteWithDuplicateMatchesAndCdc) { - targetDF = targetDF.withColumn(TARGET_ROW_ID_COL, monotonically_increasing_id()) - if (notMatchedClauses.nonEmpty) { // insert clause - sourceDF = sourceDF.withColumn(SOURCE_ROW_ID_COL, monotonically_increasing_id()) - } - } - val joinedDF = sourceDF.join(targetDF, new Column(condition), joinType) - val joinedPlan = joinedDF.queryExecution.analyzed - - def resolveOnJoinedPlan(exprs: Seq[Expression]): Seq[Expression] = { - tryResolveReferencesForExpressions(spark, exprs, joinedPlan) - } - - // ==== Generate the expressions to process full-outer join output and generate target rows ==== - // If there are N columns in the target table, there will be N + 3 columns after processing - // - N columns for target table - // - ROW_DROPPED_COL to define whether the generated row should dropped or written - // - INCR_ROW_COUNT_COL containing a UDF to update the output row row counter - // - CDC_TYPE_COLUMN_NAME containing the type of change being performed in a particular row - - // To generate these N + 3 columns, we will generate N + 3 expressions and apply them to the - // rows in the joinedDF. The CDC column will be either used for CDC generation or dropped before - // performing the final write, and the other two will always be dropped after executing the - // metrics UDF and filtering on ROW_DROPPED_COL. - - // We produce rows for both the main table data (with CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC), - // and rows for the CDC data which will be output to CDCReader.CDC_LOCATION. - // See [[CDCReader]] for general details on how partitioning on the CDC type column works. - - // In the following two functions `matchedClauseOutput` and `notMatchedClauseOutput`, we - // produce a Seq[Expression] for each intended output row. - // Depending on the clause and whether CDC is enabled, we output between 0 and 3 rows, as a - // Seq[Seq[Expression]] - - // There is one corner case outlined above at isDeleteWithDuplicateMatchesAndCdc definition. - // When we have a delete-ONLY merge with duplicate matches we have N + 4 columns: - // N target cols, TARGET_ROW_ID_COL, ROW_DROPPED_COL, INCR_ROW_COUNT_COL, CDC_TYPE_COLUMN_NAME - // When we have a delete-when-matched merge with duplicate matches + an insert clause, we have - // N + 5 columns: - // N target cols, TARGET_ROW_ID_COL, SOURCE_ROW_ID_COL, ROW_DROPPED_COL, INCR_ROW_COUNT_COL, - // CDC_TYPE_COLUMN_NAME - // These ROW_ID_COL will always be dropped before the final write. 
- - if (isDeleteWithDuplicateMatchesAndCdc) { - targetOutputCols = targetOutputCols :+ UnresolvedAttribute(TARGET_ROW_ID_COL) - outputRowSchema = outputRowSchema.add(TARGET_ROW_ID_COL, DataTypes.LongType) - if (notMatchedClauses.nonEmpty) { // there is an insert clause, make SRC_ROW_ID_COL=null - targetOutputCols = targetOutputCols :+ Alias(Literal(null), SOURCE_ROW_ID_COL)() - outputRowSchema = outputRowSchema.add(SOURCE_ROW_ID_COL, DataTypes.LongType) - } - } - - if (cdcEnabled) { - outputRowSchema = outputRowSchema - .add(ROW_DROPPED_COL, DataTypes.BooleanType) - .add(INCR_ROW_COUNT_COL, DataTypes.BooleanType) - .add(CDC_TYPE_COLUMN_NAME, DataTypes.StringType) - } - - def matchedClauseOutput(clause: DeltaMergeIntoMatchedClause): Seq[Seq[Expression]] = { - val exprs = clause match { - case u: DeltaMergeIntoUpdateClause => - // Generate update expressions and set ROW_DELETED_COL = false and - // CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC - val mainDataOutput = u.resolvedActions.map(_.expr) :+ FalseLiteral :+ - incrUpdatedCountExpr :+ CDC_TYPE_NOT_CDC_LITERAL - if (cdcEnabled) { - // For update preimage, we have do a no-op copy with ROW_DELETED_COL = false and - // CDC_TYPE_COLUMN_NAME = CDC_TYPE_UPDATE_PREIMAGE and INCR_ROW_COUNT_COL as a no-op - // (because the metric will be incremented in `mainDataOutput`) - val preImageOutput = targetOutputCols :+ FalseLiteral :+ TrueLiteral :+ - Literal(CDC_TYPE_UPDATE_PREIMAGE) - // For update postimage, we have the same expressions as for mainDataOutput but with - // INCR_ROW_COUNT_COL as a no-op (because the metric will be incremented in - // `mainDataOutput`), and CDC_TYPE_COLUMN_NAME = CDC_TYPE_UPDATE_POSTIMAGE - val postImageOutput = mainDataOutput.dropRight(2) :+ TrueLiteral :+ - Literal(CDC_TYPE_UPDATE_POSTIMAGE) - Seq(mainDataOutput, preImageOutput, postImageOutput) - } else { - Seq(mainDataOutput) - } - case _: DeltaMergeIntoDeleteClause => - // Generate expressions to set the ROW_DELETED_COL = true and CDC_TYPE_COLUMN_NAME = - // CDC_TYPE_NOT_CDC - val mainDataOutput = targetOutputCols :+ TrueLiteral :+ incrDeletedCountExpr :+ - CDC_TYPE_NOT_CDC_LITERAL - if (cdcEnabled) { - // For delete we do a no-op copy with ROW_DELETED_COL = false, INCR_ROW_COUNT_COL as a - // no-op (because the metric will be incremented in `mainDataOutput`) and - // CDC_TYPE_COLUMN_NAME = CDC_TYPE_DELETE - val deleteCdcOutput = targetOutputCols :+ FalseLiteral :+ TrueLiteral :+ - Literal(CDC_TYPE_DELETE) - Seq(mainDataOutput, deleteCdcOutput) - } else { - Seq(mainDataOutput) - } - } - exprs.map(resolveOnJoinedPlan) - } - - def notMatchedClauseOutput(clause: DeltaMergeIntoInsertClause): Seq[Seq[Expression]] = { - // Generate insert expressions and set ROW_DELETED_COL = false and - // CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC - val insertExprs = clause.resolvedActions.map(_.expr) - val mainDataOutput = resolveOnJoinedPlan( - if (isDeleteWithDuplicateMatchesAndCdc) { - // Must be delete-when-matched merge with duplicate matches + insert clause - // Therefore we must keep the target row id and source row id. Since this is a not-matched - // clause we know the target row-id will be null. See above at - // isDeleteWithDuplicateMatchesAndCdc definition for more details. 
- insertExprs :+ - Alias(Literal(null), TARGET_ROW_ID_COL)() :+ UnresolvedAttribute(SOURCE_ROW_ID_COL) :+ - FalseLiteral :+ incrInsertedCountExpr :+ CDC_TYPE_NOT_CDC_LITERAL - } else { - insertExprs :+ FalseLiteral :+ incrInsertedCountExpr :+ CDC_TYPE_NOT_CDC_LITERAL - } - ) - if (cdcEnabled) { - // For insert we have the same expressions as for mainDataOutput, but with - // INCR_ROW_COUNT_COL as a no-op (because the metric will be incremented in - // `mainDataOutput`), and CDC_TYPE_COLUMN_NAME = CDC_TYPE_INSERT - val insertCdcOutput = mainDataOutput.dropRight(2) :+ TrueLiteral :+ Literal(CDC_TYPE_INSERT) - Seq(mainDataOutput, insertCdcOutput) - } else { - Seq(mainDataOutput) - } - } - - def clauseCondition(clause: DeltaMergeIntoClause): Expression = { - // if condition is None, then expression always evaluates to true - val condExpr = clause.condition.getOrElse(TrueLiteral) - resolveOnJoinedPlan(Seq(condExpr)).head - } - - val targetRowHasNoMatch = resolveOnJoinedPlan(Seq(col(SOURCE_ROW_PRESENT_COL).isNull.expr)).head - val sourceRowHasNoMatch = resolveOnJoinedPlan(Seq(col(TARGET_ROW_PRESENT_COL).isNull.expr)).head - val matchedConditions = matchedClauses.map(clauseCondition) - val matchedOutputs = matchedClauses.map(matchedClauseOutput) - val notMatchedConditions = notMatchedClauses.map(clauseCondition) - val notMatchedOutputs = notMatchedClauses.map(notMatchedClauseOutput) - val noopCopyOutput = - resolveOnJoinedPlan(targetOutputCols :+ FalseLiteral :+ incrNoopCountExpr :+ - CDC_TYPE_NOT_CDC_LITERAL) - val deleteRowOutput = - resolveOnJoinedPlan(targetOutputCols :+ TrueLiteral :+ TrueLiteral :+ - CDC_TYPE_NOT_CDC_LITERAL) - var outputDF = addMergeJoinProcessor(spark, joinedPlan, outputRowSchema, - targetRowHasNoMatch = targetRowHasNoMatch, - sourceRowHasNoMatch = sourceRowHasNoMatch, - matchedConditions = matchedConditions, - matchedOutputs = matchedOutputs, - notMatchedConditions = notMatchedConditions, - notMatchedOutputs = notMatchedOutputs, - noopCopyOutput = noopCopyOutput, - deleteRowOutput = deleteRowOutput) - - if (isDeleteWithDuplicateMatchesAndCdc) { - // When we have a delete when matched clause with duplicate matches we have to remove - // duplicate CDC rows. This scenario is further explained at - // isDeleteWithDuplicateMatchesAndCdc definition. - - // To remove duplicate CDC rows generated by the duplicate matches we dedupe by - // TARGET_ROW_ID_COL since there should only be one CDC delete row per target row. - // When there is an insert clause in addition to the delete clause we additionally dedupe by - // SOURCE_ROW_ID_COL and CDC_TYPE_COLUMN_NAME to avoid dropping valid duplicate inserted rows - // and their corresponding CDC rows. 
- val columnsToDedupeBy = if (notMatchedClauses.nonEmpty) { // insert clause - Seq(TARGET_ROW_ID_COL, SOURCE_ROW_ID_COL, CDC_TYPE_COLUMN_NAME) - } else { - Seq(TARGET_ROW_ID_COL) - } - outputDF = outputDF - .dropDuplicates(columnsToDedupeBy) - .drop(ROW_DROPPED_COL, INCR_ROW_COUNT_COL, TARGET_ROW_ID_COL, SOURCE_ROW_ID_COL) - } else { - outputDF = outputDF.drop(ROW_DROPPED_COL, INCR_ROW_COUNT_COL) - } - - logDebug("writeAllChanges: join output plan:\n" + outputDF.queryExecution) - - // Write to Delta - val newFiles = deltaTxn - .writeFiles(repartitionIfNeeded(spark, outputDF, deltaTxn.metadata.partitionColumns)) - - // Update metrics - val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) - metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) - metrics("numTargetChangeFilesAdded") += newFiles.count(_.isInstanceOf[AddCDCFile]) - metrics("numTargetChangeFileBytes") += newFiles.collect{ case f: AddCDCFile => f.size }.sum - metrics("numTargetBytesAdded") += addedBytes - metrics("numTargetPartitionsAddedTo") += addedPartitions - if (multipleMatchDeleteOnlyOvercount.isDefined) { - // Compensate for counting duplicates during the query. - val actualRowsDeleted = - metrics("numTargetRowsDeleted").value - multipleMatchDeleteOnlyOvercount.get - assert(actualRowsDeleted >= 0) - metrics("numTargetRowsDeleted").set(actualRowsDeleted) - } - - newFiles - } - - private def addMergeJoinProcessor( - spark: SparkSession, - joinedPlan: LogicalPlan, - outputRowSchema: StructType, - targetRowHasNoMatch: Expression, - sourceRowHasNoMatch: Expression, - matchedConditions: Seq[Expression], - matchedOutputs: Seq[Seq[Seq[Expression]]], - notMatchedConditions: Seq[Expression], - notMatchedOutputs: Seq[Seq[Seq[Expression]]], - noopCopyOutput: Seq[Expression], - deleteRowOutput: Seq[Expression]): Dataset[Row] = { - def wrap(e: Expression): BaseExprMeta[Expression] = { - GpuOverrides.wrapExpr(e, rapidsConf, None) - } - - val targetRowHasNoMatchMeta = wrap(targetRowHasNoMatch) - val sourceRowHasNoMatchMeta = wrap(sourceRowHasNoMatch) - val matchedConditionsMetas = matchedConditions.map(wrap) - val matchedOutputsMetas = matchedOutputs.map(_.map(_.map(wrap))) - val notMatchedConditionsMetas = notMatchedConditions.map(wrap) - val notMatchedOutputsMetas = notMatchedOutputs.map(_.map(_.map(wrap))) - val noopCopyOutputMetas = noopCopyOutput.map(wrap) - val deleteRowOutputMetas = deleteRowOutput.map(wrap) - val allMetas = Seq(targetRowHasNoMatchMeta, sourceRowHasNoMatchMeta) ++ - matchedConditionsMetas ++ matchedOutputsMetas.flatten.flatten ++ - notMatchedConditionsMetas ++ notMatchedOutputsMetas.flatten.flatten ++ - noopCopyOutputMetas ++ deleteRowOutputMetas - allMetas.foreach(_.tagForGpu()) - val canReplace = allMetas.forall(_.canExprTreeBeReplaced) && rapidsConf.isOperatorEnabled( - "spark.rapids.sql.exec.RapidsProcessDeltaMergeJoinExec", false, false) - if (rapidsConf.shouldExplainAll || (rapidsConf.shouldExplain && !canReplace)) { - val exprExplains = allMetas.map(_.explain(rapidsConf.shouldExplainAll)) - val execWorkInfo = if (canReplace) { - "will run on GPU" - } else { - "cannot run on GPU because not all merge processing expressions can be replaced" - } - logWarning(s" $execWorkInfo:\n" + - s" ${exprExplains.mkString(" ")}") - } - - if (canReplace) { - val processedJoinPlan = RapidsProcessDeltaMergeJoin( - joinedPlan, - outputRowSchema.toAttributes, - targetRowHasNoMatch = targetRowHasNoMatch, - sourceRowHasNoMatch = sourceRowHasNoMatch, - matchedConditions = 
matchedConditions, - matchedOutputs = matchedOutputs, - notMatchedConditions = notMatchedConditions, - notMatchedOutputs = notMatchedOutputs, - notMatchedBySourceConditions = Seq.empty, - notMatchedBySourceOutputs = Seq.empty, - noopCopyOutput = noopCopyOutput, - deleteRowOutput = deleteRowOutput) - Dataset.ofRows(spark, processedJoinPlan) - } else { - val joinedRowEncoder = RowEncoder(joinedPlan.schema) - val outputRowEncoder = RowEncoder(outputRowSchema).resolveAndBind() - - val processor = new JoinedRowProcessor( - targetRowHasNoMatch = targetRowHasNoMatch, - sourceRowHasNoMatch = sourceRowHasNoMatch, - matchedConditions = matchedConditions, - matchedOutputs = matchedOutputs, - notMatchedConditions = notMatchedConditions, - notMatchedOutputs = notMatchedOutputs, - noopCopyOutput = noopCopyOutput, - deleteRowOutput = deleteRowOutput, - joinedAttributes = joinedPlan.output, - joinedRowEncoder = joinedRowEncoder, - outputRowEncoder = outputRowEncoder) - - Dataset.ofRows(spark, joinedPlan).mapPartitions(processor.processPartition)(outputRowEncoder) - } - } - - /** - * Build a new logical plan using the given `files` that has the same output columns (exprIds) - * as the `target` logical plan, so that existing update/insert expressions can be applied - * on this new plan. - */ - private def buildTargetPlanWithFiles( - deltaTxn: OptimisticTransaction, - files: Seq[AddFile]): LogicalPlan = { - val targetOutputCols = getTargetOutputCols(deltaTxn) - val targetOutputColsMap = { - val colsMap: Map[String, NamedExpression] = targetOutputCols.view - .map(col => col.name -> col).toMap - if (conf.caseSensitiveAnalysis) { - colsMap - } else { - CaseInsensitiveMap(colsMap) - } - } - - val plan = { - // We have to do surgery to use the attributes from `targetOutputCols` to scan the table. - // In cases of schema evolution, they may not be the same type as the original attributes. - val original = - deltaTxn.deltaLog.createDataFrame(deltaTxn.snapshot, files).queryExecution.analyzed - val transformed = original.transform { - case LogicalRelation(base, _, catalogTbl, isStreaming) => - LogicalRelation( - base, - // We can ignore the new columns which aren't yet AttributeReferences. - targetOutputCols.collect { case a: AttributeReference => a }, - catalogTbl, - isStreaming) - } - - // In case of schema evolution & column mapping, we would also need to rebuild the file format - // because under column mapping, the reference schema within DeltaParquetFileFormat - // that is used to populate metadata needs to be updated - if (deltaTxn.metadata.columnMappingMode != NoMapping) { - val updatedFileFormat = deltaTxn.deltaLog.fileFormat(deltaTxn.metadata) - DeltaTableUtils.replaceFileFormat(transformed, updatedFileFormat) - } else { - transformed - } - } - - // For each plan output column, find the corresponding target output column (by name) and - // create an alias - val aliases = plan.output.map { - case newAttrib: AttributeReference => - val existingTargetAttrib = targetOutputColsMap.get(newAttrib.name) - .getOrElse { - throw new AnalysisException( - s"Could not find ${newAttrib.name} among the existing target output " + - targetOutputCols.mkString(",")) - }.asInstanceOf[AttributeReference] - - if (existingTargetAttrib.exprId == newAttrib.exprId) { - // It's not valid to alias an expression to its own exprId (this is considered a - // non-unique exprId by the analyzer), so we just use the attribute directly. 
- newAttrib - } else { - Alias(newAttrib, existingTargetAttrib.name)(exprId = existingTargetAttrib.exprId) - } - } - - Project(aliases, plan) - } - - /** Expressions to increment SQL metrics */ - private def makeMetricUpdateUDF(name: String, deterministic: Boolean = false): Expression = { - // only capture the needed metric in a local variable - val metric = metrics(name) - var u = udf(new GpuDeltaMetricUpdateUDF(metric)) - if (!deterministic) { - u = u.asNondeterministic() - } - u.apply().expr - } - - private def getTargetOutputCols(txn: OptimisticTransaction): Seq[NamedExpression] = { - txn.metadata.schema.map { col => - targetOutputAttributesMap - .get(col.name) - .map { a => - AttributeReference(col.name, col.dataType, col.nullable)(a.exprId) - } - .getOrElse(Alias(Literal(null), col.name)() - ) - } - } - - /** - * Repartitions the output DataFrame by the partition columns if table is partitioned - * and `merge.repartitionBeforeWrite.enabled` is set to true. - */ - protected def repartitionIfNeeded( - spark: SparkSession, - df: DataFrame, - partitionColumns: Seq[String]): DataFrame = { - if (partitionColumns.nonEmpty && spark.conf.get(DeltaSQLConf.MERGE_REPARTITION_BEFORE_WRITE)) { - df.repartition(partitionColumns.map(col): _*) - } else { - df - } - } - - /** - * Execute the given `thunk` and return its result while recording the time taken to do it. - * - * @param sqlMetricName name of SQL metric to update with the time taken by the thunk - * @param thunk the code to execute - */ - private def recordMergeOperation[A](sqlMetricName: String)(thunk: => A): A = { - val startTimeNs = System.nanoTime() - val r = thunk - val timeTakenMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs) - if (sqlMetricName != null && timeTakenMs > 0) { - metrics(sqlMetricName) += timeTakenMs - } - r - } -} - -object GpuMergeIntoCommand { - /** - * Spark UI will track all normal accumulators along with Spark tasks to show them on Web UI. - * However, the accumulator used by `MergeIntoCommand` can store a very large value since it - * tracks all files that need to be rewritten. We should ask Spark UI to not remember it, - * otherwise, the UI data may consume lots of memory. Hence, we use the prefix `internal.metrics.` - * to make this accumulator become an internal accumulator, so that it will not be tracked by - * Spark UI. - */ - val TOUCHED_FILES_ACCUM_NAME = "internal.metrics.MergeIntoDelta.touchedFiles" - - val ROW_ID_COL = "_row_id_" - val TARGET_ROW_ID_COL = "_target_row_id_" - val SOURCE_ROW_ID_COL = "_source_row_id_" - val FILE_NAME_COL = "_file_name_" - val SOURCE_ROW_PRESENT_COL = "_source_row_present_" - val TARGET_ROW_PRESENT_COL = "_target_row_present_" - val ROW_DROPPED_COL = GpuDeltaMergeConstants.ROW_DROPPED_COL - val INCR_ROW_COUNT_COL = "_incr_row_count_" - - // Some Delta versions use Literal(null) which translates to a literal of NullType instead - // of the Literal(null, StringType) which is needed, so using a fixed version here - // rather than the version from Delta Lake. - val CDC_TYPE_NOT_CDC_LITERAL = Literal(null, StringType) - - /** - * @param targetRowHasNoMatch whether a joined row is a target row with no match in the source - * table - * @param sourceRowHasNoMatch whether a joined row is a source row with no match in the target - * table - * @param matchedConditions condition for each match clause - * @param matchedOutputs corresponding output for each match clause. 
for each clause, we - * have 1-3 output rows, each of which is a sequence of expressions - * to apply to the joined row - * @param notMatchedConditions condition for each not-matched clause - * @param notMatchedOutputs corresponding output for each not-matched clause. for each clause, - * we have 1-2 output rows, each of which is a sequence of - * expressions to apply to the joined row - * @param noopCopyOutput no-op expression to copy a target row to the output - * @param deleteRowOutput expression to drop a row from the final output. this is used for - * source rows that don't match any not-matched clauses - * @param joinedAttributes schema of our outer-joined dataframe - * @param joinedRowEncoder joinedDF row encoder - * @param outputRowEncoder final output row encoder - */ - class JoinedRowProcessor( - targetRowHasNoMatch: Expression, - sourceRowHasNoMatch: Expression, - matchedConditions: Seq[Expression], - matchedOutputs: Seq[Seq[Seq[Expression]]], - notMatchedConditions: Seq[Expression], - notMatchedOutputs: Seq[Seq[Seq[Expression]]], - noopCopyOutput: Seq[Expression], - deleteRowOutput: Seq[Expression], - joinedAttributes: Seq[Attribute], - joinedRowEncoder: ExpressionEncoder[Row], - outputRowEncoder: ExpressionEncoder[Row]) extends Serializable { - - private def generateProjection(exprs: Seq[Expression]): UnsafeProjection = { - UnsafeProjection.create(exprs, joinedAttributes) - } - - private def generatePredicate(expr: Expression): BasePredicate = { - GeneratePredicate.generate(expr, joinedAttributes) - } - - def processPartition(rowIterator: Iterator[Row]): Iterator[Row] = { - - val targetRowHasNoMatchPred = generatePredicate(targetRowHasNoMatch) - val sourceRowHasNoMatchPred = generatePredicate(sourceRowHasNoMatch) - val matchedPreds = matchedConditions.map(generatePredicate) - val matchedProjs = matchedOutputs.map(_.map(generateProjection)) - val notMatchedPreds = notMatchedConditions.map(generatePredicate) - val notMatchedProjs = notMatchedOutputs.map(_.map(generateProjection)) - val noopCopyProj = generateProjection(noopCopyOutput) - val deleteRowProj = generateProjection(deleteRowOutput) - val outputProj = UnsafeProjection.create(outputRowEncoder.schema) - - // this is accessing ROW_DROPPED_COL. 
If ROW_DROPPED_COL is not in outputRowEncoder.schema - // then CDC must be disabled and it's the column after our output cols - def shouldDeleteRow(row: InternalRow): Boolean = { - row.getBoolean( - outputRowEncoder.schema.getFieldIndex(ROW_DROPPED_COL) - .getOrElse(outputRowEncoder.schema.fields.size) - ) - } - - def processRow(inputRow: InternalRow): Iterator[InternalRow] = { - if (targetRowHasNoMatchPred.eval(inputRow)) { - // Target row did not match any source row, so just copy it to the output - Iterator(noopCopyProj.apply(inputRow)) - } else { - // identify which set of clauses to execute: matched or not-matched ones - val (predicates, projections, noopAction) = if (sourceRowHasNoMatchPred.eval(inputRow)) { - // Source row did not match with any target row, so insert the new source row - (notMatchedPreds, notMatchedProjs, deleteRowProj) - } else { - // Source row matched with target row, so update the target row - (matchedPreds, matchedProjs, noopCopyProj) - } - - // find (predicate, projection) pair whose predicate satisfies inputRow - val pair = (predicates zip projections).find { - case (predicate, _) => predicate.eval(inputRow) - } - - pair match { - case Some((_, projections)) => - projections.map(_.apply(inputRow)).iterator - case None => Iterator(noopAction.apply(inputRow)) - } - } - } - - val toRow = joinedRowEncoder.createSerializer() - val fromRow = outputRowEncoder.createDeserializer() - rowIterator - .map(toRow) - .flatMap(processRow) - .filter(!shouldDeleteRow(_)) - .map { notDeletedInternalRow => - fromRow(outputProj(notDeletedInternalRow)) - } - } - } - - /** Count the number of distinct partition values among the AddFiles in the given set. */ - def totalBytesAndDistinctPartitionValues(files: Seq[FileAction]): (Long, Int) = { - val distinctValues = new mutable.HashSet[Map[String, String]]() - var bytes = 0L - val iter = files.collect { case a: AddFile => a }.iterator - while (iter.hasNext) { - val file = iter.next() - distinctValues += file.partitionValues - bytes += file.size - } - // If the only distinct value map is an empty map, then it must be an unpartitioned table. - // Return 0 in that case. - val numDistinctValues = - if (distinctValues.size == 1 && distinctValues.head.isEmpty) 0 else distinctValues.size - (bytes, numDistinctValues) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransaction.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransaction.scala deleted file mode 100644 index 03f3592af5a..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransaction.scala +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala - * in the Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.databricks.sql.transaction.tahoe.rapids - -import java.net.URI - -import scala.collection.mutable.ListBuffer - -import com.databricks.sql.transaction.tahoe._ -import com.databricks.sql.transaction.tahoe.actions.FileAction -import com.databricks.sql.transaction.tahoe.commands.cdc.CDCReader -import com.databricks.sql.transaction.tahoe.constraints.{Constraint, Constraints} -import com.databricks.sql.transaction.tahoe.schema.InvariantViolationException -import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf -import com.nvidia.spark.rapids._ -import com.nvidia.spark.rapids.delta._ -import com.nvidia.spark.rapids.shims.ParquetFieldIdShims -import org.apache.commons.lang3.exception.ExceptionUtils -import org.apache.hadoop.fs.Path - -import org.apache.spark.SparkException -import org.apache.spark.sql.{DataFrame, Dataset} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormatWriter} -import org.apache.spark.sql.functions.{col, to_json} -import org.apache.spark.sql.rapids.{BasicColumnarWriteJobStatsTracker, ColumnarWriteJobStatsTracker, GpuFileFormatWriter, GpuWriteJobStatsTracker} -import org.apache.spark.sql.rapids.delta.GpuIdentityColumn -import org.apache.spark.sql.types._ -import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.util.{Clock, SerializableConfiguration} - - -class GpuOptimisticTransaction( - deltaLog: DeltaLog, - snapshot: Snapshot, - rapidsConf: RapidsConf)(implicit clock: Clock) - extends GpuOptimisticTransactionBase(deltaLog, snapshot, rapidsConf)(clock) { - - /** Creates a new OptimisticTransaction. - * - * @param deltaLog The Delta Log for the table this transaction is modifying. - * @param rapidsConf RAPIDS Accelerator config settings - */ - def this(deltaLog: DeltaLog, rapidsConf: RapidsConf)(implicit clock: Clock) = { - this(deltaLog, deltaLog.update(), rapidsConf) - } - - /** - * Returns a tuple of (data, partition schema). For CDC writes, a `__is_cdc` column is added to - * the data and `__is_cdc=true/false` is added to the front of the partition schema. - */ - def performCDCPartition(inputData: Dataset[_]): (DataFrame, StructType) = { - // If this is a CDC write, we need to generate the CDC_PARTITION_COL in order to properly - // dispatch rows between the main table and CDC event records. This is a virtual partition - // and will be stripped out later in [[DelayedCommitProtocolEdge]]. - // Note that the ordering of the partition schema is relevant - CDC_PARTITION_COL must - // come first in order to ensure CDC data lands in the right place. 
- if (CDCReader.isCDCEnabledOnTable(metadata) && - inputData.schema.fieldNames.contains(CDCReader.CDC_TYPE_COLUMN_NAME)) { - val augmentedData = inputData.withColumn( - CDCReader.CDC_PARTITION_COL, col(CDCReader.CDC_TYPE_COLUMN_NAME).isNotNull) - val partitionSchema = StructType( - StructField(CDCReader.CDC_PARTITION_COL, StringType) +: metadata.physicalPartitionSchema) - (augmentedData, partitionSchema) - } else { - (inputData.toDF(), metadata.physicalPartitionSchema) - } - } - - override def writeFiles( - inputData: Dataset[_], - writeOptions: Option[DeltaOptions], - additionalConstraints: Seq[Constraint]): Seq[FileAction] = { - hasWritten = true - - val spark = inputData.sparkSession - val (data, partitionSchema) = performCDCPartition(inputData) - val outputPath = deltaLog.dataPath - - val (normalizedQueryExecution, output, generatedColumnConstraints, dataHighWaterMarks) = - normalizeData(deltaLog, data) - val highWaterMarks = trackHighWaterMarks.getOrElse(dataHighWaterMarks) - - // Build a new plan with a stub GpuDeltaWrite node to work around undesired transitions between - // columns and rows when AQE is involved. Without this node in the plan, AdaptiveSparkPlanExec - // could be the root node of the plan. In that case we do not have enough context to know - // whether the AdaptiveSparkPlanExec should be columnar or not, since the GPU overrides do not - // see how the parent is using the AdaptiveSparkPlanExec outputs. By using this stub node that - // appears to be a data writing node to AQE (it derives from V2CommandExec), the - // AdaptiveSparkPlanExec will be planned as a child of this new node. That provides enough - // context to plan the AQE sub-plan properly with respect to columnar and row transitions. - // We could force the AQE node to be columnar here by explicitly replacing the node, but that - // breaks the connection between the queryExecution and the node that will actually execute. 
- val gpuWritePlan = Dataset.ofRows(spark, RapidsDeltaWrite(normalizedQueryExecution.logical)) - val queryExecution = gpuWritePlan.queryExecution - - val partitioningColumns = getPartitioningColumns(partitionSchema, output) - - val committer = getCommitter(outputPath) - - val partitionColNames = partitionSchema.map(_.name).toSet - - // schema should be normalized, therefore we can do an equality check - val statsDataSchema = output.filterNot(c => partitionColNames.contains(c.name)) - - // If Statistics Collection is enabled, then create a stats tracker that will be injected during - // the FileFormatWriter.write call below and will collect per-file stats using - // StatisticsCollection - val optionalStatsTracker = - if (spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_COLLECT_STATS)) { - val indexedCols = DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(metadata) - val prefixLength = - spark.sessionState.conf.getConf(DeltaSQLConf.DATA_SKIPPING_STRING_PREFIX_LENGTH) - - val _spark = spark - - val statsCollection = new GpuStatisticsCollection { - override val spark = _spark - override val deletionVectorsSupported = false - override val tableDataSchema: StructType = statsDataSchema.toStructType - override val dataSchema: StructType = tableDataSchema - override val numIndexedCols: Int = indexedCols - override val stringPrefixLength: Int = prefixLength - } - - val statsColExpr: Expression = { - val dummyDF = Dataset.ofRows(spark, LocalRelation(statsDataSchema)) - dummyDF.select(to_json(statsCollection.statsCollector)) - .queryExecution.analyzed.expressions.head - } - - val statsSchema = statsCollection.statCollectionSchema - val explodedDataSchema = statsCollection.explodedDataSchema - val batchStatsToRow = (batch: ColumnarBatch, row: InternalRow) => { - GpuStatisticsCollection.batchStatsToRow(statsSchema, explodedDataSchema, batch, row) - } - Some(new GpuDeltaJobStatisticsTracker(statsDataSchema, statsColExpr, batchStatsToRow)) - } else { - None - } - - val identityTracker = GpuIdentityColumn.createIdentityColumnStatsTracker( - spark, - statsDataSchema, - metadata.schema, - highWaterMarks) - - val constraints = - Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints - - val isOptimize = isOptimizeCommand(queryExecution.analyzed) - - SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { - val outputSpec = FileFormatWriter.OutputSpec( - outputPath.toString, - Map.empty, - output) - - // Remove any unnecessary row conversions added as part of Spark planning - val queryPhysicalPlan = queryExecution.executedPlan match { - case GpuColumnarToRowExec(child, _) => child - case p => p - } - val gpuRapidsWrite = queryPhysicalPlan match { - case g: GpuRapidsDeltaWriteExec => Some(g) - case _ => None - } - - val empty2NullPlan = convertEmptyToNullIfNeeded(queryPhysicalPlan, - partitioningColumns, constraints) - val optimizedPlan = - applyOptimizeWriteIfNeeded(spark, empty2NullPlan, partitionSchema, isOptimize) - val planWithInvariants = addInvariantChecks(optimizedPlan, constraints) - val physicalPlan = convertToGpu(planWithInvariants) - - val statsTrackers: ListBuffer[ColumnarWriteJobStatsTracker] = ListBuffer() - - val hadoopConf = spark.sessionState.newHadoopConfWithOptions( - metadata.configuration ++ deltaLog.options) - if (metadata.columnMappingMode == IdMapping) { - // Need Parquet field IDs when doing column ID mapping - ParquetFieldIdShims.setWriteIdOverride(hadoopConf, true) - } - - if 
(spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { - val serializableHadoopConf = new SerializableConfiguration(hadoopConf) - val basicWriteJobStatsTracker = new BasicColumnarWriteJobStatsTracker( - serializableHadoopConf, - BasicWriteJobStatsTracker.metrics) - registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics) - statsTrackers.append(basicWriteJobStatsTracker) - gpuRapidsWrite.foreach { grw => - val tracker = new GpuWriteJobStatsTracker(serializableHadoopConf, - grw.basicMetrics, grw.taskMetrics) - statsTrackers.append(tracker) - } - } - - val options = writeOptions match { - case None => Map.empty[String, String] - case Some(writeOptions) => writeOptions.options - } - - val deltaFileFormat = deltaLog.fileFormat(metadata) - val gpuFileFormat = if (deltaFileFormat.getClass == classOf[DeltaParquetFileFormat]) { - new GpuParquetFileFormat - } else { - throw new IllegalStateException(s"file format $deltaFileFormat is not supported") - } - - try { - logDebug(s"Physical plan for write:\n$physicalPlan") - GpuFileFormatWriter.write( - sparkSession = spark, - plan = physicalPlan, - fileFormat = gpuFileFormat, - committer = committer, - outputSpec = outputSpec, - hadoopConf = hadoopConf, - partitionColumns = partitioningColumns, - bucketSpec = None, - statsTrackers = optionalStatsTracker.toSeq ++ identityTracker.toSeq ++ statsTrackers, - options = options, - rapidsConf.stableSort, - rapidsConf.concurrentWriterPartitionFlushSize) - } catch { - case s: SparkException => - // Pull an InvariantViolationException up to the top level if it was the root cause. - val violationException = ExceptionUtils.getRootCause(s) - if (violationException.isInstanceOf[InvariantViolationException]) { - throw violationException - } else { - throw s - } - } - } - - val resultFiles = committer.addedStatuses.map { a => - a.copy(stats = optionalStatsTracker.map( - _.recordedStats(new Path(new URI(a.path)).getName)).getOrElse(a.stats)) - } - - identityTracker.foreach { tracker => - updatedIdentityHighWaterMarks.appendAll(tracker.highWaterMarks.toSeq) - } - val fileActions = resultFiles.toSeq ++ committer.changeFiles - - // Check if auto-compaction is enabled. - // (Auto compaction checks are derived from the work in - // https://github.com/delta-io/delta/pull/1156). - lazy val autoCompactEnabled = - spark.sessionState.conf - .getConf[String](DeltaSQLConf.DELTA_AUTO_COMPACT_ENABLED) - .getOrElse { - DeltaConfigs.AUTO_COMPACT.fromMetaData(metadata) - .getOrElse("false") - }.toBoolean - - if (!isOptimize && autoCompactEnabled && fileActions.nonEmpty) { - registerPostCommitHook(GpuDoAutoCompaction) - } - - fileActions - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimizeExecutor.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimizeExecutor.scala deleted file mode 100644 index 04ceb52def3..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimizeExecutor.scala +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from: - * 1. DoAutoCompaction.scala from PR#1156 at https://github.com/delta-io/delta/pull/1156, - * 2. OptimizeTableCommand.scala from the Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.databricks.sql.transaction.tahoe.rapids - -import java.util.ConcurrentModificationException - -import scala.annotation.tailrec -import scala.collection.mutable.ArrayBuffer - -import com.databricks.sql.transaction.tahoe._ -import com.databricks.sql.transaction.tahoe.DeltaOperations.Operation -import com.databricks.sql.transaction.tahoe.actions.{Action, AddFile, FileAction, RemoveFile} -import com.databricks.sql.transaction.tahoe.commands.DeltaCommand -import com.databricks.sql.transaction.tahoe.commands.optimize._ -import com.databricks.sql.transaction.tahoe.files.SQLMetricsReporting -import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf -import com.nvidia.spark.rapids.delta.RapidsDeltaSQLConf - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext.SPARK_JOB_GROUP_ID -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric -import org.apache.spark.util.ThreadUtils - -class GpuOptimizeExecutor( - sparkSession: SparkSession, - txn: OptimisticTransaction, - partitionPredicate: Seq[Expression], - zOrderByColumns: Seq[String], - prevCommitActions: Seq[Action]) - extends DeltaCommand with SQLMetricsReporting with Serializable { - - /** Timestamp to use in [[FileAction]] */ - private val operationTimestamp = System.currentTimeMillis - - private val isMultiDimClustering = zOrderByColumns.nonEmpty - private val isAutoCompact = prevCommitActions.nonEmpty - private val optimizeType = GpuOptimizeType(isMultiDimClustering, isAutoCompact) - - def optimize(): Seq[Row] = { - recordDeltaOperation(txn.deltaLog, "delta.optimize") { - val maxFileSize = optimizeType.maxFileSize - require(maxFileSize > 0, "maxFileSize must be > 0") - - val minNumFilesInDir = optimizeType.minNumFiles - val (candidateFiles, filesToProcess) = optimizeType.targetFiles - val partitionSchema = txn.metadata.partitionSchema - - // select all files in case of multi-dimensional clustering - val partitionsToCompact = filesToProcess - .groupBy(_.partitionValues) - .filter { case (_, filesInPartition) => filesInPartition.size >= minNumFilesInDir } - .toSeq - - val groupedJobs = groupFilesIntoBins(partitionsToCompact, maxFileSize) - val jobs = optimizeType.targetBins(groupedJobs) - - val maxThreads = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS) - val updates = ThreadUtils.parmap(jobs, "OptimizeJob", maxThreads) { partitionBinGroup => - runOptimizeBinJob(txn, partitionBinGroup._1, partitionBinGroup._2, maxFileSize) - }.flatten - - val addedFiles = updates.collect { case a: AddFile => a } - val removedFiles = updates.collect { case r: RemoveFile => r } - if (addedFiles.nonEmpty) { - val operation = DeltaOperations.Optimize(partitionPredicate, zOrderByColumns) - val metrics = createMetrics(sparkSession.sparkContext, addedFiles, 
removedFiles) - commitAndRetry(txn, operation, updates, metrics) { newTxn => - val newPartitionSchema = newTxn.metadata.partitionSchema - val candidateSetOld = candidateFiles.map(_.path).toSet - val candidateSetNew = newTxn.filterFiles(partitionPredicate).map(_.path).toSet - - // As long as all of the files that we compacted are still part of the table, - // and the partitioning has not changed it is valid to continue to try - // and commit this checkpoint. - if (candidateSetOld.subsetOf(candidateSetNew) && partitionSchema == newPartitionSchema) { - true - } else { - val deleted = candidateSetOld -- candidateSetNew - logWarning(s"The following compacted files were delete " + - s"during checkpoint ${deleted.mkString(",")}. Aborting the compaction.") - false - } - } - } - - val optimizeStats = OptimizeStats() - optimizeStats.addedFilesSizeStats.merge(addedFiles) - optimizeStats.removedFilesSizeStats.merge(removedFiles) - optimizeStats.numPartitionsOptimized = jobs.map(j => j._1).distinct.size - optimizeStats.numBatches = jobs.size - optimizeStats.totalConsideredFiles = candidateFiles.size - optimizeStats.totalFilesSkipped = optimizeStats.totalConsideredFiles - removedFiles.size - - if (isMultiDimClustering) { - val inputFileStats = - ZOrderFileStats(removedFiles.size, removedFiles.map(_.size.getOrElse(0L)).sum) - optimizeStats.zOrderStats = Some(ZOrderStats( - strategyName = "all", // means process all files in a partition - inputCubeFiles = ZOrderFileStats(0, 0), - inputOtherFiles = inputFileStats, - inputNumCubes = 0, - mergedFiles = inputFileStats, - // There will one z-cube for each partition - numOutputCubes = optimizeStats.numPartitionsOptimized)) - } - - return Seq(Row(txn.deltaLog.dataPath.toString, optimizeStats.toOptimizeMetrics)) - } - } - - /** - * Utility methods to group files into bins for optimize. - * - * @param partitionsToCompact List of files to compact group by partition. - * Partition is defined by the partition values (partCol -> partValue) - * @param maxTargetFileSize Max size (in bytes) of the compaction output file. - * @return Sequence of bins. Each bin contains one or more files from the same - * partition and targeted for one output file. - */ - private def groupFilesIntoBins( - partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])], - maxTargetFileSize: Long): Seq[(Map[String, String], Seq[AddFile])] = { - - partitionsToCompact.flatMap { - case (partition, files) => - val bins = new ArrayBuffer[Seq[AddFile]]() - - val currentBin = new ArrayBuffer[AddFile]() - var currentBinSize = 0L - - files.sortBy(_.size).foreach { file => - // Generally, a bin is a group of existing files, whose total size does not exceed the - // desired maxFileSize. They will be coalesced into a single output file. - // However, if isMultiDimClustering = true, all files in a partition will be read by the - // same job, the data will be range-partitioned and numFiles = totalFileSize / maxFileSize - // will be produced. See below. 
- if (file.size + currentBinSize > maxTargetFileSize && !isMultiDimClustering) { - bins += currentBin.toVector - currentBin.clear() - currentBin += file - currentBinSize = file.size - } else { - currentBin += file - currentBinSize += file.size - } - } - - if (currentBin.nonEmpty) { - bins += currentBin.toVector - } - - bins.map(b => (partition, b)) - // select bins that have at least two files or in case of multi-dim clustering - // select all bins - .filter(_._2.size > 1 || isMultiDimClustering) - } - } - - /** - * Utility method to run a Spark job to compact the files in given bin - * - * @param txn [[OptimisticTransaction]] instance in use to commit the changes to DeltaLog. - * @param partition Partition values of the partition that files in [[bin]] belongs to. - * @param bin List of files to compact into one large file. - * @param maxFileSize Targeted output file size in bytes - */ - private def runOptimizeBinJob( - txn: OptimisticTransaction, - partition: Map[String, String], - bin: Seq[AddFile], - maxFileSize: Long): Seq[FileAction] = { - val baseTablePath = txn.deltaLog.dataPath - - val input = txn.deltaLog.createDataFrame(txn.snapshot, bin, actionTypeOpt = Some("Optimize")) - val repartitionDF = if (isMultiDimClustering) { - // TODO: MultiDimClustering is not currently supported on Databricks 10.4. - // val totalSize = bin.map(_.size).sum - // val approxNumFiles = Math.max(1, totalSize / maxFileSize).toInt - // MultiDimClustering.cluster( - // txn.deltaLog, - // input, - // approxNumFiles, - // zOrderByColumns) - throw new UnsupportedOperationException("MultiDimClustering not supported on compaction") - } else { - // Re-partition is not available in Databricks 10.4 (spark321db) - input.coalesce(numPartitions = 1) - } - - val partitionDesc = partition.toSeq.map(entry => entry._1 + "=" + entry._2).mkString(",") - - val partitionName = if (partition.isEmpty) "" else s" in partition ($partitionDesc)" - val description = s"$baseTablePath
Optimizing ${bin.size} files" + partitionName - sparkSession.sparkContext.setJobGroup( - sparkSession.sparkContext.getLocalProperty(SPARK_JOB_GROUP_ID), - description) - - val addFiles = txn.writeFiles(repartitionDF).collect { - case a: AddFile => - a.copy(dataChange = false) - case other => - throw new IllegalStateException( - s"Unexpected action $other with type ${other.getClass}. File compaction job output" + - s"should only have AddFiles") - } - val removeFiles = bin.map(f => f.removeWithTimestamp(operationTimestamp, dataChange = false)) - val updates = addFiles ++ removeFiles - updates - } - - private type PartitionedBin = (Map[String, String], Seq[AddFile]) - - private trait GpuOptimizeType { - def minNumFiles: Long - - def maxFileSize: Long = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_FILE_SIZE) - - def targetFiles: (Seq[AddFile], Seq[AddFile]) - - def targetBins(jobs: Seq[PartitionedBin]): Seq[PartitionedBin] = jobs - } - - private case class GpuCompaction() extends GpuOptimizeType { - def minNumFiles: Long = 2 - - def targetFiles: (Seq[AddFile], Seq[AddFile]) = { - val minFileSize = sparkSession.sessionState.conf.getConf( - DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE) - require(minFileSize > 0, "minFileSize must be > 0") - val candidateFiles = txn.filterFiles(partitionPredicate) - val filesToProcess = candidateFiles.filter(_.size < minFileSize) - (candidateFiles, filesToProcess) - } - } - - private case class GpuMultiDimOrdering() extends GpuOptimizeType { - def minNumFiles: Long = 1 - - def targetFiles: (Seq[AddFile], Seq[AddFile]) = { - // select all files in case of multi-dimensional clustering - val candidateFiles = txn.filterFiles(partitionPredicate) - (candidateFiles, candidateFiles) - } - } - - private case class GpuAutoCompaction() extends GpuOptimizeType { - def minNumFiles: Long = { - val minNumFiles = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES) - require(minNumFiles > 0, "minNumFiles must be > 0") - minNumFiles - } - - override def maxFileSize: Long = - sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_AUTO_COMPACT_MAX_FILE_SIZE) - .getOrElse(128 * 1024 * 1024) - - override def targetFiles: (Seq[AddFile], Seq[AddFile]) = { - val autoCompactTarget = - sparkSession.sessionState.conf.getConf(RapidsDeltaSQLConf.AUTO_COMPACT_TARGET) - // Filter the candidate files according to autoCompact.target config. - lazy val addedFiles = prevCommitActions.collect { case a: AddFile => a } - val candidateFiles = autoCompactTarget match { - case "table" => - txn.filterFiles() - case "commit" => - addedFiles - case "partition" => - val eligiblePartitions = addedFiles.map(_.partitionValues).toSet - txn.filterFiles().filter(f => eligiblePartitions.contains(f.partitionValues)) - case _ => - logError(s"Invalid config for autoCompact.target: $autoCompactTarget. " + - s"Falling back to the default value 'table'.") - txn.filterFiles() - } - val filesToProcess = candidateFiles.filter(_.size < maxFileSize) - (candidateFiles, filesToProcess) - } - - override def targetBins(jobs: Seq[PartitionedBin]): Seq[PartitionedBin] = { - var acc = 0L - val maxCompactBytes = - sparkSession.sessionState.conf.getConf(RapidsDeltaSQLConf.AUTO_COMPACT_MAX_COMPACT_BYTES) - // bins with more files are prior to less files. 
- jobs - .sortBy { case (_, filesInBin) => -filesInBin.length } - .takeWhile { case (_, filesInBin) => - acc += filesInBin.map(_.size).sum - acc <= maxCompactBytes - } - } - } - - private object GpuOptimizeType { - - def apply(isMultiDimClustering: Boolean, isAutoCompact: Boolean): GpuOptimizeType = { - if (isMultiDimClustering) { - GpuMultiDimOrdering() - } else if (isAutoCompact) { - GpuAutoCompaction() - } else { - GpuCompaction() - } - } - } - - /** - * Attempts to commit the given actions to the log. In the case of a concurrent update, - * the given function will be invoked with a new transaction to allow custom conflict - * detection logic to indicate it is safe to try again, by returning `true`. - * - * This function will continue to try to commit to the log as long as `f` returns `true`, - * otherwise throws a subclass of [[ConcurrentModificationException]]. - */ - @tailrec - private def commitAndRetry( - txn: OptimisticTransaction, - optimizeOperation: Operation, - actions: Seq[Action], - metrics: Map[String, SQLMetric])(f: OptimisticTransaction => Boolean) - : Unit = { - try { - txn.registerSQLMetrics(sparkSession, metrics) - txn.commit(actions, optimizeOperation) - } catch { - case e: ConcurrentModificationException => - val newTxn = txn.deltaLog.startTransaction() - if (f(newTxn)) { - logInfo("Retrying commit after checking for semantic conflicts with concurrent updates.") - commitAndRetry(newTxn, optimizeOperation, actions, metrics)(f) - } else { - logWarning("Semantic conflicts detected. Aborting operation.") - throw e - } - } - } - - /** Create a map of SQL metrics for adding to the commit history. */ - private def createMetrics( - sparkContext: SparkContext, - addedFiles: Seq[AddFile], - removedFiles: Seq[RemoveFile]): Map[String, SQLMetric] = { - - def setAndReturnMetric(description: String, value: Long) = { - val metric = createMetric(sparkContext, description) - metric.set(value) - metric - } - - def totalSize(actions: Seq[FileAction]): Long = { - var totalSize = 0L - actions.foreach { file => - val fileSize = file match { - case addFile: AddFile => addFile.size - case removeFile: RemoveFile => removeFile.size.getOrElse(0L) - case default => - throw new IllegalArgumentException(s"Unknown FileAction type: ${default.getClass}") - } - totalSize += fileSize - } - totalSize - } - - val sizeStats = FileSizeStatsWithHistogram.create(addedFiles.map(_.size).sorted) - Map[String, SQLMetric]( - "minFileSize" -> setAndReturnMetric("minimum file size", sizeStats.get.min), - "p25FileSize" -> setAndReturnMetric("25th percentile file size", sizeStats.get.p25), - "p50FileSize" -> setAndReturnMetric("50th percentile file size", sizeStats.get.p50), - "p75FileSize" -> setAndReturnMetric("75th percentile file size", sizeStats.get.p75), - "maxFileSize" -> setAndReturnMetric("maximum file size", sizeStats.get.max), - "numAddedFiles" -> setAndReturnMetric("total number of files added.", addedFiles.size), - "numRemovedFiles" -> setAndReturnMetric("total number of files removed.", removedFiles.size), - "numAddedBytes" -> setAndReturnMetric("total number of bytes added", totalSize(addedFiles)), - "numRemovedBytes" -> - setAndReturnMetric("total number of bytes removed", totalSize(removedFiles))) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuUpdateCommand.scala b/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuUpdateCommand.scala deleted file mode 100644 index 4bcffe8768e..00000000000 --- 
a/delta-lake/delta-spark321db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuUpdateCommand.scala +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * This file was derived from UpdateCommand.scala - * in the Delta Lake project at https://github.com/delta-io/delta. - * - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.databricks.sql.transaction.tahoe.rapids - -import com.databricks.sql.transaction.tahoe.{DeltaConfigs, DeltaLog, DeltaOperations, DeltaTableUtils, OptimisticTransaction} -import com.databricks.sql.transaction.tahoe.actions.{AddCDCFile, AddFile, FileAction} -import com.databricks.sql.transaction.tahoe.commands.{DeltaCommand, UpdateCommandEdge, UpdateMetric} -import com.databricks.sql.transaction.tahoe.commands.cdc.CDCReader.{CDC_TYPE_COLUMN_NAME, CDC_TYPE_NOT_CDC, CDC_TYPE_UPDATE_POSTIMAGE, CDC_TYPE_UPDATE_PREIMAGE} -import com.databricks.sql.transaction.tahoe.files.{TahoeBatchFileIndex, TahoeFileIndex} -import com.nvidia.spark.rapids.delta.GpuDeltaMetricUpdateUDF -import org.apache.hadoop.fs.Path - -import org.apache.spark.SparkContext -import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} -import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, If, Literal} -import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.command.LeafRunnableCommand -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric} -import org.apache.spark.sql.functions.{array, col, explode, input_file_name, lit, struct, typedLit, udf} -import org.apache.spark.sql.types.LongType - -case class GpuUpdateCommand( - gpuDeltaLog: GpuDeltaLog, - tahoeFileIndex: TahoeFileIndex, - target: LogicalPlan, - updateExpressions: Seq[Expression], - condition: Option[Expression]) - extends LeafRunnableCommand with DeltaCommand { - - override val output: Seq[Attribute] = { - Seq(AttributeReference("num_affected_rows", LongType)()) - } - - override def innerChildren: Seq[QueryPlan[_]] = Seq(target) - - @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() - - override lazy val metrics = Map[String, SQLMetric]( - "numAddedFiles" -> createMetric(sc, "number of files added."), - "numRemovedFiles" -> createMetric(sc, "number of files removed."), - "numUpdatedRows" -> createMetric(sc, "number of rows updated."), - "numCopiedRows" -> createMetric(sc, "number of rows copied."), - "executionTimeMs" -> - createTimingMetric(sc, "time taken to execute the entire operation"), - "scanTimeMs" -> - createTimingMetric(sc, "time taken to scan the files for matches"), - "rewriteTimeMs" -> - createTimingMetric(sc, "time 
taken to rewrite the matched files"), - "numAddedChangeFiles" -> createMetric(sc, "number of change data capture files generated"), - "changeFileBytes" -> createMetric(sc, "total size of change data capture files generated"), - "numTouchedRows" -> createMetric(sc, "number of rows touched (copied + updated)") - ) - - final override def run(sparkSession: SparkSession): Seq[Row] = { - recordDeltaOperation(tahoeFileIndex.deltaLog, "delta.dml.update") { - val deltaLog = tahoeFileIndex.deltaLog - deltaLog.assertRemovable() - gpuDeltaLog.withNewTransaction { txn => - performUpdate(sparkSession, deltaLog, txn) - } - // Re-cache all cached plans(including this relation itself, if it's cached) that refer to - // this data source relation. - sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) - } - Seq(Row(metrics("numUpdatedRows").value)) - } - - private def performUpdate( - sparkSession: SparkSession, deltaLog: DeltaLog, txn: OptimisticTransaction): Unit = { - import sparkSession.implicits._ - - var numTouchedFiles: Long = 0 - var numRewrittenFiles: Long = 0 - var numAddedChangeFiles: Long = 0 - var changeFileBytes: Long = 0 - var scanTimeMs: Long = 0 - var rewriteTimeMs: Long = 0 - - val startTime = System.nanoTime() - val numFilesTotal = txn.snapshot.numOfFiles - - val updateCondition = condition.getOrElse(Literal.TrueLiteral) - val (metadataPredicates, dataPredicates) = - DeltaTableUtils.splitMetadataAndDataPredicates( - updateCondition, txn.metadata.partitionColumns, sparkSession) - val candidateFiles = txn.filterFiles(metadataPredicates ++ dataPredicates) - val nameToAddFile = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) - - scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - - val filesToRewrite: Seq[AddFile] = if (candidateFiles.isEmpty) { - // Case 1: Do nothing if no row qualifies the partition predicates - // that are part of Update condition - Nil - } else if (dataPredicates.isEmpty) { - // Case 2: Update all the rows from the files that are in the specified partitions - // when the data filter is empty - candidateFiles - } else { - // Case 3: Find all the affected files using the user-specified condition - val fileIndex = new TahoeBatchFileIndex( - sparkSession, "update", candidateFiles, deltaLog, tahoeFileIndex.path, txn.snapshot) - // Keep everything from the resolved target except a new TahoeFileIndex - // that only involves the affected files instead of all files. 
- val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) - val data = Dataset.ofRows(sparkSession, newTarget) - val updatedRowCount = metrics("numUpdatedRows") - val updatedRowUdf = udf { - new GpuDeltaMetricUpdateUDF(updatedRowCount) - }.asNondeterministic() - val pathsToRewrite = - withStatusCode("DELTA", GpuUpdateCommand.FINDING_TOUCHED_FILES_MSG) { - data.filter(new Column(updateCondition)) - .select(input_file_name()) - .filter(updatedRowUdf()) - .distinct() - .as[String] - .collect() - } - - scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - - pathsToRewrite.map(getTouchedFile(deltaLog.dataPath, _, nameToAddFile)).toSeq - } - - numTouchedFiles = filesToRewrite.length - - val newActions = if (filesToRewrite.isEmpty) { - // Do nothing if no row qualifies the UPDATE condition - Nil - } else { - // Generate the new files containing the updated values - withStatusCode("DELTA", GpuUpdateCommand.rewritingFilesMsg(filesToRewrite.size)) { - rewriteFiles(sparkSession, txn, tahoeFileIndex.path, - filesToRewrite.map(_.path), nameToAddFile, updateCondition) - } - } - - rewriteTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - scanTimeMs - - val (changeActions, addActions) = newActions.partition(_.isInstanceOf[AddCDCFile]) - numRewrittenFiles = addActions.size - numAddedChangeFiles = changeActions.size - changeFileBytes = changeActions.collect { case f: AddCDCFile => f.size }.sum - - val totalActions = if (filesToRewrite.isEmpty) { - // Do nothing if no row qualifies the UPDATE condition - Nil - } else { - // Delete the old files and return those delete actions along with the new AddFile actions for - // files containing the updated values - val operationTimestamp = System.currentTimeMillis() - val deleteActions = filesToRewrite.map(_.removeWithTimestamp(operationTimestamp)) - - deleteActions ++ newActions - } - - if (totalActions.nonEmpty) { - metrics("numAddedFiles").set(numRewrittenFiles) - metrics("numAddedChangeFiles").set(numAddedChangeFiles) - metrics("changeFileBytes").set(changeFileBytes) - metrics("numRemovedFiles").set(numTouchedFiles) - metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) - metrics("scanTimeMs").set(scanTimeMs) - metrics("rewriteTimeMs").set(rewriteTimeMs) - // In the case where the numUpdatedRows is not captured, we can siphon out the metrics from - // the BasicWriteStatsTracker. This is for case 2 where the update condition contains only - // metadata predicates and so the entire partition is re-written. - val outputRows = txn.getMetric("numOutputRows").map(_.value).getOrElse(-1L) - if (metrics("numUpdatedRows").value == 0 && outputRows != 0 && - metrics("numCopiedRows").value == 0) { - // We know that numTouchedRows = numCopiedRows + numUpdatedRows. - // Since an entire partition was re-written, no rows were copied. - // So numTouchedRows == numUpdateRows - metrics("numUpdatedRows").set(metrics("numTouchedRows").value) - } else { - // This is for case 3 where the update condition contains both metadata and data predicates - // so relevant files will have some rows updated and some rows copied. We don't need to - // consider case 1 here, where no files match the update condition, as we know that - // `totalActions` is empty. 
- metrics("numCopiedRows").set( - metrics("numTouchedRows").value - metrics("numUpdatedRows").value) - } - txn.registerSQLMetrics(sparkSession, metrics) - txn.commit(totalActions, DeltaOperations.Update(condition)) - // This is needed to make the SQL metrics visible in the Spark UI - val executionId = sparkSession.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - SQLMetrics.postDriverMetricUpdates( - sparkSession.sparkContext, executionId, metrics.values.toSeq) - } - - recordDeltaEvent( - deltaLog, - "delta.dml.update.stats", - data = UpdateMetric( - condition = condition.map(_.sql).getOrElse("true"), - numFilesTotal, - numTouchedFiles, - numRewrittenFiles, - numAddedChangeFiles, - changeFileBytes, - scanTimeMs, - rewriteTimeMs) - ) - } - - /** - * Scan all the affected files and write out the updated files. - * - * When CDF is enabled, includes the generation of CDC preimage and postimage columns for - * changed rows. - * - * @return the list of [[AddFile]]s and [[AddCDCFile]]s that have been written. - */ - private def rewriteFiles( - spark: SparkSession, - txn: OptimisticTransaction, - rootPath: Path, - inputLeafFiles: Seq[String], - nameToAddFileMap: Map[String, AddFile], - condition: Expression): Seq[FileAction] = { - // Containing the map from the relative file path to AddFile - val baseRelation = buildBaseRelation( - spark, txn, "update", rootPath, inputLeafFiles, nameToAddFileMap) - val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) - val targetDf = Dataset.ofRows(spark, newTarget) - - // Number of total rows that we have seen, i.e. are either copying or updating (sum of both). - // This will be used later, along with numUpdatedRows, to determine numCopiedRows. - val numTouchedRows = metrics("numTouchedRows") - val numTouchedRowsUdf = udf { - new GpuDeltaMetricUpdateUDF(numTouchedRows) - }.asNondeterministic() - - val updatedDataFrame = GpuUpdateCommand.withUpdatedColumns( - target, - updateExpressions, - condition, - targetDf - .filter(numTouchedRowsUdf()) - .withColumn(GpuUpdateCommand.CONDITION_COLUMN_NAME, new Column(condition)), - GpuUpdateCommand.shouldOutputCdc(txn)) - - txn.writeFiles(updatedDataFrame) - } -} - -object GpuUpdateCommand { - val CONDITION_COLUMN_NAME = UpdateCommandEdge.CONDITION_COLUMN_NAME - val FINDING_TOUCHED_FILES_MSG: String = "Finding files to rewrite for UPDATE operation" - - def rewritingFilesMsg(numFilesToRewrite: Long): String = - s"Rewriting $numFilesToRewrite files for UPDATE operation" - - /** - * Whether or not CDC is enabled on this table and, thus, if we should output CDC data during this - * UPDATE operation. - */ - def shouldOutputCdc(txn: OptimisticTransaction): Boolean = { - DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(txn.metadata) - } - - /** - * Build the new columns. If the condition matches, generate the new value using - * the corresponding UPDATE EXPRESSION; otherwise, keep the original column value. - * - * When CDC is enabled, includes the generation of CDC pre-image and post-image columns for - * changed rows. - * - * @param target target we are updating into - * @param updateExpressions the update transformation to perform on the input DataFrame - * @param dfWithEvaluatedCondition source DataFrame on which we will apply the update expressions - * with an additional column CONDITION_COLUMN_NAME which is the - * true/false value of if the update condition is satisfied - * @param condition update condition - * @param shouldOutputCdc if we should output CDC data during this UPDATE operation. 
- * @return the updated DataFrame, with extra CDC columns if CDC is enabled - */ - def withUpdatedColumns( - target: LogicalPlan, - updateExpressions: Seq[Expression], - condition: Expression, - dfWithEvaluatedCondition: DataFrame, - shouldOutputCdc: Boolean): DataFrame = { - val resultDf = if (shouldOutputCdc) { - val namedUpdateCols = updateExpressions.zip(target.output).map { - case (expr, targetCol) => new Column(expr).as(targetCol.name) - } - - // Build an array of output rows to be unpacked later. If the condition is matched, we - // generate CDC pre and postimages in addition to the final output row; if the condition - // isn't matched, we just generate a rewritten no-op row without any CDC events. - val preimageCols = target.output.map(new Column(_)) :+ - lit(CDC_TYPE_UPDATE_PREIMAGE).as(CDC_TYPE_COLUMN_NAME) - val postimageCols = namedUpdateCols :+ - lit(CDC_TYPE_UPDATE_POSTIMAGE).as(CDC_TYPE_COLUMN_NAME) - val updatedDataCols = namedUpdateCols :+ - typedLit[String](CDC_TYPE_NOT_CDC).as(CDC_TYPE_COLUMN_NAME) - val noopRewriteCols = target.output.map(new Column(_)) :+ - typedLit[String](CDC_TYPE_NOT_CDC).as(CDC_TYPE_COLUMN_NAME) - val packedUpdates = array( - struct(preimageCols: _*), - struct(postimageCols: _*), - struct(updatedDataCols: _*) - ).expr - - val packedData = if (condition == Literal.TrueLiteral) { - packedUpdates - } else { - If( - UnresolvedAttribute(CONDITION_COLUMN_NAME), - packedUpdates, // if it should be updated, then use `packagedUpdates` - array(struct(noopRewriteCols: _*)).expr) // else, this is a noop rewrite - } - - // Explode the packed array, and project back out the final data columns. - val finalColNames = target.output.map(_.name) :+ CDC_TYPE_COLUMN_NAME - dfWithEvaluatedCondition - .select(explode(new Column(packedData)).as("packedData")) - .select(finalColNames.map { n => col(s"packedData.`$n`").as(s"$n") }: _*) - } else { - val finalCols = updateExpressions.zip(target.output).map { case (update, original) => - val updated = if (condition == Literal.TrueLiteral) { - update - } else { - If(UnresolvedAttribute(CONDITION_COLUMN_NAME), update, original) - } - new Column(Alias(updated, original.name)()) - } - - dfWithEvaluatedCondition.select(finalCols: _*) - } - - resultDf.drop(CONDITION_COLUMN_NAME) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProbe.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProbe.scala deleted file mode 100644 index 2194522ab82..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProbe.scala +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.nvidia.spark.rapids.delta - -/** - * Implements the Delta Probe interface for probing the Delta Lake provider on Databricks. - * @note This is instantiated via reflection from ShimLoader. 
- */ -class DeltaProbeImpl extends DeltaProbe { - // Delta Lake is built-in for Databricks instances, so no probing is necessary. - override def getDeltaProvider: DeltaProvider = DeltaSpark321DBProvider -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/DeltaSpark321DBProvider.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/DeltaSpark321DBProvider.scala deleted file mode 100644 index 44e5721bafc..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/DeltaSpark321DBProvider.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.delta - -import com.databricks.sql.transaction.tahoe.rapids.GpuDeltaCatalog -import com.nvidia.spark.rapids.{AtomicCreateTableAsSelectExecMeta, AtomicReplaceTableAsSelectExecMeta, GpuExec} - -import org.apache.spark.sql.execution.datasources.v2.{AtomicCreateTableAsSelectExec, AtomicReplaceTableAsSelectExec} -import org.apache.spark.sql.execution.datasources.v2.rapids.{GpuAtomicCreateTableAsSelectExec, GpuAtomicReplaceTableAsSelectExec} - -object DeltaSpark321DBProvider extends DatabricksDeltaProviderBase { - - override def convertToGpu( - cpuExec: AtomicCreateTableAsSelectExec, - meta: AtomicCreateTableAsSelectExecMeta): GpuExec = { - GpuAtomicCreateTableAsSelectExec( - cpuExec.output, - new GpuDeltaCatalog(cpuExec.catalog, meta.conf), - cpuExec.ident, - cpuExec.partitioning, - cpuExec.plan, - meta.childPlans.head.convertIfNeeded(), - cpuExec.tableSpec, - cpuExec.writeOptions, - cpuExec.ifNotExists) - } - - override def convertToGpu( - cpuExec: AtomicReplaceTableAsSelectExec, - meta: AtomicReplaceTableAsSelectExecMeta): GpuExec = { - GpuAtomicReplaceTableAsSelectExec( - cpuExec.output, - new GpuDeltaCatalog(cpuExec.catalog, meta.conf), - cpuExec.ident, - cpuExec.partitioning, - cpuExec.plan, - meta.childPlans.head.convertIfNeeded(), - cpuExec.tableSpec, - cpuExec.writeOptions, - cpuExec.orCreate, - cpuExec.invalidateCache) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala deleted file mode 100644 index 773da5eb0bf..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.delta - -import com.databricks.sql.transaction.tahoe.{DeltaColumnMappingMode, DeltaParquetFileFormat} -import com.nvidia.spark.rapids.SparkPlanMeta - -import org.apache.spark.sql.execution.FileSourceScanExec -import org.apache.spark.sql.types.StructType - -case class GpuDeltaParquetFileFormat( - override val columnMappingMode: DeltaColumnMappingMode, - override val referenceSchema: StructType) extends GpuDeltaParquetFileFormatBase { -} - -object GpuDeltaParquetFileFormat { - def tagSupportForGpuFileSourceScan(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {} - - def convertToGpu(format: DeltaParquetFileFormat): GpuDeltaParquetFileFormat = { - GpuDeltaParquetFileFormat(format.columnMappingMode, format.referenceSchema) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeleteCommandMetaShim.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeleteCommandMetaShim.scala deleted file mode 100644 index bf401417551..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeleteCommandMetaShim.scala +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.delta.shims - -import com.nvidia.spark.rapids.delta.{DeleteCommandEdgeMeta, DeleteCommandMeta} - -object DeleteCommandMetaShim { - def tagForGpu(meta: DeleteCommandMeta): Unit = {} - def tagForGpu(meta: DeleteCommandEdgeMeta): Unit = {} -} \ No newline at end of file diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/InvariantViolationExceptionShim.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/InvariantViolationExceptionShim.scala deleted file mode 100644 index c6c9001c4a7..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/InvariantViolationExceptionShim.scala +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.nvidia.spark.rapids.delta.shims - -import com.databricks.sql.transaction.tahoe.constraints.Constraints._ -import com.databricks.sql.transaction.tahoe.schema.InvariantViolationException - -object InvariantViolationExceptionShim { - def apply(c: Check, m: Map[String, Any]): InvariantViolationException = { - InvariantViolationException(c, m) - } - - def apply(c: NotNull): InvariantViolationException = { - InvariantViolationException(c) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/MergeIntoCommandMetaShim.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/MergeIntoCommandMetaShim.scala deleted file mode 100644 index c6be44db37b..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/MergeIntoCommandMetaShim.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.delta.shims - -import com.databricks.sql.transaction.tahoe.commands.{MergeIntoCommand, MergeIntoCommandEdge} -import com.databricks.sql.transaction.tahoe.rapids.{GpuDeltaLog, GpuMergeIntoCommand} -import com.nvidia.spark.rapids.RapidsConf -import com.nvidia.spark.rapids.delta.{MergeIntoCommandEdgeMeta, MergeIntoCommandMeta} - -import org.apache.spark.sql.execution.command.RunnableCommand - -object MergeIntoCommandMetaShim { - def tagForGpu(meta: MergeIntoCommandMeta, mergeCmd: MergeIntoCommand): Unit = {} - def tagForGpu(meta: MergeIntoCommandEdgeMeta, mergeCmd: MergeIntoCommandEdge): Unit = {} - - def convertToGpu(mergeCmd: MergeIntoCommand, conf: RapidsConf): RunnableCommand = { - GpuMergeIntoCommand( - mergeCmd.source, - mergeCmd.target, - new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), - mergeCmd.condition, - mergeCmd.matchedClauses, - mergeCmd.notMatchedClauses, - mergeCmd.migratedSchema)(conf) - } - - def convertToGpu(mergeCmd: MergeIntoCommandEdge, conf: RapidsConf): RunnableCommand = { - GpuMergeIntoCommand( - mergeCmd.source, - mergeCmd.target, - new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), - mergeCmd.condition, - mergeCmd.matchedClauses, - mergeCmd.notMatchedClauses, - mergeCmd.migratedSchema)(conf) - } -} diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/MetadataShims.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/MetadataShims.scala deleted file mode 100644 index f722837778a..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/MetadataShims.scala +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.nvidia.spark.rapids.delta.shims - -import com.databricks.sql.transaction.tahoe.stats.UsesMetadataFields - -trait ShimUsesMetadataFields extends UsesMetadataFields diff --git a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/ShimDeltaUDF.scala b/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/ShimDeltaUDF.scala deleted file mode 100644 index 19c74616ee4..00000000000 --- a/delta-lake/delta-spark321db/src/main/scala/com/nvidia/spark/rapids/delta/shims/ShimDeltaUDF.scala +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.delta.shims - -import com.databricks.sql.transaction.tahoe.DeltaUDF - -import org.apache.spark.sql.expressions.UserDefinedFunction - -object ShimDeltaUDF { - def stringStringUdf(f: String => String): UserDefinedFunction = DeltaUDF.stringStringUdf(f) -} diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index f7b77f6f83b..c0a1135b629 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -1,6 +1,6 @@ - release321db - - - buildver - 321db - - - - 321db - - 3.4.4 - spark321db - ${spark321db.version} - ${spark321db.version} - 3.3.1 - true - 1.12.0 - - - shim-deps/databricks - delta-lake/delta-spark321db - - release330 @@ -722,8 +698,8 @@ spark${buildver} cuda11 ${cuda.version} - 24.02.1 - 24.02.1 + 24.04.0 + 24.04.1 2.12 2.8.0 incremental @@ -764,7 +740,6 @@ 3.2.0 3.2.1 3.2.1.3.2.7171000.0-3 - 3.2.1-databricks 3.2.2 3.2.3 3.2.4 @@ -782,7 +757,7 @@ 3.3.2-databricks 3.4.1-databricks 3.5.0 - 3.5.1-SNAPSHOT + 3.5.1 3.12.4 4.3.0 3.1.1 @@ -831,13 +806,12 @@ 340, 341, 342, - 350 + 350, + 351 - 351 - 321db, 330db, 332db, 341db @@ -885,10 +859,10 @@ 340, 341, 342, - 350 + 350, + 351 - 351 ${noSnapshotScala213.buildvers} diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml index da9ea2ccf23..599ba8c226f 100644 --- a/scala2.13/aggregator/pom.xml +++ b/scala2.13/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.02.0 + 24.04.0 ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.13 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.02.0 + 24.04.0 aggregator @@ -369,23 +369,6 @@ - - release321db - - - buildver - 321db - - - - - com.nvidia - rapids-4-spark-delta-spark321db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - release322 @@ -762,5 
+745,22 @@ + + release351 + + + buildver + 351 + + + + + com.nvidia + rapids-4-spark-delta-stub_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + diff --git a/scala2.13/api_validation/pom.xml b/scala2.13/api_validation/pom.xml index 20ee55d5843..bc031e59b7c 100644 --- a/scala2.13/api_validation/pom.xml +++ b/scala2.13/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.02.0 + 24.04.0 ../shim-deps/pom.xml rapids-4-spark-api-validation_2.13 - 24.02.0 + 24.04.0 api_validation diff --git a/scala2.13/datagen/pom.xml b/scala2.13/datagen/pom.xml index d95a6b98f4d..ef4ea7f5296 100644 --- a/scala2.13/datagen/pom.xml +++ b/scala2.13/datagen/pom.xml @@ -1,6 +1,6 @@ - - 4.0.0 - - - com.nvidia - rapids-4-spark-jdk-profiles_2.13 - 24.02.0 - ../../jdk-profiles/pom.xml - - - rapids-4-spark-delta-spark321db_2.13 - RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support - Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.02.0 - - - ../delta-lake/delta-spark321db - false - **/* - package - - - - - com.nvidia - rapids-4-spark-sql_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - provided - - - com.nvidia - rapids-4-spark-db-bom - ${project.version} - pom - provided - - - - - - - org.codehaus.mojo - build-helper-maven-plugin - - - add-common-sources - generate-sources - - add-source - - - - - - ${project.basedir}/../../${rapids.module}/../common/src/main/scala - ${project.basedir}/../../${rapids.module}/../common/src/main/databricks/scala - - - - - - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.rat - apache-rat-plugin - - - - diff --git a/scala2.13/delta-lake/delta-spark330db/pom.xml b/scala2.13/delta-lake/delta-spark330db/pom.xml index abc2ecce7b9..6b4264eb402 100644 --- a/scala2.13/delta-lake/delta-spark330db/pom.xml +++ b/scala2.13/delta-lake/delta-spark330db/pom.xml @@ -1,6 +1,6 @@ - release321db - - - buildver - 321db - - - - 321db - - 3.4.4 - spark321db - ${spark321db.version} - ${spark321db.version} - 3.3.1 - true - 1.12.0 - - - shim-deps/databricks - delta-lake/delta-spark321db - - release330 @@ -722,8 +698,8 @@ spark${buildver} cuda11 ${cuda.version} - 24.02.1 - 24.02.1 + 24.04.0 + 24.04.1 2.13 2.8.0 incremental @@ -764,7 +740,6 @@ 3.2.0 3.2.1 3.2.1.3.2.7171000.0-3 - 3.2.1-databricks 3.2.2 3.2.3 3.2.4 @@ -782,7 +757,7 @@ 3.3.2-databricks 3.4.1-databricks 3.5.0 - 3.5.1-SNAPSHOT + 3.5.1 3.12.4 4.3.0 3.1.1 @@ -831,13 +806,12 @@ 340, 341, 342, - 350 + 350, + 351 - 351 - 321db, 330db, 332db, 341db @@ -885,10 +859,10 @@ 340, 341, 342, - 350 + 350, + 351 - 351 ${noSnapshotScala213.buildvers} diff --git a/scala2.13/shim-deps/cloudera/pom.xml b/scala2.13/shim-deps/cloudera/pom.xml index 70c9bb7d07c..94be04cc3cd 100644 --- a/scala2.13/shim-deps/cloudera/pom.xml +++ b/scala2.13/shim-deps/cloudera/pom.xml @@ -1,6 +1,6 @@ + + 4.0.0 + + + com.nvidia + rapids-4-spark-jdk-profiles_2.13 + 24.04.0 + ../jdk-profiles/pom.xml + + rapids-4-spark-tools-support + pom + RAPIDS Accelerator for Apache Spark Tools Support + Supporting code for RAPIDS Accelerator tools + 24.04.0 + + + com.nvidia + rapids-4-spark-aggregator_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + compile + + + + + + ${project.basedir}/../../tools/generated_files/${buildver} + + none + + + + pre-merge + + ${buildver} + + + + + org.codehaus.mojo + exec-maven-plugin + + + if_modified_files + verify + + exec + + + bash + -c 'export MODIFIED=$(git status --porcelain | 
grep "^ M"); [[ -z $MODIFIED ]] && exit 0 || { echo -e "found modified files during mvn verify:\n$MODIFIED"; exit 1;}' + + + + + + + + + + + + org.apache.maven.plugins + maven-antrun-plugin + + + generate_tools_data + package + + run + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + com.nvidia + rapids-4-spark-aggregator_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + + + org.apache.curator + curator-recipes + + + + + org.apache.spark + spark-avro_${scala.binary.version} + ${spark.version} + + + + + + diff --git a/scala2.13/udf-compiler/pom.xml b/scala2.13/udf-compiler/pom.xml index 5ea5af1f15f..204272405d8 100644 --- a/scala2.13/udf-compiler/pom.xml +++ b/scala2.13/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.02.0 + 24.04.0 ../shim-deps/pom.xml rapids-4-spark-udf_2.13 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.02.0 + 24.04.0 udf-compiler diff --git a/shim-deps/cloudera/pom.xml b/shim-deps/cloudera/pom.xml index bfeef2c6e53..dfa99c1683d 100644 --- a/shim-deps/cloudera/pom.xml +++ b/shim-deps/cloudera/pom.xml @@ -1,6 +1,6 @@ + + 4.0.0 + + + com.nvidia + rapids-4-spark-jdk-profiles_2.12 + 24.04.0 + ../jdk-profiles/pom.xml + + rapids-4-spark-tools-support + pom + RAPIDS Accelerator for Apache Spark Tools Support + Supporting code for RAPIDS Accelerator tools + 24.04.0 + + + com.nvidia + rapids-4-spark-aggregator_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + compile + + + + + ${project.basedir}/generated_files/${buildver} + + + none + + + + pre-merge + + ${buildver} + + + + + org.codehaus.mojo + exec-maven-plugin + + + if_modified_files + verify + + exec + + + bash + -c 'export MODIFIED=$(git status --porcelain | grep "^ M"); [[ -z $MODIFIED ]] && exit 0 || { echo -e "found modified files during mvn verify:\n$MODIFIED"; exit 1;}' + + + + + + + + + + + + org.apache.maven.plugins + maven-antrun-plugin + + + generate_tools_data + package + + run + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + com.nvidia + rapids-4-spark-aggregator_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + + + org.apache.curator + curator-recipes + + + + + org.apache.spark + spark-avro_${scala.binary.version} + ${spark.version} + + + + + + diff --git a/udf-compiler/pom.xml b/udf-compiler/pom.xml index 34f0a75f07a..85aabf7dfd4 100644 --- a/udf-compiler/pom.xml +++ b/udf-compiler/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.02.0 + 24.04.0 ../shim-deps/pom.xml rapids-4-spark-udf_2.12 RAPIDS Accelerator for Apache Spark Scala UDF Plugin The RAPIDS Scala UDF plugin for Apache Spark - 24.02.0 + 24.04.0 udf-compiler