fix: incomplete multi-part checkpoint handling when no hint is provided #641

Open · wants to merge 11 commits into base: main
139 changes: 116 additions & 23 deletions kernel/src/log_segment.rs
@@ -2,15 +2,15 @@
//! files.

use crate::actions::{get_log_schema, Metadata, Protocol, METADATA_NAME, PROTOCOL_NAME};
use crate::path::ParsedLogPath;
use crate::path::{LogPathFileType, ParsedLogPath};
use crate::schema::SchemaRef;
use crate::snapshot::CheckpointMetadata;
use crate::utils::require;
use crate::{
DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileSystemClient, Version,
};
use itertools::Itertools;
use std::cmp::Ordering;
use itertools::{process_results, Itertools};
use std::collections::HashMap;
use std::convert::identity;
use std::sync::{Arc, LazyLock};
use tracing::warn;
@@ -311,31 +311,71 @@ fn list_log_files_with_version(
// on config at some point
let mut commit_files = Vec::with_capacity(10);
let mut checkpoint_parts = vec![];
let mut max_checkpoint_version = start_version;

for parsed_path in list_log_files(fs_client, log_root, start_version, end_version)? {
let parsed_path = parsed_path?;
if parsed_path.is_commit() {
commit_files.push(parsed_path);
} else if parsed_path.is_checkpoint() {
let path_version = parsed_path.version;
match max_checkpoint_version {
None => {
checkpoint_parts.push(parsed_path);
max_checkpoint_version = Some(path_version);
let log_files = list_log_files(fs_client, log_root, start_version, end_version)?;

process_results(log_files, |iter| {
let log_files = iter.chunk_by(move |x| x.version);
for (version, files) in &log_files {
let mut new_checkpoint_parts = vec![];
for file in files {
if file.is_commit() {
commit_files.push(file);
} else if file.is_checkpoint() {
new_checkpoint_parts.push(file);
}
Some(checkpoint_version) => match path_version.cmp(&checkpoint_version) {
Ordering::Greater => {
max_checkpoint_version = Some(path_version);
checkpoint_parts.clear();
checkpoint_parts.push(parsed_path);
}

// Group checkpoint parts by the number of parts they have
let mut checkpoints = HashMap::new();
for part_file in new_checkpoint_parts {
use LogPathFileType::*;
match &part_file.file_type {
SinglePartCheckpoint
| UuidCheckpoint(_)
| MultiPartCheckpoint {
part_num: 1,
num_parts: 1,
} => {
// All single-file checkpoints are equivalent, just keep one
checkpoints.insert(1, vec![part_file]);
}
Ordering::Equal => checkpoint_parts.push(parsed_path),
Ordering::Less => {}
},
MultiPartCheckpoint {
part_num: 1,
num_parts,
} => {
// Start a new multi-part checkpoint with at least 2 parts
checkpoints.insert(*num_parts, vec![part_file]);
}
MultiPartCheckpoint {
part_num,
num_parts,
} => {
// Continue an existing multi-part checkpoint with at least 2 parts
if let Some(part_files) = checkpoints.get_mut(num_parts) {
if *part_num == 1 + part_files.len() as u32 {
// Safe to append because all previous parts exist
part_files.push(part_file);
}
}
}
Commit | CompactedCommit { .. } | Unknown => {} // invalid file type => do nothing
}
}

// Find a complete checkpoint (all parts exist)
if let Some((_, complete_checkpoint)) = checkpoints
.into_iter()
.find(|(num_parts, part_files)| part_files.len() as u32 == *num_parts)
{
// Validate the checkpoint before updating state
if validate_checkpoint_parts(version, &complete_checkpoint) {
Collaborator:

The code above already did this validation "in line" while populating the hashmap (a small chunk_by sketch follows this list):

  • The iterator chunking ensures that all parts come from the same commit version
  • The hash table (keyed by num_parts) ensures that all parts in a given Vec come from the same multi-part checkpoint
  • The length checking in the "continue" case ensures the sequence of parts is gap-free
    • This check is redundant, because the ParsedLogPath constructor already ensures that (1..=num_parts).contains(part_num). There's no way for wrongly numbered parts to yield a seemingly-complete but actually incomplete checkpoint.
    • The check also eliminates any duplicates, but duplicates shouldn't be possible if this is actually a listing result.
    • The check also verifies order, but that doesn't actually matter (all checkpoint parts are independent of each other)
  • The find ignores any incomplete checkpoint (any missing part would make the vec smaller than num_parts)
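
(For reference, a minimal sketch of the chunking behavior the first bullet relies on: Itertools::chunk_by groups only consecutive items with equal keys, so the sorted listing is what guarantees all parts of a version land in one chunk. The toy values are invented.)

use itertools::Itertools;

fn main() {
    // Versions as they would appear in a sorted log listing.
    let versions = vec![1u64, 1, 2, 3, 3, 3];
    for (version, files) in &versions.into_iter().chunk_by(|v| *v) {
        println!("version {version}: {} file(s)", files.count());
    }
    // Prints:
    // version 1: 2 file(s)
    // version 2: 1 file(s)
    // version 3: 3 file(s)
}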

Collaborator Author:

Thanks for the breakdown; most of the validation in validate_checkpoint_parts is indeed redundant given the in-line validation you mentioned. For the sake of caution, I would still like to warn the user when encountering the scenarios currently covered by validate_checkpoint_parts.
To do this, I've sprinkled in logging for the scenarios below when grouping the checkpoints:

  • unsupported UUID checkpoints
  • invalid file types (commit, compacted, unknown)
  • multiple single-part checkpoints

In group_checkpoint_parts, when we encounter a multi-part checkpoint whose parts arrive out of order, we do not try to build a complete checkpoint with it (we do not add it to our hashmap), since we assume the parts should arrive in order. That can leave us with an incomplete checkpoint even though the remaining parts could still show up later in the iteration (see the example below). I've added logging for encountering an unexpected part number when grouping. We could additionally/alternatively do a second pass after grouping to find and warn about incomplete checkpoints, as I remember this was particularly important for this issue in general @OussamaSaoudi
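
(Illustrating with invented paths: for a 3-part checkpoint at version 5 whose part 2 is missing, part 3 arrives while the vec for num_parts=3 holds only one entry, so the length check rejects it and the checkpoint stays incomplete.)

_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet  <- part 1 starts the num_parts=3 entry
_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet  <- skipped: 3 != 1 + len() (part 2 never arrived)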

Collaborator:

I've sprinkled in logging for scenarios below when grouping the checkpoints:

  • unsupported UUID checkpoints
  • invalid file types (commit, compacted, unknown)
  • multiple single-part checkpoints

Re invalid file types and UUID checkpoints -- the is_checkpoint filter before this code should have already removed those. So any logging about UUID checkpoints would need to happen before that filtering, not here, which makes me question the utility of logging here for users' sake. The checks here are only really needed because rustc demands complete matches and it doesn't grok the concept that some cases could be statically impossible.

Meanwhile, why would multiple single-part checkpoints be a problem? The spec allows one classic checkpoint, one multi-part checkpoint with a single part, and any number of uuid checkpoints (though the latter should be filtered out long before now). Any of them should be equivalent.
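
(For concreteness, the three single-file checkpoint spellings that are all equivalent at a given version under the spec's naming scheme; the uuid is arbitrary:)

00000000000000000010.checkpoint.parquet                                        <- classic single-part
00000000000000000010.checkpoint.0000000001.0000000001.parquet                  <- multi-part with num_parts = 1
00000000000000000010.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet   <- uuid-named (v2)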

Collaborator Author:

Yea, the logging would only reveal errors if is_checkpoint did not behave correctly. Will summarize bullet points 1 & 2 with a single warn when the file type is neither commit nor checkpoint, in the filtering you mentioned.

I see, the spec indeed allows for single-part checkpoints and we are correctly handling multiple single-part checkpoints as well. Thanks for the heads-up.

Collaborator Author:

@zachschuermann curious as to your thoughts about doing an additional pass after grouping the multi-part checkpoints to find incomplete multi-part checkpoints, for the purpose of warning the user.

checkpoint_parts = complete_checkpoint;
commit_files.clear(); // Clear commit files once checkpoint is found
}
Collaborator:

This is a lot of code to get a complete checkpoint. I wonder if we should extract it into a helper:

/// Extracts a complete checkpoint from a list of checkpoint files
///
/// Explain the case where there could be incomplete checkpoints since that is not an obvious case.
fn get_complete_checkpoint(parts: &[ParsedLogPath]) -> Option<Vec<ParsedLogPath>> {
}

This makes the closure shorter/cleaner. It also lets us document the incomplete checkpoint case that @scovich brought up. It's not an obvious scenario, and I think it warrants some documentation.

Interested to hear your thoughts @zachschuermann
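
(A minimal sketch of what such a helper could look like. The name is OussamaSaoudi's suggestion; the body and the by-value HashMap parameter are assumptions for illustration, reusing the grouping already built above and the imports this diff adds.)

/// Extracts a complete checkpoint from the checkpoint parts grouped at one version.
///
/// A version can contain several candidate checkpoints (e.g. two multi-part
/// checkpoints with different part counts, one of them missing a part), so this
/// picks the first complete one found, if any.
fn get_complete_checkpoint(
    checkpoints: HashMap<u32, Vec<ParsedLogPath>>,
) -> Option<Vec<ParsedLogPath>> {
    checkpoints
        .into_iter()
        .find(|(num_parts, part_files)| part_files.len() as u32 == *num_parts)
        .map(|(_, part_files)| part_files)
}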

Collaborator:

+1 for helper function, good idea. Based on the Rule of 30, you might even consider pulling out a sub-helper to do the grouping on behalf of the helper.

Collaborator:

Update: Based on all the other suggestions, I would recommend just one helper, to build the hashmap:

/// Groups all checkpoint parts according to the size of the checkpoint they belong to. 
///
/// NOTE: There could be a single-part and/or any number of uuid-based checkpoints. They
/// are all equivalent, and this routine keeps only one of them (arbitrarily chosen).
fn group_checkpoint_parts(parts: &[ParsedLogPath]) -> HashMap<u32, Vec<ParsedLogPath>>

}
}
}
})?;

Ok((commit_files, checkpoint_parts))
}
@@ -377,3 +417,56 @@ fn list_log_files_with_checkpoint(
}
Ok((commit_files, checkpoint_parts))
}

/// Validates that all the checkpoint parts belong to the same checkpoint version and that all parts
/// are present. Returns `true` if we have a complete checkpoint, `false` otherwise.
fn validate_checkpoint_parts(version: u64, checkpoint_parts: &[ParsedLogPath]) -> bool {
match checkpoint_parts.last().map(|file| &file.file_type) {
Some(LogPathFileType::MultiPartCheckpoint { num_parts, .. }) => {
if *num_parts as usize != checkpoint_parts.len() {
Collaborator:

I think it may be worth checking that:

  1. all the checkpoint parts are indeed of type MultiPartCheckpoint
  2. that the set of multi-part checkpoints has parts 0..n.

@zachschuermann what do you think?

Collaborator (@scovich, Jan 14, 2025):

nit: it's actually 1..=n. And yes, we should check. Also have to be careful because technically there could be two incomplete checkpoints with different num_parts for the same version. Also, we MUST accept at most one checkpoint -- even if multiple complete checkpoints are available -- so this function needs to filter, not just check.

Unfortunately, the poorly-chosen naming convention for multi-part checkpoint files means they interleave:

00000000000000000010.checkpoint.0000000001.0000000003.parquet
00000000000000000010.checkpoint.0000000001.0000000004.parquet
00000000000000000010.checkpoint.0000000002.0000000003.parquet
00000000000000000010.checkpoint.0000000002.0000000004.parquet
00000000000000000010.checkpoint.0000000003.0000000003.parquet
00000000000000000010.checkpoint.0000000003.0000000004.parquet
00000000000000000010.checkpoint.0000000004.0000000004.parquet

... which makes it a lot harder to identify the files of a given checkpoint and also means we can't just return a subslice in case there were multiple checkpoints to choose from.

We'd probably need to build a hash map keyed by number of parts:

let mut checkpoints = HashMap::new();
for part_file in checkpoint_parts {
    use LogPathFileType::*;
    match &part_file.file_type {
        SinglePartCheckpoint 
            | UuidCheckpoint(_) 
            | MultiPartCheckpoint { part_num: 1, num_parts: 1 } => 
        {
           // All single-file checkpoints are equivalent, just keep one
           checkpoints.insert(1, vec![part_file]);
        }
        MultiPartCheckpoint { part_num: 1, num_parts } => {
            // Start a new multi-part checkpoint with at least 2 parts
            checkpoints.insert(*num_parts, vec![part_file]);
        }
        MultiPartCheckpoint { part_num, num_parts } => {
            // Continue an existing multi-part checkpoint with at least 2 parts
            if let Some(part_files) = checkpoints.get_mut(num_parts) {
                if *part_num == 1 + part_files.len() as u32 {
                    // Safe to append because all previous parts exist
                    part_files.push(part_file);
                }
            }
        }
        Commit | CompactedCommit { .. } | Unknown => {} // invalid file type => do nothing
    } 
}
checkpoints
    .into_iter()
    .find(|(num_parts, part_files)| part_files.len() as u32 == *num_parts)
    .map_or(vec![], |(_, part_files)| part_files)

Collaborator:

this reminds me to use match statements to their full power in the future. Thx for the example Ryan!

Collaborator Author:

ah yes, did not consider the multiple incomplete checkpoints. I'll introduce tests to cover some of these scenarios. And thanks a lot for the example!

warn!(
"Found a multi-part checkpoint at version {}. Found {} parts, expected {}",
Collaborator:

Note: This would imply either duplicates or part numbers outside the checkpoint's part range. The former should be impossible for a correct listing, and the latter would produce an error in ParsedLogPath::try_from.

Collaborator:

Hm, considering we have this case and some below, I wonder if we should change the semantics so that the function returns a Result<bool>, which would let us return Error::internal for some of the unexpected cases?

Collaborator:

(maybe, but the whole function shouldn't even exist -- see other comment)

Collaborator Author:

Right, @scovich pointed out that much of the logging here is redundant. I've consolidated most of it into a single log emitted when an unknown file type is encountered while filtering the batched log_files. See the discussion at #641 (comment)

version,
checkpoint_parts.len(),
num_parts
);
return false;
}
}
Some(LogPathFileType::SinglePartCheckpoint) => {
if checkpoint_parts.len() != 1 {
Collaborator:

I'm wondering in what case this would happen?

Collaborator:

In uncareful grouping code, it could happen if there were also 1+ UuidCheckpoint instances and we blindly appended them all to the Vec for num_parts: 1 in the hash table. Because single-part checkpoints always compare lexically greater than uuid checkpoints, the single-part would come last and be found here.

But our code creates a new Vec every time it encounters a first part num, so that case shouldn't arise.
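
(Why the single-part would come last in that hypothetical: lexically, the uuid segment compares against the bare "parquet" extension, and a hex digit sorts before 'p'.)

00000000000000000010.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet   <- uuid sorts first ('3' < 'p')
00000000000000000010.checkpoint.parquet                                        <- single-part sorts last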

warn!(
"Found a single-part checkpoint at version {}. Found {} parts",
version,
checkpoint_parts.len()
);
return false;
}
}
Some(LogPathFileType::UuidCheckpoint(_)) => {
warn!(
"Found a UUID checkpoint at version {} when it is not supported",
version
);
return false;
}
Some(LogPathFileType::Commit) | Some(LogPathFileType::CompactedCommit { .. }) => {
warn!(
"Found a commit file at version {} when expecting a checkpoint",
version
);
return false;
}
Some(LogPathFileType::Unknown) => {
warn!(
"Found an unknown file type at version {} when expecting a checkpoint",
version
);
return false;
}
Collaborator:

This should never happen, and would imply that is_checkpoint is incorrect. Is this panic-worthy?

Collaborator:

I thought about that as well, but panics are exceptionally unfriendly to whatever engine embeds kernel.
At some point we have to rely on correctness of is_checkpoint?
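
(One possible middle ground, sketched here as an assumption rather than anything this PR adopts: fail loudly in debug builds via debug_assert!, and keep the warn-and-return-false fallback in release builds. A hypothetical rewrite of the Unknown arm:)

Some(LogPathFileType::Unknown) => {
    // Hypothetical: loud in debug builds, graceful in release builds.
    debug_assert!(false, "is_checkpoint() let a non-checkpoint file type through");
    warn!(
        "Found an unknown file type at version {} when expecting a checkpoint",
        version
    );
    return false;
}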

// No checkpoint parts
None => return false,
}

true
}
115 changes: 112 additions & 3 deletions kernel/src/log_segment/tests.rs
@@ -107,6 +107,73 @@ fn build_log_with_paths_and_checkpoint(
(Box::new(client), log_root)
}

#[test]
fn build_snapshot_with_unsupported_uuid_checkpoint() {
let (client, log_root) = build_log_with_paths_and_checkpoint(
&[
delta_path_for_version(0, "json"),
delta_path_for_version(1, "checkpoint.parquet"),
delta_path_for_version(2, "json"),
delta_path_for_version(3, "checkpoint.parquet"),
delta_path_for_version(4, "json"),
delta_path_for_version(5, "json"),
delta_path_for_version(5, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet"),
delta_path_for_version(6, "json"),
delta_path_for_version(7, "json"),
],
None,
);

let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap();
let commit_files = log_segment.ascending_commit_files;
let checkpoint_parts = log_segment.checkpoint_parts;

assert_eq!(checkpoint_parts.len(), 1);
assert_eq!(checkpoint_parts[0].version, 3);

let versions = commit_files.into_iter().map(|x| x.version).collect_vec();
let expected_versions = vec![4, 5, 6, 7];
assert_eq!(versions, expected_versions);
}

#[test]
fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() {
let (client, log_root) = build_log_with_paths_and_checkpoint(
&[
delta_path_for_version(0, "json"),
delta_path_for_multipart_checkpoint(1, 1, 3),
// Part 2 is missing!
delta_path_for_multipart_checkpoint(1, 3, 3),
delta_path_for_multipart_checkpoint(2, 1, 2),
// Part 2 is missing!
delta_path_for_version(2, "json"),
delta_path_for_multipart_checkpoint(3, 1, 3),
// Part 2 is missing!
delta_path_for_multipart_checkpoint(3, 3, 3),
delta_path_for_multipart_checkpoint(3, 1, 4),
delta_path_for_multipart_checkpoint(3, 2, 4),
delta_path_for_multipart_checkpoint(3, 3, 4),
delta_path_for_multipart_checkpoint(3, 4, 4),
delta_path_for_version(4, "json"),
delta_path_for_version(5, "json"),
delta_path_for_version(6, "json"),
delta_path_for_version(7, "json"),
],
None,
);

let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap();
let commit_files = log_segment.ascending_commit_files;
let checkpoint_parts = log_segment.checkpoint_parts;

assert_eq!(checkpoint_parts.len(), 4);
assert_eq!(checkpoint_parts[0].version, 3);

let versions = commit_files.into_iter().map(|x| x.version).collect_vec();
let expected_versions = vec![4, 5, 6, 7];
assert_eq!(versions, expected_versions);
}

#[test]
fn build_snapshot_with_out_of_date_last_checkpoint() {
let checkpoint_metadata = CheckpointMetadata {
@@ -257,11 +324,8 @@ fn build_snapshot_with_bad_checkpoint_hint_fails() {
assert!(log_segment.is_err())
}

#[ignore]
#[test]
fn build_snapshot_with_missing_checkpoint_part_no_hint() {
// TODO: Handle checkpoints correctly so that this test passes: https://github.com/delta-io/delta-kernel-rs/issues/497

// Part 2 of 3 is missing from checkpoint 5. The Snapshot should be made of checkpoint
// number 3 and commit files 4 to 7.
let (client, log_root) = build_log_with_paths_and_checkpoint(
@@ -296,6 +360,51 @@ fn build_snapshot_with_missing_checkpoint_part_no_hint() {
assert_eq!(versions, expected_versions);
}

#[test]
fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpoint() {
// When the _last_checkpoint is out of date and the most recent checkpoint is incomplete, the
// Snapshot should be made of the most recent complete checkpoint and the commit files that
// follow it.
let checkpoint_metadata = CheckpointMetadata {
version: 3,
size: 10,
parts: None,
size_in_bytes: None,
num_of_add_files: None,
checkpoint_schema: None,
checksum: None,
};

let (client, log_root) = build_log_with_paths_and_checkpoint(
&[
delta_path_for_version(0, "json"),
delta_path_for_version(1, "checkpoint.parquet"),
delta_path_for_version(2, "json"),
delta_path_for_version(3, "checkpoint.parquet"),
delta_path_for_version(4, "json"),
delta_path_for_multipart_checkpoint(5, 1, 3),
// Part 2 is missing!
delta_path_for_multipart_checkpoint(5, 3, 3),
delta_path_for_version(5, "json"),
delta_path_for_version(6, "json"),
delta_path_for_version(7, "json"),
],
Some(&checkpoint_metadata),
);

let log_segment =
LogSegment::for_snapshot(client.as_ref(), log_root, checkpoint_metadata, None).unwrap();
let commit_files = log_segment.ascending_commit_files;
let checkpoint_parts = log_segment.checkpoint_parts;

assert_eq!(checkpoint_parts.len(), 1);
assert_eq!(checkpoint_parts[0].version, 3);

let versions = commit_files.into_iter().map(|x| x.version).collect_vec();
let expected_versions = vec![4, 5, 6, 7];
assert_eq!(versions, expected_versions);
}

#[test]
fn build_snapshot_without_checkpoints() {
let (client, log_root) = build_log_with_paths_and_checkpoint(