Skip to content

Commit

Permalink
remove stat field from remove actions
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastiantia committed Jan 14, 2025
1 parent 7b43b35 commit bd932d6
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 50 deletions.
7 changes: 0 additions & 7 deletions kernel/src/actions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -442,12 +442,6 @@ struct Remove {
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
pub(crate) size: Option<i64>,

/// Contains [statistics] (e.g., count, min/max values for columns) about the data in this logical file encoded as a JSON string.
///
/// [statistics]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Per-file-Statistics
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
pub(crate) stats: Option<String>,

/// Map containing metadata about this logical file.
#[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))]
pub(crate) tags: Option<HashMap<String, String>>,
Expand Down Expand Up @@ -639,7 +633,6 @@ mod tests {
StructField::new("extendedFileMetadata", DataType::BOOLEAN, true),
partition_values_field(),
StructField::new("size", DataType::LONG, true),
StructField::new("stats", DataType::STRING, true),
tags_field(),
deletion_vector_field(),
StructField::new("baseRowId", DataType::LONG, true),
Expand Down
47 changes: 5 additions & 42 deletions kernel/src/actions/visitors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ impl RemoveVisitor {
getters: &[&'a dyn GetData<'a>],
) -> DeltaResult<Remove> {
require!(
getters.len() == 15,
getters.len() == 14,
Error::InternalError(format!(
"Wrong number of RemoveVisitor getters: {}",
getters.len()
Expand All @@ -272,15 +272,13 @@ impl RemoveVisitor {

let size: Option<i64> = getters[5].get_opt(row_index, "remove.size")?;

let stats: Option<String> = getters[6].get_opt(row_index, "remove.stats")?;

// TODO(nick) tags are skipped in getters[7]
// TODO(nick) tags are skipped in getters[6]

let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..])?;
let deletion_vector = visit_deletion_vector_at(row_index, &getters[7..])?;

let base_row_id: Option<i64> = getters[13].get_opt(row_index, "remove.baseRowId")?;
let base_row_id: Option<i64> = getters[12].get_opt(row_index, "remove.baseRowId")?;
let default_row_commit_version: Option<i64> =
getters[14].get_opt(row_index, "remove.defaultRowCommitVersion")?;
getters[13].get_opt(row_index, "remove.defaultRowCommitVersion")?;

Ok(Remove {
path,
Expand All @@ -289,7 +287,6 @@ impl RemoveVisitor {
extended_file_metadata,
partition_values,
size,
stats,
tags: None,
deletion_vector,
base_row_id,
Expand Down Expand Up @@ -635,40 +632,6 @@ mod tests {
}
}

#[test]
fn test_parse_remove() {
let engine = SyncEngine::new();
let json_handler = engine.get_json_handler();
let json_strings: StringArray = vec![
r#"{"commitInfo":{"timestamp":1670892998177,"operation":"DELETE","operationParameters":{"mode":"Append"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#,
r#"{"remove":{"path":"part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"size":452,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#,
]
.into();
let output_schema = get_log_schema().clone();
let batch = json_handler
.parse_json(string_array_to_engine_data(json_strings), output_schema)
.unwrap();
let mut remove_visitor = RemoveVisitor::default();
remove_visitor.visit_rows_of(batch.as_ref()).unwrap();
let expected_remove: Remove = Remove {
path: "part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet".into(),
deletion_timestamp: Some(1670892998135),
data_change: true,
size: Some(452),
stats: Some("{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}".into()),
..Default::default()
};
assert_eq!(
remove_visitor.removes.len(),
1,
"Unexpected number of remove actions"
);
assert_eq!(
remove_visitor.removes[0], expected_remove,
"Unexpected remove action"
);
}

#[test]
fn test_parse_remove_partitioned() {
let engine = SyncEngine::new();
Expand Down
2 changes: 1 addition & 1 deletion kernel/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ pub struct StructType {
pub type_name: String,
/// The type of element stored in this array
// We use indexmap to preserve the order of fields as they are defined in the schema
// while also allowing for fast lookup by name. The alternative to do a linear search
// while also allowing for fast lookup by name. The alternative is to do a linear search
// for each field by name would be potentially quite expensive for large schemas.
pub fields: IndexMap<String, StructField>,
}
Expand Down

0 comments on commit bd932d6

Please sign in to comment.