feat(datafusion): Support insert_into in IcebergTableProvider #1511

Draft: wants to merge 35 commits into base: main

Changes from all commits (35 commits)

4e12e6e  Support Datafusion insert_into (CTTY, Jun 26, 2025)
b756b34  cleanup (CTTY, Jul 15, 2025)
e37d91a  minor (CTTY, Jul 15, 2025)
39774df  minor (CTTY, Jul 15, 2025)
f4a76dd  clippy ftw (CTTY, Jul 15, 2025)
61bd43c  minor (CTTY, Jul 16, 2025)
5c4145a  minor (CTTY, Jul 16, 2025)
01dad31  i luv cleaning up (CTTY, Jul 16, 2025)
638df22  fmt not working? (CTTY, Jul 16, 2025)
0d1e202  do not expose serde (CTTY, Jul 16, 2025)
f65bc65  cut it down (CTTY, Jul 16, 2025)
fa1826e  Use stricter wrapper data file wrapper (CTTY, Jul 16, 2025)
5e9e7e7  fix partitioning, and fmt ofc (CTTY, Jul 16, 2025)
6145dbe  minor (CTTY, Jul 17, 2025)
caaa6e6  partitioned shall not pass (CTTY, Jul 17, 2025)
71d52ff  implement children and with_new_children for write node, fix fmt (CTTY, Jul 17, 2025)
613f7d9  get row counts from data files directly (CTTY, Jul 17, 2025)
4dffe98  Update crates/integrations/datafusion/src/physical_plan/write.rs (CTTY, Jul 21, 2025)
22d14bf  Update crates/integrations/datafusion/src/physical_plan/commit.rs (CTTY, Jul 21, 2025)
392ad1a  fix fmt, input boundedness (CTTY, Jul 21, 2025)
712ccd5  make data_files constant (CTTY, Jul 21, 2025)
a141728  use format version when serde datafiles (CTTY, Jul 21, 2025)
c68dda6  use try_new instead (CTTY, Jul 21, 2025)
b28f15b  minor (CTTY, Jul 21, 2025)
0b869a6  coalesce partitions (CTTY, Jul 21, 2025)
e252bf1  minor (CTTY, Jul 21, 2025)
2c06cfa  fmt (CTTY, Jul 21, 2025)
3bf7511  rolling (CTTY, Jul 22, 2025)
f642bf0  rolling in the deep (CTTY, Jul 22, 2025)
c783ebf  rolls the unit tests (CTTY, Jul 22, 2025)
b888cab  could have it all for tests (CTTY, Jul 22, 2025)
315d9a7  new rolling (CTTY, Jul 29, 2025)
d6e3f37  rebase and clean up (CTTY, Jul 29, 2025)
950e5b3  uncomment catalog commit (CTTY, Jul 29, 2025)
42bf200  cleaner (CTTY, Jul 29, 2025)

1 change: 1 addition & 0 deletions Cargo.lock

3 changes: 3 additions & 0 deletions crates/iceberg/src/arrow/nan_val_cnt_visitor.rs
@@ -159,6 +159,9 @@ impl NanValueCountVisitor {
let arrow_arr_partner_accessor = ArrowArrayAccessor {};

let struct_arr = Arc::new(StructArray::from(batch)) as ArrayRef;
// todo remove these log lines
println!("----StructArray from record stream: {:?}", struct_arr);
println!("----Schema.as_struct from table: {:?}", schema.as_struct());
Comment on lines +163 to +164

Contributor:
We should use log here.

Contributor Author:
This is for testing only, and I'm planning to remove these log lines.
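
A minimal sketch of the reviewer's suggestion, assuming the `log` facade is (or would be) available to this crate; the helper name and its `Debug`-bound parameters are hypothetical stand-ins for the `struct_arr` and `schema.as_struct()` values built in the visitor:

```rust
// Hedged sketch: route the temporary diagnostics through the `log` facade so
// they can be filtered by level instead of always printing to stdout.
use std::fmt::Debug;

use log::debug;

fn trace_partner_inputs(struct_arr: &impl Debug, schema_struct: &impl Debug) {
    debug!("StructArray from record stream: {:?}", struct_arr);
    debug!("Schema.as_struct from table: {:?}", schema_struct);
}
```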

visit_struct_with_partner(
schema.as_struct(),
&struct_arr,
11 changes: 10 additions & 1 deletion crates/iceberg/src/arrow/value.rs
@@ -463,10 +463,19 @@ impl PartnerAccessor<ArrayRef> for ArrowArrayAccessor {
.map(|id| id == field.id)
.unwrap_or(false)
})
.or_else(|| {
struct_array
.fields()
.iter()
.position(|arrow_field| arrow_field.name().clone() == field.name)
})
.ok_or_else(|| {
Error::new(
ErrorKind::DataInvalid,
format!("Field id {} not found in struct array", field.id),
format!(
"Field with id={} or name={} not found in struct array",
field.id, field.name
),
)
})?;

14 changes: 9 additions & 5 deletions crates/iceberg/src/spec/manifest/_serde.rs
@@ -21,7 +21,7 @@ use serde_derive::{Deserialize, Serialize};
use serde_with::serde_as;

use super::{Datum, ManifestEntry, Schema, Struct};
use crate::spec::{Literal, RawLiteral, StructType, Type};
use crate::spec::{FormatVersion, Literal, RawLiteral, StructType, Type};
use crate::{Error, ErrorKind};

#[derive(Serialize, Deserialize)]
@@ -40,7 +40,7 @@ impl ManifestEntryV2 {
snapshot_id: value.snapshot_id,
sequence_number: value.sequence_number,
file_sequence_number: value.file_sequence_number,
data_file: DataFileSerde::try_from(value.data_file, partition_type, false)?,
data_file: DataFileSerde::try_from(value.data_file, partition_type, FormatVersion::V2)?,
})
}

@@ -74,7 +74,7 @@ impl ManifestEntryV1 {
Ok(Self {
status: value.status as i32,
snapshot_id: value.snapshot_id.unwrap_or_default(),
data_file: DataFileSerde::try_from(value.data_file, partition_type, true)?,
data_file: DataFileSerde::try_from(value.data_file, partition_type, FormatVersion::V1)?,
})
}

@@ -129,9 +129,13 @@ impl DataFileSerde {
pub fn try_from(
value: super::DataFile,
partition_type: &StructType,
is_version_1: bool,
format_version: FormatVersion,
) -> Result<Self, Error> {
let block_size_in_bytes = if is_version_1 { Some(0) } else { None };
let block_size_in_bytes = if format_version == FormatVersion::V1 {
Some(0)
} else {
None
};
Ok(Self {
content: value.content as i32,
file_path: value.file_path,
8 changes: 6 additions & 2 deletions crates/iceberg/src/spec/manifest/data_file.rs
@@ -297,8 +297,12 @@ pub fn write_data_files_to_avro<W: Write>(
let mut writer = AvroWriter::new(&avro_schema, writer);

for data_file in data_files {
let value = to_value(DataFileSerde::try_from(data_file, partition_type, true)?)?
.resolve(&avro_schema)?;
let value = to_value(DataFileSerde::try_from(
data_file,
partition_type,
FormatVersion::V1,
)?)?
.resolve(&avro_schema)?;
writer.append(value)?;
}

151 changes: 150 additions & 1 deletion crates/iceberg/src/spec/manifest/mod.rs
@@ -33,7 +33,7 @@ use super::{
Datum, FormatVersion, ManifestContentType, PartitionSpec, PrimitiveType, Schema, Struct,
UNASSIGNED_SEQUENCE_NUMBER,
};
use crate::error::Result;
use crate::error::{Error, ErrorKind, Result};

/// A manifest contains metadata and a list of entries.
#[derive(Debug, PartialEq, Eq, Clone)]
@@ -119,12 +119,45 @@ impl Manifest {
}
}

/// Serialize a DataFile to a JSON string.
pub fn serialize_data_file_to_json(
data_file: DataFile,
partition_type: &super::StructType,
format_version: FormatVersion,
) -> Result<String> {
let serde = _serde::DataFileSerde::try_from(data_file, partition_type, format_version)?;
serde_json::to_string(&serde).map_err(|e| {
Error::new(
ErrorKind::DataInvalid,
format!("Failed to serialize DataFile to JSON: {}", e),
)
})
}

/// Deserialize a DataFile from a JSON string.
pub fn deserialize_data_file_from_json(
json: &str,
partition_spec_id: i32,
partition_type: &super::StructType,
schema: &Schema,
) -> Result<DataFile> {
let serde = serde_json::from_str::<_serde::DataFileSerde>(json).map_err(|e| {
Error::new(
ErrorKind::DataInvalid,
format!("Failed to deserialize JSON to DataFile: {}", e),
)
})?;

serde.try_into(partition_spec_id, partition_type, schema)
}

#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::fs;
use std::sync::Arc;

use arrow_array::StringArray;
use tempfile::TempDir;

use super::*;
@@ -1056,4 +1089,120 @@ mod tests {
assert!(!partitions[2].clone().contains_null);
assert_eq!(partitions[2].clone().contains_nan, Some(false));
}

#[test]
fn test_data_file_serialization() {
// Create a simple schema
let schema = Schema::builder()
.with_schema_id(1)
.with_identifier_field_ids(vec![1])
.with_fields(vec![
crate::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
.into(),
crate::spec::NestedField::required(
2,
"name",
Type::Primitive(PrimitiveType::String),
)
.into(),
])
.build()
.unwrap();

// Create a partition spec
let partition_spec = PartitionSpec::builder(schema.clone())
.with_spec_id(1)
.add_partition_field("id", "id_partition", crate::spec::Transform::Identity)
.unwrap()
.build()
.unwrap();

// Get partition type from the partition spec
let partition_type = partition_spec.partition_type(&schema).unwrap();

// Create a vector of DataFile objects
let data_files = vec![
DataFileBuilder::default()
.content(crate::spec::DataContentType::Data)
.file_format(DataFileFormat::Parquet)
.file_path("path/to/file1.parquet".to_string())
.file_size_in_bytes(1024)
.record_count(100)
.partition_spec_id(1)
.partition(Struct::empty())
.column_sizes(HashMap::from([(1, 512), (2, 512)]))
.value_counts(HashMap::from([(1, 100), (2, 100)]))
.null_value_counts(HashMap::from([(1, 0), (2, 0)]))
.build()
.unwrap(),
DataFileBuilder::default()
.content(crate::spec::DataContentType::Data)
.file_format(DataFileFormat::Parquet)
.file_path("path/to/file2.parquet".to_string())
.file_size_in_bytes(2048)
.record_count(200)
.partition_spec_id(1)
.partition(Struct::empty())
.column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
.value_counts(HashMap::from([(1, 200), (2, 200)]))
.null_value_counts(HashMap::from([(1, 10), (2, 5)]))
.build()
.unwrap(),
];

// Serialize the DataFile objects
let serialized_files = data_files
.into_iter()
.map(|f| {
let json =
serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap();
println!("Test serialized data file: {}", json);
json
})
.collect::<Vec<String>>();

// Verify we have the expected number of serialized files
assert_eq!(serialized_files.len(), 2);

// Verify each serialized file contains expected data
for json in &serialized_files {
assert!(json.contains("path/to/file"));
Contributor:
nit: Why not assert the json output? We could use a snapshot test to make it easier, see https://docs.rs/expect-test/latest/expect_test/

Contributor Author:
I think a snapshot test makes more sense.

Contributor Author:
I've created a new PR to address the DataFileSerde-related changes separately.
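
For reference, a hedged sketch of the snapshot-style assertion being discussed, reusing the `serialized_files` vector built earlier in this test and assuming `expect-test` is available as a dev-dependency; the JSON inside `expect![[...]]` is a placeholder that `UPDATE_EXPECT=1 cargo test` would rewrite with the real output:

```rust
// Hedged sketch only: these lines would sit inside the test above; the literal
// below is a placeholder, not the actual serialized DataFile JSON.
use expect_test::expect;

let expected = expect![[r#"{"content":0,"file_path":"path/to/file1.parquet", ...}"#]];
expected.assert_eq(&serialized_files[0]);
```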

assert!(json.contains("parquet"));
assert!(json.contains("record_count"));
assert!(json.contains("file_size_in_bytes"));
}

// Convert Vec<String> to StringArray and print it
let string_array = StringArray::from(serialized_files.clone());
println!("StringArray: {:?}", string_array);

// Now deserialize the JSON strings back into DataFile objects
println!("\nDeserializing back to DataFile objects:");
let deserialized_files: Vec<DataFile> = serialized_files
.into_iter()
.map(|json| {
let data_file = deserialize_data_file_from_json(
&json,
partition_spec.spec_id(),
&partition_type,
&schema,
)
.unwrap();

println!("Deserialized DataFile: {:?}", data_file);
data_file
})
.collect();

// Verify we have the expected number of deserialized files
assert_eq!(deserialized_files.len(), 2);

// Verify the deserialized files have the expected properties
for file in &deserialized_files {
assert_eq!(file.content_type(), crate::spec::DataContentType::Data);
assert_eq!(file.file_format(), DataFileFormat::Parquet);
assert!(file.file_path().contains("path/to/file"));
assert!(file.record_count() == 100 || file.record_count() == 200);
}
}
}
7 changes: 7 additions & 0 deletions crates/iceberg/src/spec/table_metadata.rs
@@ -119,6 +119,13 @@ pub const PROPERTY_COMMIT_TOTAL_RETRY_TIME_MS: &str = "commit.retry.total-timeou
/// Default value for total maximum retry time (ms).
pub const PROPERTY_COMMIT_TOTAL_RETRY_TIME_MS_DEFAULT: u64 = 30 * 60 * 1000; // 30 minutes

/// Default file format for data files
pub const PROPERTY_DEFAULT_FILE_FORMAT: &str = "write.format.default";
/// Default file format for delete files
pub const PROPERTY_DELETE_DEFAULT_FILE_FORMAT: &str = "write.delete.format.default";
/// Default value for data file format
pub const PROPERTY_DEFAULT_FILE_FORMAT_DEFAULT: &str = "parquet";

/// Reference to [`TableMetadata`].
pub type TableMetadataRef = Arc<TableMetadata>;

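As a hedged illustration of how these new properties might be consumed when planning a write, the sketch below resolves the default data-file format from table properties; the helper is hypothetical and not part of this PR, and the `properties()` accessor and `DataFileFormat::from_str` are assumed to match the crate's existing API:

```rust
// Hypothetical helper, not part of this PR: pick the data file format from
// table properties, falling back to Parquet (PROPERTY_DEFAULT_FILE_FORMAT_DEFAULT).
use std::str::FromStr;

use iceberg::spec::{DataFileFormat, TableMetadata};

fn default_data_file_format(metadata: &TableMetadata) -> DataFileFormat {
    metadata
        .properties()
        .get("write.format.default") // PROPERTY_DEFAULT_FILE_FORMAT
        .and_then(|value| DataFileFormat::from_str(value).ok())
        .unwrap_or(DataFileFormat::Parquet)
}
```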
2 changes: 2 additions & 0 deletions crates/integrations/datafusion/Cargo.toml
@@ -34,7 +34,9 @@ async-trait = { workspace = true }
datafusion = { workspace = true }
futures = { workspace = true }
iceberg = { workspace = true }
parquet = { workspace = true }
tokio = { workspace = true }
uuid = { workspace = true }

[dev-dependencies]
expect-test = { workspace = true }