-
Notifications
You must be signed in to change notification settings - Fork 289
feat(datafusion): Support insert_into in IcebergTableProvider #1511
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4e12e6e
b756b34
e37d91a
39774df
f4a76dd
61bd43c
5c4145a
01dad31
638df22
0d1e202
f65bc65
fa1826e
5e9e7e7
6145dbe
caaa6e6
71d52ff
613f7d9
4dffe98
22d14bf
392ad1a
712ccd5
a141728
c68dda6
b28f15b
0b869a6
e252bf1
2c06cfa
3bf7511
f642bf0
c783ebf
b888cab
315d9a7
d6e3f37
950e5b3
42bf200
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
@@ -33,7 +33,7 @@ use super::{ | |
Datum, FormatVersion, ManifestContentType, PartitionSpec, PrimitiveType, Schema, Struct, | ||
UNASSIGNED_SEQUENCE_NUMBER, | ||
}; | ||
use crate::error::Result; | ||
use crate::error::{Error, ErrorKind, Result}; | ||
|
||
/// A manifest contains metadata and a list of entries. | ||
#[derive(Debug, PartialEq, Eq, Clone)] | ||
|
@@ -119,12 +119,45 @@ impl Manifest { | |
} | ||
} | ||
|
||
/// Serialize a DataFile to a JSON string. | ||
pub fn serialize_data_file_to_json( | ||
data_file: DataFile, | ||
partition_type: &super::StructType, | ||
format_version: FormatVersion, | ||
) -> Result<String> { | ||
let serde = _serde::DataFileSerde::try_from(data_file, partition_type, format_version)?; | ||
serde_json::to_string(&serde).map_err(|e| { | ||
Error::new( | ||
ErrorKind::DataInvalid, | ||
format!("Failed to serialize DataFile to JSON: {}", e), | ||
) | ||
}) | ||
} | ||
|
||
/// Deserialize a DataFile from a JSON string. | ||
pub fn deserialize_data_file_from_json( | ||
json: &str, | ||
partition_spec_id: i32, | ||
partition_type: &super::StructType, | ||
schema: &Schema, | ||
) -> Result<DataFile> { | ||
let serde = serde_json::from_str::<_serde::DataFileSerde>(json).map_err(|e| { | ||
Error::new( | ||
ErrorKind::DataInvalid, | ||
format!("Failed to deserialize JSON to DataFile: {}", e), | ||
) | ||
})?; | ||
|
||
serde.try_into(partition_spec_id, partition_type, schema) | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use std::collections::HashMap; | ||
use std::fs; | ||
use std::sync::Arc; | ||
|
||
use arrow_array::StringArray; | ||
use tempfile::TempDir; | ||
|
||
use super::*; | ||
|
@@ -1056,4 +1089,120 @@ mod tests { | |
assert!(!partitions[2].clone().contains_null); | ||
assert_eq!(partitions[2].clone().contains_nan, Some(false)); | ||
} | ||
|
||
#[test]
fn test_data_file_serialization() {
    // Round-trip test: DataFile -> JSON -> DataFile, verifying that the
    // serialized form carries the expected fields and that deserialization
    // restores the original properties.
    // NOTE(review): debug `println!` calls were removed per review feedback;
    // the StringArray conversion is now asserted instead of printed.

    // Create a simple two-column schema (id: long, name: string).
    let schema = Schema::builder()
        .with_schema_id(1)
        .with_identifier_field_ids(vec![1])
        .with_fields(vec![
            crate::spec::NestedField::required(1, "id", Type::Primitive(PrimitiveType::Long))
                .into(),
            crate::spec::NestedField::required(
                2,
                "name",
                Type::Primitive(PrimitiveType::String),
            )
            .into(),
        ])
        .build()
        .unwrap();

    // Create a partition spec with an identity transform on `id`.
    let partition_spec = PartitionSpec::builder(schema.clone())
        .with_spec_id(1)
        .add_partition_field("id", "id_partition", crate::spec::Transform::Identity)
        .unwrap()
        .build()
        .unwrap();

    // Get partition type from the partition spec.
    let partition_type = partition_spec.partition_type(&schema).unwrap();

    // Build two DataFile objects with distinct sizes/counts so we can tell
    // them apart after the round trip.
    let data_files = vec![
        DataFileBuilder::default()
            .content(crate::spec::DataContentType::Data)
            .file_format(DataFileFormat::Parquet)
            .file_path("path/to/file1.parquet".to_string())
            .file_size_in_bytes(1024)
            .record_count(100)
            .partition_spec_id(1)
            .partition(Struct::empty())
            .column_sizes(HashMap::from([(1, 512), (2, 512)]))
            .value_counts(HashMap::from([(1, 100), (2, 100)]))
            .null_value_counts(HashMap::from([(1, 0), (2, 0)]))
            .build()
            .unwrap(),
        DataFileBuilder::default()
            .content(crate::spec::DataContentType::Data)
            .file_format(DataFileFormat::Parquet)
            .file_path("path/to/file2.parquet".to_string())
            .file_size_in_bytes(2048)
            .record_count(200)
            .partition_spec_id(1)
            .partition(Struct::empty())
            .column_sizes(HashMap::from([(1, 1024), (2, 1024)]))
            .value_counts(HashMap::from([(1, 200), (2, 200)]))
            .null_value_counts(HashMap::from([(1, 10), (2, 5)]))
            .build()
            .unwrap(),
    ];

    // Serialize each DataFile to JSON.
    let serialized_files = data_files
        .into_iter()
        .map(|f| serialize_data_file_to_json(f, &partition_type, FormatVersion::V2).unwrap())
        .collect::<Vec<String>>();

    // Verify we have the expected number of serialized files.
    assert_eq!(serialized_files.len(), 2);

    // Verify each serialized file contains the expected fields.
    for json in &serialized_files {
        assert!(json.contains("path/to/file"));
        assert!(json.contains("parquet"));
        assert!(json.contains("record_count"));
        assert!(json.contains("file_size_in_bytes"));
    }

    // The serialized JSON strings must be representable as an Arrow
    // StringArray (how they travel through the DataFusion insert path).
    let string_array = StringArray::from(serialized_files.clone());
    assert_eq!(string_array.len(), 2);

    // Deserialize the JSON strings back into DataFile objects.
    let deserialized_files: Vec<DataFile> = serialized_files
        .into_iter()
        .map(|json| {
            deserialize_data_file_from_json(
                &json,
                partition_spec.spec_id(),
                &partition_type,
                &schema,
            )
            .unwrap()
        })
        .collect();

    // Verify we have the expected number of deserialized files.
    assert_eq!(deserialized_files.len(), 2);

    // Verify the deserialized files retain the original properties.
    for file in &deserialized_files {
        assert_eq!(file.content_type(), crate::spec::DataContentType::Data);
        assert_eq!(file.file_format(), DataFileFormat::Parquet);
        assert!(file.file_path().contains("path/to/file"));
        assert!(file.record_count() == 100 || file.record_count() == 200);
    }
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should use log here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is for testing only, and I'm planning to remove these log lines