From 2fb0c5537761555c19b49323f2c03da803fa5a74 Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz Date: Thu, 4 Jun 2026 13:54:28 +0200 Subject: [PATCH 1/2] chore(flow-php/arrow-ext): bump arrow/parquet to 58.3 and ext-php-rs to 0.15.15 - handle new ArrayKey::ZendString variant in MAP key conversion - enable parquet simdutf8 feature - apply clippy/rustfmt idioms across parquet and stream modules - regenerate fixtures with distinct double values --- documentation/contributing/rust.md | 2 +- src/extension/arrow-ext/Cargo.lock | 85 +++--- src/extension/arrow-ext/Cargo.toml | 2 +- src/extension/arrow-ext/build.rs | 9 +- .../arrow-ext/examples/generate_fixtures.rs | 7 +- src/extension/arrow-ext/src/lib.rs | 4 + .../arrow-ext/src/parquet/exception.rs | 3 +- src/extension/arrow-ext/src/parquet/reader.rs | 49 ++-- .../arrow-ext/src/parquet/type_converter.rs | 273 +++++++++--------- src/extension/arrow-ext/src/parquet/writer.rs | 60 ++-- .../src/stream/php_destination_stream.rs | 14 +- .../arrow-ext/src/stream/php_source_stream.rs | 16 +- .../src/stream/random_access_file.rs | 3 +- .../tests/fixtures/all_flat_types.parquet | Bin 4837 -> 4837 bytes .../tests/fixtures/deeply_nested.parquet | Bin 1161 -> 1161 bytes .../arrow-ext/tests/fixtures/nested.parquet | Bin 2290 -> 2290 bytes .../arrow-ext/tests/fixtures/simple.parquet | Bin 492 -> 492 bytes .../tests/phpt/007_reader_all_flat_types.phpt | 2 +- 18 files changed, 260 insertions(+), 269 deletions(-) diff --git a/documentation/contributing/rust.md b/documentation/contributing/rust.md index 5f6eaab372..031e4ac81f 100644 --- a/documentation/contributing/rust.md +++ b/documentation/contributing/rust.md @@ -3,7 +3,7 @@ [TOC] This document describes how to develop the `arrow-ext` PHP extension, which is written in Rust -using the [ext-php-rs](https://github.com/nicedragons/ext-php-rs) framework. +using the [ext-php-rs](https://github.com/extphprs/ext-php-rs) framework. ## Overview diff --git a/src/extension/arrow-ext/Cargo.lock b/src/extension/arrow-ext/Cargo.lock index 838a0ae51c..3086ed4c43 100644 --- a/src/extension/arrow-ext/Cargo.lock +++ b/src/extension/arrow-ext/Cargo.lock @@ -86,9 +86,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e53796e07a6525edaf7dc28b540d477a934aff14af97967ad1d5550878969b9e" +checksum = "cfd33d3e92f207444098c75b42de99d329562be0cf686b307b097cc52b4e999e" dependencies = [ "ahash", "arrow-buffer", @@ -96,7 +96,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "num-complex", "num-integer", "num-traits", @@ -104,9 +104,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2c1a85bb2e94ee10b76531d8bc3ce9b7b4c0d508cabfb17d477f63f2617bd20" +checksum = "0c6cd424c2693bcdbc150d843dc9d4d137dd2de4782ce6df491ad11a3a0416c0" dependencies = [ "bytes", "half", @@ -116,9 +116,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "189d210bc4244c715fa3ed9e6e22864673cccb73d5da28c2723fb2e527329b33" +checksum = "3c88210023a2bfee1896af366309a3028fc3bcbd6515fa29a7990ee1baa08ee0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -129,9 +129,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7968c2e5210c41f4909b2ef76f6e05e172b99021c2def5edf3cc48fdd39d1d6c" +checksum = "238438f0834483703d88896db6fe5a7138b2230debc31b34c0336c2996e3c64f" dependencies = [ "arrow-array", "arrow-buffer", @@ -143,9 +143,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b47e0ca91cc438d2c7879fe95e0bca5329fff28649e30a88c6f760b1faeddcb" +checksum = "f633dbfdf39c039ada1bf9e34c694816eb71fbb7dc78f613993b7245e078a1ed" dependencies = [ "serde_core", "serde_json", @@ -153,9 +153,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "750a7d1dda177735f5e82a314485b6915c7cccdbb278262ac44090f4aba4a325" +checksum = "8cd065c54172ac787cf3f2f8d4107e0d3fdc26edba76fdf4f4cc170258942222" dependencies = [ "ahash", "arrow-array", @@ -331,6 +331,17 @@ dependencies = [ "inout", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "const-random" version = "0.1.18" @@ -519,9 +530,9 @@ dependencies = [ [[package]] name = "ext-php-rs" -version = "0.15.8" +version = "0.15.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cac21b7a62e6b23f4b14241cdda38ef8a321da093ac2cad3fd66775324baa20" +checksum = "abe62f25cd053f5f95dc7bbf94e5b41818ea3cf39b36cc25de8ca6b4de68a0eb" dependencies = [ "anyhow", "bitflags", @@ -541,13 +552,13 @@ dependencies = [ [[package]] name = "ext-php-rs-bindgen" -version = "0.72.1-extphprs.1" +version = "0.72.1-extphprs.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4795dd0976bd7d7d321c49e88e836f8e5b5b2b481e089067e303f2945617458a" +checksum = "32ceb73da3b8b18a3424765dd6926dbc29464a0a7bc31ab491600bfa02adc4bc" dependencies = [ "bitflags", "cexpr", - "ext-php-rs-clang-sys", + "clang-sys", "itertools", "log", "prettyplease", @@ -568,22 +579,11 @@ dependencies = [ "anyhow", ] -[[package]] -name = "ext-php-rs-clang-sys" -version = "1.8.1-extphprs.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa1ad6e482017d457d57d73691f8bed148a8a6198babe90830310c3308480a61" -dependencies = [ - "glob", - "libc", - "libloading", -] - [[package]] name = "ext-php-rs-derive" -version = "0.11.10" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eabe5d508fde240a60d9e4476e9dfab25d6320e32949a3102e509347460864" +checksum = "f57a075f878ad51bcc56745fb21efe601429a509204f8a385bbe526566597320" dependencies = [ "convert_case", "darling", @@ -728,6 +728,12 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + [[package]] name = "heck" version = "0.5.0" @@ -923,9 +929,9 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lz4_flex" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +checksum = "7ef0d4ed8669f8f8826eb00dc878084aa8f253506c4fd5e8f58f5bce72ddb97e" dependencies = [ "twox-hash", ] @@ -1115,9 +1121,9 @@ dependencies = [ [[package]] name = "parquet" -version = "58.0.0" +version = "58.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f491d0ef1b510194426ee67ddc18a9b747ef3c42050c19322a2cd2e1666c29b" +checksum = "5dafa7d01085b62a47dd0c1829550a0a36710ea9c4fe358a05a85477cec8a908" dependencies = [ "ahash", "arrow-array", @@ -1132,13 +1138,14 @@ dependencies = [ "chrono", "flate2", "half", - "hashbrown 0.16.1", + "hashbrown 0.17.1", "lz4_flex", "num-bigint", "num-integer", "num-traits", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "twox-hash", @@ -1466,6 +1473,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "skeptic" version = "0.13.7" diff --git a/src/extension/arrow-ext/Cargo.toml b/src/extension/arrow-ext/Cargo.toml index 6fc2a56549..4692d6af47 100644 --- a/src/extension/arrow-ext/Cargo.toml +++ b/src/extension/arrow-ext/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["cdylib"] [dependencies] ext-php-rs = "0.15" -parquet = { version = "58", default-features = false, features = ["arrow", "arrow_canonical_extension_types", "snap", "flate2", "flate2-zlib-rs", "zstd", "lz4", "brotli"] } +parquet = { version = "58", default-features = false, features = ["arrow", "arrow_canonical_extension_types", "snap", "flate2", "flate2-zlib-rs", "zstd", "lz4", "brotli", "simdutf8"] } arrow-array = "58" arrow-buffer = "58" arrow-schema = { version = "58", features = ["canonical_extension_types"] } diff --git a/src/extension/arrow-ext/build.rs b/src/extension/arrow-ext/build.rs index 9ba0b4468b..20174665f5 100644 --- a/src/extension/arrow-ext/build.rs +++ b/src/extension/arrow-ext/build.rs @@ -16,7 +16,8 @@ fn main() { println!("cargo:rustc-env=ARROW_VERSION={version}"); - let arrow_version = resolve_dep_version("arrow-schema").unwrap_or_else(|| "unknown".to_string()); + let arrow_version = + resolve_dep_version("arrow-schema").unwrap_or_else(|| "unknown".to_string()); let parquet_version = resolve_dep_version("parquet").unwrap_or_else(|| "unknown".to_string()); println!("cargo:rustc-env=ARROW_LIB_VERSION={arrow_version}"); @@ -32,7 +33,11 @@ fn resolve_dep_version(crate_name: &str) -> Option { for line in chunk.lines() { let line = line.trim(); if line.starts_with("version = ") { - return Some(line.trim_start_matches("version = ").trim_matches('"').to_string()); + return Some( + line.trim_start_matches("version = ") + .trim_matches('"') + .to_string(), + ); } } } diff --git a/src/extension/arrow-ext/examples/generate_fixtures.rs b/src/extension/arrow-ext/examples/generate_fixtures.rs index 036b3e17ed..b4492eab15 100644 --- a/src/extension/arrow-ext/examples/generate_fixtures.rs +++ b/src/extension/arrow-ext/examples/generate_fixtures.rs @@ -107,7 +107,7 @@ fn generate_all_flat_types() { let col_uint32 = UInt32Array::from(vec![Some(0u32), Some(4294967295u32), None]); let col_uint64 = UInt64Array::from(vec![Some(0u64), Some(u64::MAX), None]); let col_float = Float32Array::from(vec![Some(1.5f32), Some(-2.5f32), None]); - let col_double = Float64Array::from(vec![Some(3.14159f64), Some(-2.71828f64), None]); + let col_double = Float64Array::from(vec![Some(1.23456f64), Some(-7.89012f64), None]); let col_string = StringArray::from(vec![Some("hello"), Some("world"), None]); let col_binary: BinaryArray = vec![ @@ -317,10 +317,7 @@ fn generate_deeply_nested() { let scores_builder = ListBuilder::new(Int64Builder::new()); let struct_builder = StructBuilder::new( struct_fields, - vec![ - Box::new(StringBuilder::new()), - Box::new(scores_builder), - ], + vec![Box::new(StringBuilder::new()), Box::new(scores_builder)], ); let mut groups_builder = ListBuilder::new(struct_builder); diff --git a/src/extension/arrow-ext/src/lib.rs b/src/extension/arrow-ext/src/lib.rs index fa95521e6a..142ba71973 100644 --- a/src/extension/arrow-ext/src/lib.rs +++ b/src/extension/arrow-ext/src/lib.rs @@ -21,6 +21,10 @@ pub extern "C" fn php_module_info(_module: *mut ModuleEntry) { info_table_end!(); } +/// # Safety +/// +/// Invoked by the PHP/Zend engine during module startup. Must only be called by +/// the engine through the registered startup hook, never directly. pub unsafe extern "C" fn module_startup(_type: i32, _module_number: i32) -> i32 { if let Err(e) = stream::output_stream::register() { eprintln!("arrow: failed to register Flow\\Arrow\\OutputStream: {e}"); diff --git a/src/extension/arrow-ext/src/parquet/exception.rs b/src/extension/arrow-ext/src/parquet/exception.rs index 2f7607f147..2bf15073c3 100644 --- a/src/extension/arrow-ext/src/parquet/exception.rs +++ b/src/extension/arrow-ext/src/parquet/exception.rs @@ -17,8 +17,7 @@ pub fn register() -> Result<()> { } fn parquet_exception_ce() -> &'static ClassEntry { - ClassEntry::try_find("Flow\\Arrow\\Parquet\\Exception") - .unwrap_or_else(|| ce::exception()) + ClassEntry::try_find("Flow\\Arrow\\Parquet\\Exception").unwrap_or_else(|| ce::exception()) } pub fn parquet_exception(message: impl Into) -> PhpException { diff --git a/src/extension/arrow-ext/src/parquet/reader.rs b/src/extension/arrow-ext/src/parquet/reader.rs index 4e98b9402a..cc5ca44e93 100644 --- a/src/extension/arrow-ext/src/parquet/reader.rs +++ b/src/extension/arrow-ext/src/parquet/reader.rs @@ -2,7 +2,10 @@ use arrow_schema::DataType; use ext_php_rs::boxed::ZBox; use ext_php_rs::prelude::*; use ext_php_rs::types::{ZendHashTable, ZendObject}; -use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder}; +use parquet::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, + ParquetRecordBatchReaderBuilder, +}; use parquet::arrow::ProjectionMask; use crate::parquet::exception::parquet_exception; @@ -11,6 +14,7 @@ use crate::stream::php_source_stream::PhpSourceStream; #[php_class] #[php(name = "Flow\\Arrow\\Parquet\\Reader")] +#[derive(Default)] pub struct Reader { stream: Option, reader_metadata: Option, @@ -21,20 +25,6 @@ pub struct Reader { active_columns: Vec, } -impl Default for Reader { - fn default() -> Self { - Self { - stream: None, - reader_metadata: None, - num_row_groups: 0, - current_row_group: 0, - batch_size: None, - active_batch_reader: None, - active_columns: Vec::new(), - } - } -} - impl Reader { fn ensure_open(&self) -> PhpResult<(&PhpSourceStream, &ArrowReaderMetadata)> { match (&self.stream, &self.reader_metadata) { @@ -82,9 +72,8 @@ impl Reader { columns.iter().map(|_| Vec::new()).collect(); for batch_result in reader { - let batch = batch_result.map_err(|e| { - parquet_exception(format!("Failed to read batch: {}", e)) - })?; + let batch = batch_result + .map_err(|e| parquet_exception(format!("Failed to read batch: {}", e)))?; let schema = batch.schema(); for (idx, col_name) in columns.iter().enumerate() { @@ -115,7 +104,7 @@ impl Reader { .map_err(|_| parquet_exception("Failed to build result array"))?; } - Ok(Some(result.into())) + Ok(Some(result)) } } @@ -224,7 +213,7 @@ impl Reader { source: &mut ZendObject, options: Option<&ZendHashTable>, ) -> PhpResult { - let stream = PhpSourceStream::new(source).map_err(|e| parquet_exception(e))?; + let stream = PhpSourceStream::new(source).map_err(parquet_exception)?; let reader_metadata = ArrowReaderMetadata::load(&stream, ArrowReaderOptions::default()) .map_err(|e| parquet_exception(format!("Failed to open Parquet file: {}", e)))?; @@ -316,10 +305,7 @@ impl Reader { if let Some(ref mut batch_reader) = self.active_batch_reader { match batch_reader.next() { Some(Ok(batch)) => { - return Ok(Some(Self::batch_to_php( - &batch, - &self.active_columns, - )?)); + return Ok(Some(Self::batch_to_php(&batch, &self.active_columns)?)); } Some(Err(e)) => { self.active_batch_reader = None; @@ -375,7 +361,9 @@ impl Reader { .fields() .iter() .position(|f| f.name() == name) - .ok_or_else(|| parquet_exception(format!("Column '{}' not found in schema", name))) + .ok_or_else(|| { + parquet_exception(format!("Column '{}' not found in schema", name)) + }) }) .collect(); let indices = indices?; @@ -387,9 +375,9 @@ impl Reader { builder = builder.with_batch_size(bs); } - let reader = builder.build().map_err(|e| { - parquet_exception(format!("Failed to build batch reader: {}", e)) - })?; + let reader = builder + .build() + .map_err(|e| parquet_exception(format!("Failed to build batch reader: {}", e)))?; self.active_columns = selected_columns; @@ -397,10 +385,7 @@ impl Reader { self.active_batch_reader = Some(reader); continue; } else { - let result = Self::consume_all_batches( - reader, - &self.active_columns, - )?; + let result = Self::consume_all_batches(reader, &self.active_columns)?; self.current_row_group += 1; self.active_columns = Vec::new(); return Ok(result); diff --git a/src/extension/arrow-ext/src/parquet/type_converter.rs b/src/extension/arrow-ext/src/parquet/type_converter.rs index 75df51ee06..f857407595 100644 --- a/src/extension/arrow-ext/src/parquet/type_converter.rs +++ b/src/extension/arrow-ext/src/parquet/type_converter.rs @@ -1,12 +1,15 @@ use arrow_array::builder::{ - BooleanBuilder, Date32Builder, Decimal128Builder, FixedSizeBinaryBuilder, Float32Builder, - Float64Builder, Int16Builder, Int32Builder, Int64Builder, Int8Builder, StringBuilder, - BinaryBuilder, Time64MicrosecondBuilder, TimestampMicrosecondBuilder, UInt16Builder, + BinaryBuilder, BooleanBuilder, Date32Builder, Decimal128Builder, FixedSizeBinaryBuilder, + Float32Builder, Float64Builder, Int16Builder, Int32Builder, Int64Builder, Int8Builder, + StringBuilder, Time64MicrosecondBuilder, TimestampMicrosecondBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder, }; use arrow_array::*; use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit, extension::{Json, Uuid}}; +use arrow_schema::{ + extension::{Json, Uuid}, + DataType, Field, Fields, Schema, TimeUnit, +}; use ext_php_rs::convert::IntoZvalDyn; use ext_php_rs::prelude::*; use ext_php_rs::types::{ArrayKey, ZendHashTable, ZendObject, Zval}; @@ -64,17 +67,12 @@ fn create_date_interval(total_microseconds: i64) -> PhpResult { let mut obj = ZendObject::new(ce); obj.try_call_method("__construct", vec![&spec as &dyn IntoZvalDyn]) - .map_err(|e| { - parquet_exception(format!("Failed to construct DateInterval: {:?}", e)) - })?; + .map_err(|e| parquet_exception(format!("Failed to construct DateInterval: {:?}", e)))?; if remaining_us > 0 { let fraction = remaining_us as f64 / 1_000_000.0; obj.set_property("f", fraction).map_err(|e| { - parquet_exception(format!( - "Failed to set DateInterval microseconds: {:?}", - e - )) + parquet_exception(format!("Failed to set DateInterval microseconds: {:?}", e)) })?; } @@ -101,20 +99,24 @@ fn extract_datetime_timestamp(zv: &Zval, column_name: &str) -> Result<(i64, i64) let ts_zv = obj .try_call_method("getTimestamp", vec![]) .map_err(|e| format!("Column '{}': getTimestamp() failed: {:?}", column_name, e))?; - let seconds = ts_zv - .long() - .ok_or_else(|| format!("Column '{}': getTimestamp() did not return int", column_name))?; + let seconds = ts_zv.long().ok_or_else(|| { + format!( + "Column '{}': getTimestamp() did not return int", + column_name + ) + })?; let format_u = "u".to_string(); let us_zv = obj .try_call_method("format", vec![&format_u as &dyn IntoZvalDyn]) .map_err(|e| format!("Column '{}': format('u') failed: {:?}", column_name, e))?; - let us_str = us_zv - .str() - .ok_or_else(|| format!("Column '{}': format('u') did not return string", column_name))?; - let microseconds: i64 = us_str - .parse() - .unwrap_or(0); + let us_str = us_zv.str().ok_or_else(|| { + format!( + "Column '{}': format('u') did not return string", + column_name + ) + })?; + let microseconds: i64 = us_str.parse().unwrap_or(0); Ok((seconds, microseconds)) } @@ -134,18 +136,10 @@ fn extract_date_interval_microseconds(zv: &Zval, column_name: &str) -> Result("h") - .unwrap_or(0); - let i: i64 = obj - .get_property::("i") - .unwrap_or(0); - let s: i64 = obj - .get_property::("s") - .unwrap_or(0); - let f: f64 = obj - .get_property::("f") - .unwrap_or(0.0); + let h: i64 = obj.get_property::("h").unwrap_or(0); + let i: i64 = obj.get_property::("i").unwrap_or(0); + let s: i64 = obj.get_property::("s").unwrap_or(0); + let f: f64 = obj.get_property::("f").unwrap_or(0.0); Ok(h * 3_600_000_000 + i * 60_000_000 + s * 1_000_000 + (f * 1_000_000.0) as i64) } @@ -332,17 +326,20 @@ fn convert_single_arrow_value(array: &dyn Array, index: usize) -> PhpResult { - let arr = downcast_array::(array, "TimestampMillisecondArray")?; + let arr = + downcast_array::(array, "TimestampMillisecondArray")?; let v = arr.value(index); create_datetime_immutable(v / 1000, (v % 1000) * 1000) } DataType::Timestamp(TimeUnit::Microsecond, _) => { - let arr = downcast_array::(array, "TimestampMicrosecondArray")?; + let arr = + downcast_array::(array, "TimestampMicrosecondArray")?; let v = arr.value(index); create_datetime_immutable(v / 1_000_000, v % 1_000_000) } DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let arr = downcast_array::(array, "TimestampNanosecondArray")?; + let arr = + downcast_array::(array, "TimestampNanosecondArray")?; let v = arr.value(index); create_datetime_immutable(v / 1_000_000_000, (v / 1000) % 1_000_000) } @@ -365,8 +362,12 @@ fn convert_single_arrow_value(array: &dyn Array, index: usize) -> PhpResult { let arr = downcast_array::(array, "Decimal128Array")?; let str_val = arr.value_as_string(index); - let float_val: f64 = str_val.parse() - .map_err(|e| parquet_exception(format!("Failed to parse decimal value '{}': {}", str_val, e)))?; + let float_val: f64 = str_val.parse().map_err(|e| { + parquet_exception(format!( + "Failed to parse decimal value '{}': {}", + str_val, e + )) + })?; let mut zv = Zval::new(); zv.set_double(float_val); Ok(zv) @@ -433,12 +434,10 @@ fn convert_single_arrow_value(array: &dyn Array, index: usize) -> PhpResult { - Err(parquet_exception(format!( - "Unsupported Arrow type in nested conversion: {:?}", - other - ))) - } + other => Err(parquet_exception(format!( + "Unsupported Arrow type in nested conversion: {:?}", + other + ))), } } @@ -568,7 +567,7 @@ pub fn arrow_array_to_php_values(array: &dyn Array, field: Option<&Field>) -> Ph DataType::FixedSizeBinary(_) => { let arr = downcast_array::(array, "FixedSizeBinaryArray")?; - let is_uuid = field.map_or(false, |f| f.try_extension_type::().is_ok()); + let is_uuid = field.is_some_and(|f| f.try_extension_type::().is_ok()); let len = arr.len(); let mut result = Vec::with_capacity(len); for i in 0..len { @@ -608,15 +607,22 @@ pub fn arrow_array_to_php_values(array: &dyn Array, field: Option<&Field>) -> Ph convert_primitive_values(arr, |v| create_datetime_immutable(v, 0)) } DataType::Timestamp(TimeUnit::Millisecond, _) => { - let arr = downcast_array::(array, "TimestampMillisecondArray")?; - convert_primitive_values(arr, |v| create_datetime_immutable(v / 1000, (v % 1000) * 1000)) + let arr = + downcast_array::(array, "TimestampMillisecondArray")?; + convert_primitive_values(arr, |v| { + create_datetime_immutable(v / 1000, (v % 1000) * 1000) + }) } DataType::Timestamp(TimeUnit::Microsecond, _) => { - let arr = downcast_array::(array, "TimestampMicrosecondArray")?; - convert_primitive_values(arr, |v| create_datetime_immutable(v / 1_000_000, v % 1_000_000)) + let arr = + downcast_array::(array, "TimestampMicrosecondArray")?; + convert_primitive_values(arr, |v| { + create_datetime_immutable(v / 1_000_000, v % 1_000_000) + }) } DataType::Timestamp(TimeUnit::Nanosecond, _) => { - let arr = downcast_array::(array, "TimestampNanosecondArray")?; + let arr = + downcast_array::(array, "TimestampNanosecondArray")?; convert_primitive_values(arr, |v| { create_datetime_immutable(v / 1_000_000_000, (v / 1000) % 1_000_000) }) @@ -632,7 +638,7 @@ pub fn arrow_array_to_php_values(array: &dyn Array, field: Option<&Field>) -> Ph } DataType::Time64(TimeUnit::Microsecond) => { let arr = downcast_array::(array, "Time64MicrosecondArray")?; - convert_primitive_values(arr, |v| create_date_interval(v)) + convert_primitive_values(arr, create_date_interval) } DataType::Time64(TimeUnit::Nanosecond) => { let arr = downcast_array::(array, "Time64NanosecondArray")?; @@ -649,8 +655,12 @@ pub fn arrow_array_to_php_values(array: &dyn Array, field: Option<&Field>) -> Ph zv.set_null(); } else { let str_val = arr.value_as_string(i); - let float_val: f64 = str_val.parse() - .map_err(|e| parquet_exception(format!("Failed to parse decimal value '{}': {}", str_val, e)))?; + let float_val: f64 = str_val.parse().map_err(|e| { + parquet_exception(format!( + "Failed to parse decimal value '{}': {}", + str_val, e + )) + })?; zv.set_double(float_val); } result.push(zv); @@ -658,10 +668,7 @@ pub fn arrow_array_to_php_values(array: &dyn Array, field: Option<&Field>) -> Ph Ok(result) } - DataType::List(_) - | DataType::LargeList(_) - | DataType::Struct(_) - | DataType::Map(_, _) => { + DataType::List(_) | DataType::LargeList(_) | DataType::Struct(_) | DataType::Map(_, _) => { let len = array.len(); let mut result = Vec::with_capacity(len); for i in 0..len { @@ -670,12 +677,10 @@ pub fn arrow_array_to_php_values(array: &dyn Array, field: Option<&Field>) -> Ph Ok(result) } - other => { - Err(parquet_exception(format!( - "Unsupported Arrow array type: {:?}", - other - ))) - } + other => Err(parquet_exception(format!( + "Unsupported Arrow array type: {:?}", + other + ))), } } @@ -705,10 +710,10 @@ fn parse_decimal_string(s: &str, scale: i8) -> Result { return Err("Empty decimal string".into()); } - let (negative, digits_str) = if trimmed.starts_with('-') { - (true, &trimmed[1..]) - } else if trimmed.starts_with('+') { - (false, &trimmed[1..]) + let (negative, digits_str) = if let Some(stripped) = trimmed.strip_prefix('-') { + (true, stripped) + } else if let Some(stripped) = trimmed.strip_prefix('+') { + (false, stripped) } else { (false, trimmed) }; @@ -785,7 +790,9 @@ macro_rules! build_float_array { if converted.is_infinite() && !v.is_infinite() { return Err(format!( "Column '{}': value {} overflows {}", - $column_name, v, stringify!($cast_type) + $column_name, + v, + stringify!($cast_type) )); } builder.append_value(converted); @@ -862,15 +869,23 @@ fn php_schema_entry_to_field(entry_ht: &ZendHashTable) -> Result "UUID" => { let mut field = Field::new(&name, DataType::FixedSizeBinary(16), nullable); field.try_with_extension_type(Uuid).map_err(|e| { - format!("Column '{}': failed to set UUID extension type: {}", name, e) + format!( + "Column '{}': failed to set UUID extension type: {}", + name, e + ) })?; return Ok(field); } "JSON" => { let mut field = Field::new(&name, DataType::Utf8, nullable); - field.try_with_extension_type(Json::default()).map_err(|e| { - format!("Column '{}': failed to set JSON extension type: {}", name, e) - })?; + field + .try_with_extension_type(Json::default()) + .map_err(|e| { + format!( + "Column '{}': failed to set JSON extension type: {}", + name, e + ) + })?; return Ok(field); } "FIXED_SIZE_BINARY" => { @@ -889,15 +904,13 @@ fn php_schema_entry_to_field(entry_ht: &ZendHashTable) -> Result let children = entry_ht .get("children") .and_then(|z| z.array()) - .ok_or_else(|| { - format!("Column '{}': LIST type requires 'children' array", name) - })?; + .ok_or_else(|| format!("Column '{}': LIST type requires 'children' array", name))?; let child_zv = children.values().next().ok_or_else(|| { format!("Column '{}': LIST type requires exactly one child", name) })?; - let child_ht = child_zv.array().ok_or_else(|| { - format!("Column '{}': LIST child must be an array", name) - })?; + let child_ht = child_zv + .array() + .ok_or_else(|| format!("Column '{}': LIST child must be an array", name))?; let child_field = php_schema_entry_to_field(child_ht)?; DataType::List(Arc::new(child_field)) } @@ -923,9 +936,7 @@ fn php_schema_entry_to_field(entry_ht: &ZendHashTable) -> Result let children = entry_ht .get("children") .and_then(|z| z.array()) - .ok_or_else(|| { - format!("Column '{}': MAP type requires 'children' array", name) - })?; + .ok_or_else(|| format!("Column '{}': MAP type requires 'children' array", name))?; let mut child_iter = children.values(); let key_zv = child_iter.next().ok_or_else(|| { format!( @@ -939,28 +950,22 @@ fn php_schema_entry_to_field(entry_ht: &ZendHashTable) -> Result name ) })?; - let key_field = php_schema_entry_to_field( - key_zv.array().ok_or_else(|| { + let key_field = + php_schema_entry_to_field(key_zv.array().ok_or_else(|| { format!("Column '{}': MAP key child must be an array", name) - })?, - )?; - let val_field = php_schema_entry_to_field( - val_zv.array().ok_or_else(|| { + })?)?; + let val_field = + php_schema_entry_to_field(val_zv.array().ok_or_else(|| { format!("Column '{}': MAP value child must be an array", name) - })?, - )?; - let entries_struct = - DataType::Struct(Fields::from(vec![key_field, val_field])); + })?)?; + let entries_struct = DataType::Struct(Fields::from(vec![key_field, val_field])); DataType::Map( Arc::new(Field::new("entries", entries_struct, false)), false, ) } other => { - return Err(format!( - "Column '{}': Unsupported type '{}'", - name, other - )); + return Err(format!("Column '{}': Unsupported type '{}'", name, other)); } }; @@ -968,7 +973,7 @@ fn php_schema_entry_to_field(entry_ht: &ZendHashTable) -> Result } pub fn php_schema_to_arrow(schema: &ZendHashTable) -> Result { - if schema.len() == 0 { + if schema.is_empty() { return Err("Schema must have at least one column".into()); } @@ -1020,16 +1025,20 @@ pub fn php_array_to_arrow( if zv.is_null() { builder.append_null(); } else if let Some(v) = zv.long() { - let converted = u64::try_from(v).map_err(|_| { - format!( + let converted = + u64::try_from(v).map_err(|_| { + format!( "Column '{}': value {} out of range for u64 (valid range: {} to {})", column_name, v, u64::MIN, u64::MAX ) - })?; + })?; builder.append_value(converted); } else if let Some(s) = zv.str() { let parsed: u64 = s.parse().map_err(|e| { - format!("Column '{}': failed to parse '{}' as u64: {}", column_name, s, e) + format!( + "Column '{}': failed to parse '{}' as u64: {}", + column_name, s, e + ) })?; builder.append_value(parsed); } else { @@ -1096,10 +1105,7 @@ pub fn php_array_to_arrow( .map(|b| *b as char) .collect(); if hex.len() != 32 { - return Err(format!( - "Column '{}': invalid UUID string", - column_name - )); + return Err(format!("Column '{}': invalid UUID string", column_name)); } let mut uuid_bytes = [0u8; 16]; for i in 0..16 { @@ -1109,12 +1115,14 @@ pub fn php_array_to_arrow( })?; } builder - .append_value(&uuid_bytes) + .append_value(uuid_bytes) .map_err(|e| format!("Column '{}': {}", column_name, e))?; } else { return Err(format!( "Column '{}': expected {} bytes for FIXED_SIZE_BINARY, got {}", - column_name, n, bytes.len() + column_name, + n, + bytes.len() )); } } else { @@ -1276,9 +1284,7 @@ pub fn php_array_to_arrow( for field in fields.iter() { if !field.is_nullable() { let child_val = arr.get(field.name().as_str()); - if child_val.is_none() - || child_val.unwrap().is_null() - { + if child_val.is_none() || child_val.unwrap().is_null() { return Err(format!( "Column '{}': non-nullable field '{}' is missing/null at row {}", column_name, @@ -1291,8 +1297,7 @@ pub fn php_array_to_arrow( } } - let null_bits: Vec = - values.iter().map(|zv| !zv.is_null()).collect(); + let null_bits: Vec = values.iter().map(|zv| !zv.is_null()).collect(); let null_buffer = NullBuffer::from(null_bits); let child_arrays: Result, String> = fields @@ -1326,9 +1331,7 @@ pub fn php_array_to_arrow( } DataType::Map(entries_field, _) => { let (key_field, val_field) = match entries_field.data_type() { - DataType::Struct(fields) if fields.len() == 2 => { - (&fields[0], &fields[1]) - } + DataType::Struct(fields) if fields.len() == 2 => (&fields[0], &fields[1]), _ => { return Err(format!( "Column '{}': MAP entries must be Struct with key and value", @@ -1357,29 +1360,23 @@ pub fn php_array_to_arrow( })?; for (key, val) in arr.iter() { let mut key_zv = Zval::new(); + let set_key = |zv: &mut Zval, s: &str| { + zv.set_string(s, false).map_err(|_| { + format!("Column '{}': failed to set MAP key string", column_name) + }) + }; match key { - ArrayKey::Long(n) => { - key_zv.set_long(n); - } - ArrayKey::String(ref s) => { - key_zv - .set_string(s, false) - .map_err(|_| { - format!( - "Column '{}': failed to set MAP key string", - column_name - ) - })?; - } - ArrayKey::Str(s) => { - key_zv - .set_string(s, false) - .map_err(|_| { - format!( - "Column '{}': failed to set MAP key string", - column_name - ) - })?; + ArrayKey::Long(n) => key_zv.set_long(n), + ArrayKey::String(ref s) => set_key(&mut key_zv, s)?, + ArrayKey::Str(s) => set_key(&mut key_zv, s)?, + ArrayKey::ZendString(zs) => { + let s = zs.as_str().map_err(|_| { + format!( + "Column '{}': failed to read MAP key string", + column_name + ) + })?; + set_key(&mut key_zv, s)?; } } key_zvals.push(key_zv); @@ -1403,11 +1400,7 @@ pub fn php_array_to_arrow( let entries_struct = StructArray::try_new( Fields::from(vec![ - Field::new( - key_field.name(), - key_field.data_type().clone(), - false, - ), + Field::new(key_field.name(), key_field.data_type().clone(), false), Field::new( val_field.name(), val_field.data_type().clone(), diff --git a/src/extension/arrow-ext/src/parquet/writer.rs b/src/extension/arrow-ext/src/parquet/writer.rs index 0226aea64b..be26ae0578 100644 --- a/src/extension/arrow-ext/src/parquet/writer.rs +++ b/src/extension/arrow-ext/src/parquet/writer.rs @@ -16,20 +16,12 @@ use crate::stream::php_destination_stream::PhpDestinationStream; #[php_class] #[php(name = "Flow\\Arrow\\Parquet\\Writer")] +#[derive(Default)] pub struct Writer { writer: Option>, schema: Option>, } -impl Default for Writer { - fn default() -> Self { - Self { - writer: None, - schema: None, - } - } -} - fn parse_compression( s: &str, gzip_level: Option, @@ -183,40 +175,34 @@ impl Writer { compression: Option, options: Option<&ZendHashTable>, ) -> PhpResult { - let dest = PhpDestinationStream::new(stream).map_err(|e| parquet_exception(e))?; + let dest = PhpDestinationStream::new(stream).map_err(parquet_exception)?; let arrow_schema = - type_converter::php_schema_to_arrow(schema).map_err(|e| parquet_exception(e))?; + type_converter::php_schema_to_arrow(schema).map_err(parquet_exception)?; let compression_str = compression.as_deref().unwrap_or("SNAPPY"); let props = if let Some(opts) = options { - if opts.len() > 0 { + if !opts.is_empty() { let builder = WriterProperties::builder(); apply_writer_options(builder, opts, compression_str) - .map_err(|e| parquet_exception(e))? + .map_err(parquet_exception)? .build() } else { let codec = parse_compression(compression_str, None, None, None) - .map_err(|e| parquet_exception(e))?; - WriterProperties::builder() - .set_compression(codec) - .build() + .map_err(parquet_exception)?; + WriterProperties::builder().set_compression(codec).build() } } else { - let codec = parse_compression(compression_str, None, None, None) - .map_err(|e| parquet_exception(e))?; - WriterProperties::builder() - .set_compression(codec) - .build() + let codec = + parse_compression(compression_str, None, None, None).map_err(parquet_exception)?; + WriterProperties::builder().set_compression(codec).build() }; let schema_arc = Arc::new(arrow_schema); - let writer = - ArrowWriter::try_new(dest, schema_arc.clone(), Some(props)).map_err(|e| { - parquet_exception(format!("Failed to create Parquet writer: {}", e)) - })?; + let writer = ArrowWriter::try_new(dest, schema_arc.clone(), Some(props)) + .map_err(|e| parquet_exception(format!("Failed to create Parquet writer: {}", e)))?; Ok(Self { writer: Some(writer), @@ -249,19 +235,19 @@ impl Writer { let values: Vec<&ext_php_rs::types::Zval> = php_ht.values().collect(); - let array = type_converter::php_array_to_arrow(&values, field.data_type(), field.name()) - .map_err(|e| parquet_exception(e))?; + let array = + type_converter::php_array_to_arrow(&values, field.data_type(), field.name()) + .map_err(parquet_exception)?; columns.push(array); } - let record_batch = RecordBatch::try_new(schema, columns).map_err(|e| { - parquet_exception(format!("Failed to create record batch: {}", e)) - })?; + let record_batch = RecordBatch::try_new(schema, columns) + .map_err(|e| parquet_exception(format!("Failed to create record batch: {}", e)))?; - writer.write(&record_batch).map_err(|e| { - parquet_exception(format!("Failed to write batch: {}", e)) - })?; + writer + .write(&record_batch) + .map_err(|e| parquet_exception(format!("Failed to write batch: {}", e)))?; Ok(()) } @@ -274,9 +260,9 @@ impl Writer { self.schema = None; - writer.close().map_err(|e| { - parquet_exception(format!("Failed to close Parquet writer: {}", e)) - })?; + writer + .close() + .map_err(|e| parquet_exception(format!("Failed to close Parquet writer: {}", e)))?; Ok(()) } diff --git a/src/extension/arrow-ext/src/stream/php_destination_stream.rs b/src/extension/arrow-ext/src/stream/php_destination_stream.rs index 078a1ccd47..da8569c012 100644 --- a/src/extension/arrow-ext/src/stream/php_destination_stream.rs +++ b/src/extension/arrow-ext/src/stream/php_destination_stream.rs @@ -17,15 +17,18 @@ unsafe impl Sync for PhpDestinationStream {} impl PhpDestinationStream { pub fn new(obj: &mut ZendObject) -> Result { - let ce = ClassEntry::try_find("Flow\\Arrow\\OutputStream") - .ok_or_else(|| "Interface Flow\\Arrow\\OutputStream not loaded. Did you require the file?".to_string())?; + let ce = ClassEntry::try_find("Flow\\Arrow\\OutputStream").ok_or_else(|| { + "Interface Flow\\Arrow\\OutputStream not loaded. Did you require the file?".to_string() + })?; if !obj.instance_of(ce) { return Err("Destination must implement Flow\\Arrow\\OutputStream".to_string()); } let ptr: *mut ZendObject = obj; unsafe { (*ptr).gc.refcount += 1 }; - Ok(Self { obj: unsafe { ZBox::from_raw(ptr) } }) + Ok(Self { + obj: unsafe { ZBox::from_raw(ptr) }, + }) } } @@ -34,9 +37,10 @@ impl Write for PhpDestinationStream { let mut data = Zval::new(); data.set_binary(buf.to_vec()); - let _result = self.obj + let _result = self + .obj .try_call_method("append", vec![&data as &dyn IntoZvalDyn]) - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("Failed to call append(): {:?}", e)))?; + .map_err(|e| io::Error::other(format!("Failed to call append(): {:?}", e)))?; // OutputStream::append() is all-or-nothing: it returns self on success // or throws on failure (caught above). No partial write path exists. diff --git a/src/extension/arrow-ext/src/stream/php_source_stream.rs b/src/extension/arrow-ext/src/stream/php_source_stream.rs index 573619f215..5d070a3762 100644 --- a/src/extension/arrow-ext/src/stream/php_source_stream.rs +++ b/src/extension/arrow-ext/src/stream/php_source_stream.rs @@ -27,8 +27,10 @@ pub struct PhpSourceStream { impl PhpSourceStream { pub fn new(obj: &mut ZendObject) -> Result { - let ce = ClassEntry::try_find("Flow\\Arrow\\RandomAccessFile") - .ok_or_else(|| "Interface Flow\\Arrow\\RandomAccessFile not loaded. Did you require the file?".to_string())?; + let ce = ClassEntry::try_find("Flow\\Arrow\\RandomAccessFile").ok_or_else(|| { + "Interface Flow\\Arrow\\RandomAccessFile not loaded. Did you require the file?" + .to_string() + })?; if !obj.instance_of(ce) { return Err("Source must implement Flow\\Arrow\\RandomAccessFile".to_string()); } @@ -37,14 +39,16 @@ impl PhpSourceStream { unsafe { (*ptr).gc.refcount += 1 }; let owned = unsafe { ZBox::from_raw(ptr) }; - let zval = owned.try_call_method("size", vec![]) + let zval = owned + .try_call_method("size", vec![]) .map_err(|e| format!("Failed to call size(): {:?}", e))?; if zval.is_null() { return Err("size() returned null; file size is required for Parquet reading".into()); } - let size = zval.long() + let size = zval + .long() .ok_or_else(|| "size() must return an integer".to_string())?; Ok(Self { @@ -79,7 +83,9 @@ impl ChunkReader for PhpSourceStream { let length_arg = length as i64; let offset_arg = start as i64; - let result = self.inner.obj + let result = self + .inner + .obj .try_call_method( "read", vec![ diff --git a/src/extension/arrow-ext/src/stream/random_access_file.rs b/src/extension/arrow-ext/src/stream/random_access_file.rs index 57dbbaebc5..cfd7a5f899 100644 --- a/src/extension/arrow-ext/src/stream/random_access_file.rs +++ b/src/extension/arrow-ext/src/stream/random_access_file.rs @@ -18,8 +18,7 @@ pub fn register() -> Result<()> { MethodFlags::Public | MethodFlags::Abstract, ) .method( - FunctionBuilder::new_abstract("size") - .returns(DataType::Long, false, true), + FunctionBuilder::new_abstract("size").returns(DataType::Long, false, true), MethodFlags::Public | MethodFlags::Abstract, ) .register()?; diff --git a/src/extension/arrow-ext/tests/fixtures/all_flat_types.parquet b/src/extension/arrow-ext/tests/fixtures/all_flat_types.parquet index 7bfba26e4e47815c7fa57995b2100f02a19b1856..ba93367e4341d36463ea58b0a14079b5f46c3532 100644 GIT binary patch delta 130 zcmaE=`c!qpD@J|`qy9e!4}7+N!p}CZx?g_sK}NaF;!Ijh!cc(&k`f#+HJd|N*0E{8 wcoG~iC1Ok(a5V_g$$?zbn_amR+1ZTs3?%;uOg<A diff --git a/src/extension/arrow-ext/tests/fixtures/nested.parquet b/src/extension/arrow-ext/tests/fixtures/nested.parquet index 071dab1d492c328d2e2d01c213b6fcc0f502b115..47a6e60fbcc57be6ad350eca24527337fe22d364 100644 GIT binary patch delta 40 icmew)_(^cXB@P8+Jp;)c83qP4#8A%0z!2aVWC#GgQwIP5 delta 40 icmew)_(^cXB@P7xJp;)c83qP4#8A%0z!2aVWC#Gf^9KC@ diff --git a/src/extension/arrow-ext/tests/fixtures/simple.parquet b/src/extension/arrow-ext/tests/fixtures/simple.parquet index 7396c657051fb3f66e872ac93ac24418831ef292..2d3a7196ba42e4d778630bade769e5eb0ad8a273 100644 GIT binary patch delta 28 icmaFE{DyhM4Mq`TJp)M{83qO*Vi09yU- diff --git a/src/extension/arrow-ext/tests/phpt/007_reader_all_flat_types.phpt b/src/extension/arrow-ext/tests/phpt/007_reader_all_flat_types.phpt index c173c4cdf5..e14c36ccba 100644 --- a/src/extension/arrow-ext/tests/phpt/007_reader_all_flat_types.phpt +++ b/src/extension/arrow-ext/tests/phpt/007_reader_all_flat_types.phpt @@ -63,7 +63,7 @@ echo "float: "; var_dump(is_float($data['col_float'][0]) && is_float($data['col_float'][1]) && $data['col_float'][2] === null); echo "double: "; -var_dump($data['col_double'][0] === 3.14159 && $data['col_double'][1] === -2.71828 && $data['col_double'][2] === null); +var_dump($data['col_double'][0] === 1.23456 && $data['col_double'][1] === -7.89012 && $data['col_double'][2] === null); // String echo "string: "; From 15a8e16e0e889c05e11cb633e6a0a06c1b9114a7 Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz Date: Thu, 4 Jun 2026 17:37:53 +0200 Subject: [PATCH 2/2] fix(flow-php/arrow-ext): pin macOS clang to brew llvm for ext-php-rs build --- .github/workflows/job-arrow-extension.yml | 10 +++++++++- documentation/contributing/c.md | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/job-arrow-extension.yml b/.github/workflows/job-arrow-extension.yml index 3fc48abfde..d394c8cd7b 100644 --- a/.github/workflows/job-arrow-extension.yml +++ b/.github/workflows/job-arrow-extension.yml @@ -58,6 +58,11 @@ jobs: run: | brew install llvm echo "LIBCLANG_PATH=$(brew --prefix llvm)/lib" >> "$GITHUB_ENV" + # Pin the clang executable to the one shipped with the same llvm as + # libclang. Otherwise clang-sys (used by ext-php-rs's bindgen) falls + # back to `xcodebuild -find clang` and mixes Xcode's clang headers + # (e.g. arm_neon.h) with brew's newer libclang, which fails to compile. + echo "CLANG_PATH=$(brew --prefix llvm)/bin/clang" >> "$GITHUB_ENV" - name: Set LIBCLANG_PATH (Ubuntu) if: runner.os == 'Linux' @@ -147,6 +152,9 @@ jobs: run: | brew install autoconf automake libtool llvm echo "LIBCLANG_PATH=$(brew --prefix llvm)/lib" >> "$GITHUB_ENV" + # See the build job: keep clang and libclang from the same llvm so + # bindgen does not mix brew's libclang with Xcode's clang headers. + echo "CLANG_PATH=$(brew --prefix llvm)/bin/clang" >> "$GITHUB_ENV" - name: Set LIBCLANG_PATH (Ubuntu) if: runner.os == 'Linux' @@ -180,7 +188,7 @@ jobs: run: | sudo pie repository:remove packagist.org sudo pie repository:add path ${{ github.workspace }}/src/extension/arrow-ext - sudo env "PATH=$PATH" "LIBCLANG_PATH=$LIBCLANG_PATH" "RUSTUP_TOOLCHAIN=$RUSTUP_TOOLCHAIN" "CARGO_HOME=$CARGO_HOME" "RUSTUP_HOME=$RUSTUP_HOME" pie install flow-php/arrow-ext:0.0.9999@dev + sudo env "PATH=$PATH" "LIBCLANG_PATH=$LIBCLANG_PATH" ${CLANG_PATH:+"CLANG_PATH=$CLANG_PATH"} "RUSTUP_TOOLCHAIN=$RUSTUP_TOOLCHAIN" "CARGO_HOME=$CARGO_HOME" "RUSTUP_HOME=$RUSTUP_HOME" pie install flow-php/arrow-ext:0.0.9999@dev - name: Verify extension is loaded run: | diff --git a/documentation/contributing/c.md b/documentation/contributing/c.md index 7360a12236..ebc1ed95d3 100644 --- a/documentation/contributing/c.md +++ b/documentation/contributing/c.md @@ -87,10 +87,10 @@ nix-shell --arg with-pg-query-ext false --arg with-c true --run "cd src/extensio The extension is only half of the story. A given `libpg_query` version pins a specific PostgreSQL **grammar**, and that grammar is shared across three places that must always agree: -| Place | What it is | Used by | -|-------|------------|---------| -| `src/extension/pg-query-ext/vendor/libpg_query` (Makefile `PG_VERSION`) | the C library compiled into the extension | CI (`pie` build) | -| `.nix/pkgs/php-pg-query-ext/package.nix` (`libpg_query` `version`/`rev`/`hash`) | the C library compiled into the **nix dev shell** | local `nix-shell` | +| Place | What it is | Used by | +|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|-----------------------------------| +| `src/extension/pg-query-ext/vendor/libpg_query` (Makefile `PG_VERSION`) | the C library compiled into the extension | CI (`pie` build) | +| `.nix/pkgs/php-pg-query-ext/package.nix` (`libpg_query` `version`/`rev`/`hash`) | the C library compiled into the **nix dev shell** | local `nix-shell` | | `src/lib/postgresql/resources/proto/pg_query.proto` + the generated stubs in `src/lib/postgresql/src/Flow/PostgreSql/Protobuf/` | the protobuf schema the PHP side serializes/deserializes parse trees with | the `flow-php/postgresql` library | The PHP library round-trips parse trees as protobuf (`pg_query_parse_protobuf` → mutate → `pg_query_deparse`).