From 64fe87edaa02dfdb128810e5eae782105a8115b1 Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Sun, 2 Mar 2025 14:47:17 -0500
Subject: [PATCH 1/7] first commit cleanup

---
 optd-datafusion/examples/demo.rs           |  28 ++++
 optd-datafusion/src/converter/from_optd.rs |   4 +-
 optd-datafusion/src/converter/into_optd.rs |   4 +-
 optd-datafusion/src/converter/mod.rs       |   4 +-
 optd-datafusion/src/lib.rs                 | 141 +++++++--------------
 optd-datafusion/src/optd_utils.rs          |  54 ++++++++
 6 files changed, 131 insertions(+), 104 deletions(-)
 create mode 100644 optd-datafusion/examples/demo.rs
 create mode 100644 optd-datafusion/src/optd_utils.rs

diff --git a/optd-datafusion/examples/demo.rs b/optd-datafusion/examples/demo.rs
new file mode 100644
index 0000000..4fb14f5
--- /dev/null
+++ b/optd-datafusion/examples/demo.rs
@@ -0,0 +1,28 @@
+use std::env;
+use std::error::Error;
+use std::fs;
+
+use optd_datafusion::run_queries;
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn Error>> {
+    let args: Vec<String> = env::args().collect();
+
+    if args.len() < 2 {
+        eprintln!("Usage: {} <file>.sql", args[0]);
+        return Ok(());
+    }
+
+    let file_path = &args[1];
+    let file = fs::read_to_string(file_path)?;
+
+    // Retrieve all of the SQL queries from the file.
+    let queries: Vec<&str> = file
+        .split(';')
+        .filter(|query| !query.trim().is_empty())
+        .collect();
+
+    run_queries(&queries).await?;
+
+    Ok(())
+}
diff --git a/optd-datafusion/src/converter/from_optd.rs b/optd-datafusion/src/converter/from_optd.rs
index e088a8c..ea64353 100644
--- a/optd-datafusion/src/converter/from_optd.rs
+++ b/optd-datafusion/src/converter/from_optd.rs
@@ -25,7 +25,7 @@ use super::OptdDFContext;

 impl OptdDFContext<'_> {
     #[async_recursion]
-    pub async fn conv_optd_to_df_relational(
+    pub(crate) async fn conv_optd_to_df_relational(
         &self,
         optimized_plan: &PhysicalPlan,
     ) -> anyhow::Result<Arc<dyn ExecutionPlan>> {
@@ -127,7 +127,7 @@ impl OptdDFContext<'_> {
         }
     }

-    pub fn conv_optd_to_df_scalar(
+    pub(crate) fn conv_optd_to_df_scalar(
         pred: &ScalarPlan,
         context: &SchemaRef,
     ) -> anyhow::Result<Arc<dyn PhysicalExpr>> {
diff --git a/optd-datafusion/src/converter/into_optd.rs b/optd-datafusion/src/converter/into_optd.rs
index 8460672..bceff54 100644
--- a/optd-datafusion/src/converter/into_optd.rs
+++ b/optd-datafusion/src/converter/into_optd.rs
@@ -26,7 +26,7 @@ use super::OptdDFContext;

 impl OptdDFContext<'_> {
     /// The col_offset is an offset added to the column index for all column references. It is useful for joins.
-    pub fn conv_df_to_optd_scalar(
+    pub(crate) fn conv_df_to_optd_scalar(
         df_expr: &Expr,
         context: &DFSchema,
         col_offset: usize,
@@ -97,7 +97,7 @@ impl OptdDFContext<'_> {
         }
     }

-    pub fn conv_df_to_optd_relational(
+    pub(crate) fn conv_df_to_optd_relational(
         &mut self,
         df_logical_plan: &DFLogicalPlan,
     ) -> anyhow::Result<Arc<LogicalPlan>> {
diff --git a/optd-datafusion/src/converter/mod.rs b/optd-datafusion/src/converter/mod.rs
index d3b3e6c..6349f51 100644
--- a/optd-datafusion/src/converter/mod.rs
+++ b/optd-datafusion/src/converter/mod.rs
@@ -7,7 +7,7 @@ pub mod into_optd;

 /// A context for converting between optd and datafusion.
 /// The map is used to lookup table sources when converting TableScan operators from optd to datafusion.
-pub struct OptdDFContext<'a> {
+pub(crate) struct OptdDFContext<'a> {
     /// Maps table names to table sources.
     pub tables: HashMap<String, Arc<dyn TableSource>>,
     pub session_state: &'a SessionState,
@@ -23,7 +23,7 @@ impl OptdDFContext<'_> {
     /// # Returns
     ///
     /// An `OptdDFContext` containing an empty table map and the provided session state.
-    pub fn new(session_state: &SessionState) -> OptdDFContext {
+    pub(crate) fn new(session_state: &SessionState) -> OptdDFContext {
         OptdDFContext {
             tables: HashMap::new(),
             session_state,
diff --git a/optd-datafusion/src/lib.rs b/optd-datafusion/src/lib.rs
index 7193224..bc33ca2 100644
--- a/optd-datafusion/src/lib.rs
+++ b/optd-datafusion/src/lib.rs
@@ -3,132 +3,77 @@
 // Use of this source code is governed by an MIT-style license that can be found in the LICENSE file or at
 // https://opensource.org/licenses/MIT.

-#![allow(clippy::new_without_default)]
-use std::sync::Arc;
-
-use datafusion::catalog::{CatalogProviderList, MemoryCatalogProviderList};
-use datafusion::common::Result;
-use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
-use datafusion::execution::SessionStateBuilder;
-use datafusion::prelude::{SessionConfig, SessionContext};
-use planner::OptdOptimizer;
-use planner::OptdQueryPlanner;
-
 use datafusion::arrow::array::RecordBatch;
 use datafusion::arrow::util::pretty;
-use datafusion::physical_plan::ExecutionPlanProperties;
-use datafusion::physical_plan::Partitioning;
+use datafusion::common::Result;
+use datafusion::physical_plan::{ExecutionPlanProperties, Partitioning};
+use datafusion::prelude::*;
 use futures::StreamExt;
 use std::time::SystemTime;

-pub mod converter;
-pub mod planner;
-pub async fn run_queries(queries: String) -> Result<()> {
-    // Create a SessionContext with TPCH base tables
+mod converter;
+mod optd_utils;
+mod planner;

-    let session_config = SessionConfig::from_env()?.with_information_schema(true);
+/// Given a list of SQL queries, run them.
+pub async fn run_queries(queries: &[&str]) -> Result<()> {
+    // Create a default DataFusion `SessionConfig`.
+    let session_config = SessionConfig::new().with_information_schema(true);

-    let ctx = crate::create_df_context(Some(session_config.clone()), None, None)
+    // Create a DataFusion `SessionContext` that uses the `optd` optimizer to help create optimized
+    // `ExecutionPlan`s.
+    let ctx = optd_utils::create_optd_session(Some(session_config), None, None)
         .await
         .unwrap();

-    // Create a DataFrame with the input query
-    for query in queries.split(';') {
-        if query.trim().is_empty() {
-            continue;
-        }
+    // For each query, create and optimize a physical `ExecutionPlan` and then run it.
+    for (i, query) in queries.iter().enumerate() {
         let sql = ctx.sql(query).await?;
-        // Run our execution engine on the physical plan
-        let df_physical_plan = sql.clone().create_physical_plan().await?;
-        let plan = df_physical_plan.clone();
-        // println!("{:#?}", df_physical_plan.clone());
-        // let df_physical_plan = df_physical_plan.children()[0].clone();
-        let mut print_results: Vec<RecordBatch> = vec![];
+
+        // Start a timer to record optimization + execution time.
         let now = SystemTime::now();

-        // DataFusion execution nodes will output multiple streams that are partitioned by the following
-        // patterns, so just join them all into one stream
+        // Note that `create_physical_plan` here will call the `optd` optimizer.
+        let plan = sql.create_physical_plan().await?;
+
+        // DataFusion execution nodes will output multiple streams that are partitioned by the
+        // following patterns, so just join them all into one stream for now.
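+        // (The partition count is read off of the plan's declared output partitioning below.)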
         let partitions = match plan.output_partitioning() {
             Partitioning::RoundRobinBatch(c) => *c,
             Partitioning::Hash(_, h) => *h,
             Partitioning::UnknownPartitioning(p) => *p,
         };

-        // In a separate tokio task, send batches to the next operator over the `tx` channel, and make
-        // sure to make use of all of the partitions
+        // Queue the record batches so that print time does not affect the execution time record.
+        let mut record_batches: Vec<RecordBatch> = vec![];
+
+        // In a separate tokio task, send batches to the next operator over the `tx` channel, and
+        // make sure to use all of the partitions.
         for i in 0..partitions {
-            let batch_stream = plan.execute(i, Default::default()).unwrap();
+            // Can't make this `Result<Vec<RecordBatch>>` because `DataFusionError` is not `Default`.
+            let result_batches: Vec<Result<RecordBatch>> =
+                plan.execute(i, Default::default())?.collect().await;

-            let results: Vec<_> = batch_stream.collect().await;
-            for batch in results {
-                let batch = batch.unwrap();
+            for batch in result_batches {
+                let batch = batch?;
                 if batch.num_rows() == 0 {
                     continue;
                 }
-                print_results.push(batch);
+                record_batches.push(batch);
             }
         }

-        match now.elapsed() {
-            Ok(elapsed) => {
-                // it prints '2'
-                println!("Datafusion time in milliseconds: {}", elapsed.as_millis());
-            }
-            Err(e) => {
-                // an error occurred!
-                println!("Error: {e:?}");
-            }
-        }
+        let elapsed = now.elapsed().expect("Failed to get elapsed time");

-        print_results.into_iter().for_each(|batch| {
-            let pretty_results = pretty::pretty_format_batches(&[batch]).unwrap().to_string();
-            println!("{}", pretty_results);
-        });
+        // Pretty print the results.
+        let query_results = pretty::pretty_format_batches(&record_batches)
+            .expect("Unable to format query results")
+            .to_string();
+
+        println!("\n\nQuery {i} Results:");
+        println!("Execution time in milliseconds: {}", elapsed.as_millis());
+        println!("Query Results:\n{query_results}\n\n");
     }

-    Ok(())
-}
-
-/// Utility function to create a session context for datafusion + optd.
-pub async fn create_df_context(
-    session_config: Option<SessionConfig>,
-    runtime_env: Option<Arc<RuntimeEnv>>,
-    catalog: Option<Arc<dyn CatalogProviderList>>,
-) -> anyhow::Result<SessionContext> {
-    let mut session_config = match session_config {
-        Some(config) => config,
-        None => SessionConfig::from_env()?.with_information_schema(true),
-    };
-
-    // Disable Datafusion's heuristic rule based query optimizer
-    session_config.options_mut().optimizer.max_passes = 0;
-
-    let runtime_env = match runtime_env {
-        Some(runtime_env) => runtime_env,
-        None => Arc::new(RuntimeEnvBuilder::new().build()?),
-    };
-    let catalog = match catalog {
-        Some(catalog) => catalog,
-        None => Arc::new(MemoryCatalogProviderList::new()),
-    };
-
-    let mut builder = SessionStateBuilder::new()
-        .with_config(session_config)
-        .with_runtime_env(runtime_env)
-        .with_catalog_list(catalog.clone())
-        .with_default_features();
-
-    let optimizer = OptdOptimizer::new_in_memory().await?;
-    let planner = Arc::new(OptdQueryPlanner::new(optimizer));
-    // clean up optimizer rules so that we can plug in our own optimizer
-    builder = builder.with_optimizer_rules(vec![]);
-    builder = builder.with_physical_optimizer_rules(vec![]);
-
-    // use optd-bridge query planner
-    builder = builder.with_query_planner(planner);
-
-    let state = builder.build();
-    let ctx = SessionContext::new_with_state(state).enable_url_table();
-    ctx.refresh_catalogs().await?;
-    Ok(ctx)
+    Ok(())
 }
diff --git a/optd-datafusion/src/optd_utils.rs b/optd-datafusion/src/optd_utils.rs
new file mode 100644
index 0000000..f9b4429
--- /dev/null
+++ b/optd-datafusion/src/optd_utils.rs
@@ -0,0 +1,54 @@
+use datafusion::catalog::{CatalogProviderList, MemoryCatalogProviderList};
+use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
+use datafusion::execution::SessionStateBuilder;
+use datafusion::prelude::{SessionConfig, SessionContext};
+use std::sync::Arc;
+
+use crate::planner::OptdOptimizer;
+use crate::planner::OptdQueryPlanner;
+
+/// Utility function to create a session context for datafusion + optd.
+/// TODO docs.
+pub(crate) async fn create_optd_session(
+    session_config: Option<SessionConfig>,
+    runtime_env: Option<Arc<RuntimeEnv>>,
+    datafusion_catalog: Option<Arc<dyn CatalogProviderList>>,
+) -> anyhow::Result<SessionContext> {
+    let mut session_config = match session_config {
+        Some(config) => config,
+        None => SessionConfig::from_env()?.with_information_schema(true),
+    };
+
+    // Disable Datafusion's heuristic rule based query optimizer
+    session_config.options_mut().optimizer.max_passes = 0;
+
+    let runtime_env = match runtime_env {
+        Some(runtime_env) => runtime_env,
+        None => Arc::new(RuntimeEnvBuilder::new().build()?),
+    };
+
+    let catalog = match datafusion_catalog {
+        Some(catalog) => catalog,
+        None => Arc::new(MemoryCatalogProviderList::new()),
+    };
+
+    let mut builder = SessionStateBuilder::new()
+        .with_config(session_config)
+        .with_runtime_env(runtime_env)
+        .with_catalog_list(catalog.clone())
+        .with_default_features();
+
+    let optimizer = OptdOptimizer::new_in_memory().await?;
+    let planner = Arc::new(OptdQueryPlanner::new(optimizer));
+    // clean up optimizer rules so that we can plug in our own optimizer
+    builder = builder.with_optimizer_rules(vec![]);
+    builder = builder.with_physical_optimizer_rules(vec![]);
+
+    // use optd-bridge query planner
+    builder = builder.with_query_planner(planner);
+
+    let state = builder.build();
+    let ctx = SessionContext::new_with_state(state).enable_url_table();
+    ctx.refresh_catalogs().await?;
+    Ok(ctx)
+}

From 4a2d7d4c7b839e556aff96685fb3e32be692aefe Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Sun, 2 Mar 2025 15:05:48 -0500
Subject: [PATCH 2/7] cleanup utils + format

---
 optd-datafusion/sql/test_filter.sql | 37 ++++++++------
 optd-datafusion/sql/test_join.sql   | 59 +++++++++++++++++------
 optd-datafusion/sql/test_scan.sql   | 31 ++++++------
 optd-datafusion/src/lib.rs          |  5 +-
 optd-datafusion/src/optd_utils.rs   | 75 ++++++++++++++++-------------
 optd-datafusion/src/planner.rs      |  6 +--
 6 files changed, 131 insertions(+), 82 deletions(-)

diff --git a/optd-datafusion/sql/test_filter.sql b/optd-datafusion/sql/test_filter.sql
index 47bd7d1..f1e4ffd 100644
--- a/optd-datafusion/sql/test_filter.sql
+++ b/optd-datafusion/sql/test_filter.sql
@@ -1,24 +1,33 @@
-CREATE TABLE employees (
-    id BIGINT,
-    name TEXT,
-    department_id BIGINT
-);
+CREATE TABLE employees (id BIGINT, name TEXT, department_id BIGINT);

-CREATE TABLE departments (
-    id BIGINT,
-    department_name TEXT
-);
+CREATE TABLE departments (id BIGINT, department_name TEXT);

-INSERT INTO employees VALUES
+INSERT INTO
+    employees
+VALUES
     (1, 'Alice', 1),
     (2, 'Bob', 2),
     (3, 'Charlie', 1);

-INSERT INTO departments VALUES
+INSERT INTO
+    departments
+VALUES
     (1, 'Engineering'),
     (2, 'Marketing');

+explain
+SELECT
+    *
+FROM
+    employees
+WHERE
+    id = 2 + 1 - 1
+    and name = 'Bob';

-explain SELECT * FROM employees WHERE id = 2 + 1 - 1 and name = 'Bob';
-
-SELECT * FROM employees WHERE id = 2 + 1 - 1 and name = 'Bob';
+SELECT
+    *
+FROM
+    employees
+WHERE
+    id = 2 + 1 - 1
+    and name = 'Bob';
diff --git a/optd-datafusion/sql/test_join.sql b/optd-datafusion/sql/test_join.sql
index d908067..17a689d 100644
--- a/optd-datafusion/sql/test_join.sql
+++ b/optd-datafusion/sql/test_join.sql
@@ -1,24 +1,55 @@
-CREATE TABLE employees (
-    id INTEGER,
-    name TEXT,
-    department_id INTEGER
-);
+CREATE TABLE employees (id INTEGER, name TEXT, department_id INTEGER);

-CREATE TABLE departments (
-    id INTEGER,
-    department_name TEXT
-);
+CREATE TABLE departments (id INTEGER, department_name TEXT);

-INSERT INTO employees VALUES
+INSERT INTO
+    employees
+VALUES
     (1, 'Alice', 1),
     (2, 'Bob', 2),
     (3, 'Charlie', 1);

-INSERT INTO departments VALUES
+INSERT INTO
+    departments
+VALUES
     (1, 'Engineering'),
     (2, 'Marketing');

+explain
+SELECT
+    *
+FROM
+    employees
+    INNER JOIN departments ON employees.department_id = departments.id
+where
+    (
+        NOT (
+            employees.name = 'Bob'
+            AND departments.department_name = 'Engineering'
+        )
+    )
+    AND (
+        NOT (
+            employees.name = 'Bob'
+            AND departments.department_name = 'Engineering'
+        )
+    );

-explain SELECT * FROM employees INNER JOIN departments ON employees.department_id = departments.id where (NOT (employees.name = 'Bob' AND departments.department_name = 'Engineering')) AND (NOT (employees.name = 'Bob' AND departments.department_name = 'Engineering'));
-
-SELECT * FROM employees INNER JOIN departments ON employees.department_id = departments.id where (NOT (employees.name = 'Bob' AND departments.department_name = 'Engineering')) AND (NOT (employees.name = 'Bob' AND departments.department_name = 'Engineering'));
\ No newline at end of file
+SELECT
+    *
+FROM
+    employees
+    INNER JOIN departments ON employees.department_id = departments.id
+where
+    (
+        NOT (
+            employees.name = 'Bob'
+            AND departments.department_name = 'Engineering'
+        )
+    )
+    AND (
+        NOT (
+            employees.name = 'Bob'
+            AND departments.department_name = 'Engineering'
+        )
+    );
diff --git a/optd-datafusion/sql/test_scan.sql b/optd-datafusion/sql/test_scan.sql
index 44533b2..bf9567b 100644
--- a/optd-datafusion/sql/test_scan.sql
+++ b/optd-datafusion/sql/test_scan.sql
@@ -1,24 +1,27 @@
-CREATE TABLE employees (
-    id INTEGER,
-    name TEXT,
-    department_id INTEGER
-);
+CREATE TABLE employees (id INTEGER, name TEXT, department_id INTEGER);

-CREATE TABLE departments (
-    id INTEGER,
-    department_name TEXT
-);
+CREATE TABLE departments (id INTEGER, department_name TEXT);

-INSERT INTO employees VALUES
+INSERT INTO
+    employees
+VALUES
    (1, 'Alice', 1),
    (2, 'Bob', 2),
    (3, 'Charlie', 1);

-INSERT INTO departments VALUES
+INSERT INTO
+    departments
+VALUES
    (1, 'Engineering'),
    (2, 'Marketing');

+explain
+SELECT
+    *
+FROM
+    employees;

-explain SELECT * FROM employees;
-
-SELECT * FROM employees;
+SELECT
+    *
+FROM
+    employees;
diff --git a/optd-datafusion/src/lib.rs b/optd-datafusion/src/lib.rs
index bc33ca2..9f6c5fd 100644
--- a/optd-datafusion/src/lib.rs
+++ b/optd-datafusion/src/lib.rs
@@ -69,8 +69,9 @@ pub async fn run_queries(queries: &[&str]) -> Result<()> {
         let query_results = pretty::pretty_format_batches(&record_batches)
             .expect("Unable to format query results")
             .to_string();
-
-        println!("\n\nQuery {i} Results:");
+
+        println!("\n\nQuery {i}:");
+        println!("{query}\n");
         println!("Execution time in milliseconds: {}", elapsed.as_millis());
         println!("Query Results:\n{query_results}\n\n");
     }
diff --git a/optd-datafusion/src/optd_utils.rs b/optd-datafusion/src/optd_utils.rs
index f9b4429..006d649 100644
--- a/optd-datafusion/src/optd_utils.rs
+++ b/optd-datafusion/src/optd_utils.rs
@@ -1,54 +1,61 @@
+use crate::planner::OptdOptimizer;
+use crate::planner::OptdQueryPlanner;
 use datafusion::catalog::{CatalogProviderList, MemoryCatalogProviderList};
+use datafusion::common::Result;
 use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
 use datafusion::execution::SessionStateBuilder;
 use datafusion::prelude::{SessionConfig, SessionContext};
 use std::sync::Arc;

-use crate::planner::OptdOptimizer;
-use crate::planner::OptdQueryPlanner;
-
-/// Utility function to create a session context for datafusion + optd.
-/// TODO docs.
+/// Creates a DataFusion `SessionContext` with the given optional parameters that uses `optd` as the
+/// query planner and disables any optimizations that DataFusion itself performs.
 pub(crate) async fn create_optd_session(
     session_config: Option<SessionConfig>,
     runtime_env: Option<Arc<RuntimeEnv>>,
     datafusion_catalog: Option<Arc<dyn CatalogProviderList>>,
-) -> anyhow::Result<SessionContext> {
-    let mut session_config = match session_config {
-        Some(config) => config,
-        None => SessionConfig::from_env()?.with_information_schema(true),
+) -> Result<SessionContext> {
+    // Use the provided session configuration or create one from the environment variables.
+    let session_config = {
+        let mut config = session_config
+            .unwrap_or_else(|| {
+                SessionConfig::from_env().expect("Failed to create session config from env")
+            })
+            .with_information_schema(true);
+
+        // Disable DataFusion's heuristic rule-based optimizer by setting the passes to 0.
+        config.options_mut().optimizer.max_passes = 0;
+        config
     };

-    // Disable Datafusion's heuristic rule based query optimizer
-    session_config.options_mut().optimizer.max_passes = 0;
+    // Use the provided runtime environment or create the default one.
+    let runtime_env =
+        runtime_env.unwrap_or_else(|| Arc::new(RuntimeEnvBuilder::new().build().unwrap()));

-    let runtime_env = match runtime_env {
-        Some(runtime_env) => runtime_env,
-        None => Arc::new(RuntimeEnvBuilder::new().build()?),
-    };
+    // Use the provided catalog or create a default one.
+    let datafusion_catalog =
+        datafusion_catalog.unwrap_or_else(|| Arc::new(MemoryCatalogProviderList::new()));

-    let catalog = match datafusion_catalog {
-        Some(catalog) => catalog,
-        None => Arc::new(MemoryCatalogProviderList::new()),
-    };
+    // Use the `optd` optimizer as the query planner instead of the default one.
+    let optimizer = OptdOptimizer::new_in_memory()
+        .await
+        .expect("TODO FIX ERROR HANDLING");
+    let planner = Arc::new(OptdQueryPlanner::new(optimizer));

-    let mut builder = SessionStateBuilder::new()
+    // Build up the state for the `SessionContext`. Removes all optimizer rules so that it
+    // completely relies on `optd`.
+    let session_state = SessionStateBuilder::new()
         .with_config(session_config)
         .with_runtime_env(runtime_env)
-        .with_catalog_list(catalog.clone())
-        .with_default_features();
-
-    let optimizer = OptdOptimizer::new_in_memory().await?;
-    let planner = Arc::new(OptdQueryPlanner::new(optimizer));
-    // clean up optimizer rules so that we can plug in our own optimizer
-    builder = builder.with_optimizer_rules(vec![]);
-    builder = builder.with_physical_optimizer_rules(vec![]);
-
-    // use optd-bridge query planner
-    builder = builder.with_query_planner(planner);
-
-    let state = builder.build();
-    let ctx = SessionContext::new_with_state(state).enable_url_table();
+        .with_catalog_list(datafusion_catalog.clone())
+        .with_default_features()
+        .with_optimizer_rules(vec![])
+        .with_physical_optimizer_rules(vec![])
+        .with_query_planner(planner)
+        .build();
+
+    // Create the `SessionContext` and refresh the catalogs to ensure everything is up-to-date.
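+    // (`enable_url_table` additionally lets queries reference files by path or URL as tables.)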
+    let ctx = SessionContext::new_with_state(session_state).enable_url_table();
     ctx.refresh_catalogs().await?;
+
     Ok(ctx)
 }
diff --git a/optd-datafusion/src/planner.rs b/optd-datafusion/src/planner.rs
index 450d79f..5e3895d 100644
--- a/optd-datafusion/src/planner.rs
+++ b/optd-datafusion/src/planner.rs
@@ -1,5 +1,4 @@
-use std::sync::Arc;
-
+use crate::converter::OptdDFContext;
 use anyhow::Ok;
 use async_trait::async_trait;
 use datafusion::{
@@ -14,8 +13,7 @@ use optd_core::{
     plans::{logical::LogicalPlan, physical::PhysicalPlan},
     storage::memo::SqliteMemo,
 };
-
-use crate::converter::OptdDFContext;
+use std::sync::Arc;

 /// A mock optimizer for testing purposes.
 #[derive(Debug)]

From b16749c2185a38d3d3c17d44e571d3404c72215d Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Sun, 2 Mar 2025 15:51:59 -0500
Subject: [PATCH 3/7] clean up mock optimizer

---
 optd-datafusion/src/lib.rs        |   2 +-
 optd-datafusion/src/mock.rs       | 148 ++++++++++++++++++++
 optd-datafusion/src/optd_utils.rs |  10 +-
 optd-datafusion/src/planner.rs    | 215 ------------------------
 4 files changed, 153 insertions(+), 222 deletions(-)
 create mode 100644 optd-datafusion/src/mock.rs
 delete mode 100644 optd-datafusion/src/planner.rs

diff --git a/optd-datafusion/src/lib.rs b/optd-datafusion/src/lib.rs
index 9f6c5fd..c91187b 100644
--- a/optd-datafusion/src/lib.rs
+++ b/optd-datafusion/src/lib.rs
@@ -12,8 +12,8 @@ use futures::StreamExt;
 use std::time::SystemTime;

 mod converter;
+mod mock;
 mod optd_utils;
-mod planner;
diff --git a/optd-datafusion/src/mock.rs b/optd-datafusion/src/mock.rs
new file mode 100644
index 0000000..f9e245b
--- /dev/null
+++ b/optd-datafusion/src/mock.rs
@@ -0,0 +1,148 @@
+use crate::converter::OptdDFContext;
+use async_trait::async_trait;
+use datafusion::{
+    common::Result as DataFusionResult,
+    execution::{context::QueryPlanner, SessionState},
+    logical_expr::{
+        Explain, LogicalPlan as DataFusionLogicalPlan, PlanType as DataFusionPlanType,
+        ToStringifiedPlan,
+    },
+    physical_plan::{displayable, explain::ExplainExec, ExecutionPlan},
+    physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner},
+};
+use optd_core::{
+    cascades,
+    plans::{logical::LogicalPlan, physical::PhysicalPlan},
+    storage::memo::SqliteMemo,
+};
+use std::sync::Arc;
+
+/// A mock `optd` optimizer.
+#[derive(Debug)]
+pub(crate) struct MockOptdOptimizer {
+    /// The memo table used for dynamic programming during query optimization.
+    memo: SqliteMemo,
+}
+
+impl MockOptdOptimizer {
+    /// Creates a new `optd` optimizer with an in-memory memo table.
+    pub async fn new_in_memory() -> anyhow::Result<Self> {
+        Ok(Self {
+            memo: SqliteMemo::new_in_memory().await?,
+        })
+    }
+
+    /// A mock optimization function for testing purposes.
+    ///
+    /// This function takes a [`LogicalPlan`], and for each node in the [`LogicalPlan`], it will
+    /// recursively traverse the node and its children and replace the node with a physical
+    /// operator. The physical operator is chosen based on the type of the logical operator.
+    ///
+    /// For example, if the logical operator is a scan, the physical operator will be a `TableScan`.
+    /// If the logical operator is a filter, the physical operator will be a `Filter`, and so on.
+    ///
+    /// The physical operators are chosen in a way that they mirror the structure of the logical
+    /// plan, but they are not actually optimized in any way.
+    /// This is useful for testing purposes, as it allows us to test the structure of the physical
+    /// plan without having to worry about the actual optimization process.
+    ///
+    /// This function returns a [`PhysicalPlan`], which is an `optd` struct that contains the root
+    /// node of the physical plan.
+    pub async fn mock_optimize(
+        &self,
+        logical_plan: &LogicalPlan,
+    ) -> anyhow::Result<Arc<PhysicalPlan>> {
+        let root_group_id = cascades::ingest_full_logical_plan(&self.memo, logical_plan).await?;
+        let goal_id = cascades::mock_optimize_relation_group(&self.memo, root_group_id).await?;
+        let optimized_plan = cascades::match_any_physical_plan(&self.memo, goal_id).await?;
+
+        Ok(optimized_plan)
+    }
+}
+
+#[async_trait]
+impl QueryPlanner for MockOptdOptimizer {
+    /// This function is the entry point for the physical planner. It will attempt to optimize the
+    /// given DataFusion [`DataFusionLogicalPlan`] using the `optd` optimizer.
+    ///
+    /// If the [`DataFusionLogicalPlan`] is a DML/DDL operation, it will fall back to the DataFusion planner.
+    ///
+    /// Otherwise, this function will convert the DataFusion [`DataFusionLogicalPlan`] into an
+    /// `optd` [`LogicalPlan`] in order to pass it to the `optd` optimizer.
+    ///
+    /// Once `optd` has finished optimization, it will convert the output `optd` [`PhysicalPlan`]
+    /// into a physical plan that can be executed by DataFusion ([`ExecutionPlan`]).
+    ///
+    /// # Arguments
+    /// * `datafusion_logical_plan` - The logical plan in DataFusion's type system to optimize.
+    /// * `session_state` - The session state to use for creating the physical plan.
+    ///
+    /// # Returns
+    /// * `anyhow::Result<Arc<dyn ExecutionPlan>>` - The physical plan that can be executed by
+    ///   DataFusion.
+    async fn create_physical_plan(
+        &self,
+        datafusion_logical_plan: &DataFusionLogicalPlan,
+        session_state: &SessionState,
+    ) -> DataFusionResult<Arc<dyn ExecutionPlan>> {
+        // Fallback to the default DataFusion planner for DML/DDL operations.
+        if let DataFusionLogicalPlan::Dml(_)
+        | DataFusionLogicalPlan::Ddl(_)
+        | DataFusionLogicalPlan::EmptyRelation(_) = datafusion_logical_plan
+        {
+            return DefaultPhysicalPlanner::default()
+                .create_physical_plan(datafusion_logical_plan, session_state)
+                .await;
+        }
+
+        let (datafusion_logical_plan, _verbose, mut explains) = match datafusion_logical_plan {
+            DataFusionLogicalPlan::Explain(Explain { plan, verbose, .. }) => {
+                (plan.as_ref(), *verbose, Some(Vec::new()))
+            }
+            _ => (datafusion_logical_plan, false, None),
+        };
+
+        if let Some(explains) = &mut explains {
+            explains.push(datafusion_logical_plan.to_stringified(
+                DataFusionPlanType::OptimizedLogicalPlan {
+                    optimizer_name: "datafusion".to_string(),
+                },
+            ));
+        }
+
+        let mut converter = OptdDFContext::new(session_state);
+
+        // convert the DataFusion logical plan to `optd`'s version of a `LogicalPlan`.
+        let logical_plan = converter
+            .conv_df_to_optd_relational(datafusion_logical_plan)
+            .expect("TODO FIX ERROR HANDLING");
+
+        // Run the `optd` optimizer on the `LogicalPlan`.
+        let optd_optimized_physical_plan = self
+            .mock_optimize(&logical_plan)
+            .await
+            .expect("TODO FIX ERROR HANDLING");
+
+        // Convert the output `optd` `PhysicalPlan` to DataFusion's `ExecutionPlan`.
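+        // (The same converter is reused here so that the table sources it gathered during the
+        // logical conversion above are available when rebuilding the scan operators.)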
+        let physical_plan = converter
+            .conv_optd_to_df_relational(&optd_optimized_physical_plan)
+            .await
+            .expect("TODO FIX ERROR HANDLING");
+
+        if let Some(mut explains) = explains {
+            explains.push(
+                displayable(&*physical_plan)
+                    .to_stringified(false, DataFusionPlanType::FinalPhysicalPlan),
+            );
+
+            return Ok(Arc::new(ExplainExec::new(
+                DataFusionLogicalPlan::explain_schema(),
+                explains,
+                true,
+            )));
+        }
+
+        Ok(physical_plan)
+    }
+}
diff --git a/optd-datafusion/src/optd_utils.rs b/optd-datafusion/src/optd_utils.rs
index 006d649..3c24cb3 100644
--- a/optd-datafusion/src/optd_utils.rs
+++ b/optd-datafusion/src/optd_utils.rs
@@ -1,5 +1,4 @@
-use crate::planner::OptdOptimizer;
-use crate::planner::OptdQueryPlanner;
+use crate::mock::MockOptdOptimizer;
 use datafusion::catalog::{CatalogProviderList, MemoryCatalogProviderList};
 use datafusion::common::Result;
 use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder};
@@ -22,7 +21,7 @@ pub(crate) async fn create_optd_session(
             })
             .with_information_schema(true);

-        // Disable Datafusion's heuristic rule-based optimizer by setting the passes to 0.
+        // Disable DataFusion's heuristic rule-based optimizer by setting the passes to 0.
         config.options_mut().optimizer.max_passes = 0;
         config
     };
@@ -36,10 +35,9 @@ pub(crate) async fn create_optd_session(
         datafusion_catalog.unwrap_or_else(|| Arc::new(MemoryCatalogProviderList::new()));

     // Use the `optd` optimizer as the query planner instead of the default one.
-    let optimizer = OptdOptimizer::new_in_memory()
+    let optimizer = MockOptdOptimizer::new_in_memory()
         .await
         .expect("TODO FIX ERROR HANDLING");
-    let planner = Arc::new(OptdQueryPlanner::new(optimizer));

     // Build up the state for the `SessionContext`. Removes all optimizer rules so that it
     // completely relies on `optd`.
@@ -50,7 +48,7 @@ pub(crate) async fn create_optd_session(
         .with_optimizer_rules(vec![])
         .with_physical_optimizer_rules(vec![])
-        .with_query_planner(planner)
+        .with_query_planner(Arc::new(optimizer))
         .build();

     // Create the `SessionContext` and refresh the catalogs to ensure everything is up-to-date.
diff --git a/optd-datafusion/src/planner.rs b/optd-datafusion/src/planner.rs
deleted file mode 100644
index 5e3895d..0000000
--- a/optd-datafusion/src/planner.rs
+++ /dev/null
@@ -1,215 +0,0 @@
-use crate::converter::OptdDFContext;
-use anyhow::Ok;
-use async_trait::async_trait;
-use datafusion::{
-    execution::{context::QueryPlanner, SessionState},
-    logical_expr::{
-        Explain, LogicalPlan as DFLogicalPlan, PlanType as DFPlanType, ToStringifiedPlan,
-    },
-    physical_plan::{displayable, explain::ExplainExec, ExecutionPlan},
-    physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner},
-};
-use optd_core::{
-    plans::{logical::LogicalPlan, physical::PhysicalPlan},
-    storage::memo::SqliteMemo,
-};
-use std::sync::Arc;
-
-/// A mock optimizer for testing purposes.
-#[derive(Debug)]
-pub struct OptdOptimizer {
-    memo: SqliteMemo,
-}
-
-impl OptdOptimizer {
-    pub async fn new_in_memory() -> anyhow::Result<Self> {
-        Ok(Self {
-            memo: SqliteMemo::new_in_memory().await?,
-        })
-    }
-
-    /// A mock optimization function for testing purposes.
-    ///
-    /// This function takes a logical plan, and for each node in the logical plan, it will
-    /// recursively traverse the node and its children and replace the node with a physical
-    /// operator. The physical operator is chosen based on the type of the logical operator.
-    /// For example, if the logical operator is a scan, the physical operator will be a
-    /// TableScan, if the logical operator is a filter, the physical operator will be a
-    /// Filter, and so on.
-    ///
-    /// The physical operators are chosen in a way that they mirror the structure of the
-    /// logical plan, but they are not actually optimized in any way. This is useful for
-    /// testing purposes, as it allows us to test the structure of the physical plan without
-    /// having to worry about the actual optimization process.
-    ///
-    /// The function returns a PhysicalPlan, which is a struct that contains the root node of
-    /// the physical plan.
-    ///
-    /// # Arguments
-    /// * `logical_plan` - The logical plan to optimize.
-    ///
-    /// # Returns
-    /// * `PhysicalPlan` - The optimized physical plan.
-    pub async fn mock_optimize(
-        &self,
-        logical_plan: &LogicalPlan,
-    ) -> anyhow::Result<Arc<PhysicalPlan>> {
-        let root_group_id =
-            optd_core::cascades::ingest_full_logical_plan(&self.memo, logical_plan).await?;
-        let goal_id =
-            optd_core::cascades::mock_optimize_relation_group(&self.memo, root_group_id).await?;
-        let optimized_plan =
-            optd_core::cascades::match_any_physical_plan(&self.memo, goal_id).await?;
-
-        Ok(optimized_plan)
-    }
-}
-
-/// A struct that implements the `QueryPlanner` trait for the `OptdQueryPlanner`.
-/// This trait is used to create a physical plan for a given logical plan.
-/// The physical plan is created by converting the logical plan to an optd logical plan,
-/// and then running the optd optimizer on the logical plan and then converting it back.
-/// This is the entry point for optd.
-#[derive(Debug)]
-pub struct OptdQueryPlanner {
-    pub optimizer: Arc<OptdOptimizer>,
-}
-
-impl OptdQueryPlanner {
-    /// Creates a new instance of `OptdQueryPlanner` with the given optimizer.
-    ///
-    /// The optimizer is cloned and stored in an `Arc` so that it can be safely shared
-    /// across threads.
-    ///
-    /// # Arguments
-    /// * `optimizer` - The optimizer to use for creating the physical plan.
-    ///
-    /// # Returns
-    /// * `OptdQueryPlanner` - A new instance of `OptdQueryPlanner` with the given optimizer.
-    pub fn new(optimizer: OptdOptimizer) -> Self {
-        Self {
-            optimizer: Arc::new(optimizer),
-        }
-    }
-
-    /// This function is the entry point for the physical planner. It will attempt
-    /// to optimize the logical plan using the optd optimizer. If the logical plan
-    /// is a DML/DDL operation, it will fall back to the datafusion planner.
-    ///
-    /// The steps of this function are the following:
-    ///
-    /// 1. Check if the logical plan is a DML/DDL operation. If it is, fall back
-    ///    to the datafusion planner.
-    /// 2. Convert the logical plan to an optd logical plan.
-    /// 3. Run the optd optimizer on the logical plan.
-    /// 4. Convert the physical plan to a physical plan that can be executed by
-    ///    datafusion.
-    ///
-    /// # Arguments
-    /// * `logical_plan` - The logical plan in Datafusion's type system to optimize.
-    /// * `session_state` - The session state to use for creating the physical plan.
-    ///
-    /// # Returns
-    /// * `anyhow::Result<Arc<dyn ExecutionPlan>>` - The physical plan that can be executed by
-    ///   datafusion.
-    async fn create_physical_plan_inner(
-        &self,
-        logical_plan: &DFLogicalPlan,
-        session_state: &SessionState,
-    ) -> anyhow::Result<Arc<dyn ExecutionPlan>> {
-        // Fallback to the datafusion planner for DML/DDL operations. optd cannot handle this.
-        if let DFLogicalPlan::Dml(_) | DFLogicalPlan::Ddl(_) | DFLogicalPlan::EmptyRelation(_) =
-            logical_plan
-        {
-            let planner = DefaultPhysicalPlanner::default();
-            return Ok(planner
-                .create_physical_plan(logical_plan, session_state)
-                .await?);
-        }
-
-        let (logical_plan, _verbose, mut explains) = match logical_plan {
-            DFLogicalPlan::Explain(Explain { plan, verbose, .. }) => {
-                (plan.as_ref(), *verbose, Some(Vec::new()))
-            }
-            _ => (logical_plan, false, None),
-        };
-
-        if let Some(explains) = &mut explains {
-            explains.push(
-                logical_plan.to_stringified(DFPlanType::OptimizedLogicalPlan {
-                    optimizer_name: "datafusion".to_string(),
-                }),
-            );
-        }
-
-        let mut converter = OptdDFContext::new(session_state);
-        // convert the logical plan to optd
-        let logical_plan = converter.conv_df_to_optd_relational(logical_plan)?;
-        // run the optd optimizer
-        let optd_optimized_physical_plan = self.optimizer.mock_optimize(&logical_plan).await?;
-        // convert the physical plan to optd
-        let physical_plan = converter
-            .conv_optd_to_df_relational(&optd_optimized_physical_plan)
-            .await
-            .map_err(|e| anyhow::anyhow!(e))?;
-
-        if let Some(explains) = &mut explains {
-            explains.push(
-                displayable(&*physical_plan).to_stringified(false, DFPlanType::FinalPhysicalPlan),
-            );
-        }
-
-        if let Some(explains) = explains {
-            Ok(Arc::new(ExplainExec::new(
-                DFLogicalPlan::explain_schema(),
-                explains,
-                true,
-            )))
-        } else {
-            Ok(physical_plan)
-        }
-    }
-}
-
-// making it `async_trait` only because datafusion is taking it.
-#[async_trait]
-impl QueryPlanner for OptdQueryPlanner {
-    /// This function is the entry point for the physical planner. It calls the inner function
-    /// `create_physical_plan_inner` to optimize the logical plan using the optd optimizer. If the logical plan
-    /// is a DML/DDL operation, it will fall back to the datafusion planner.
-    ///
-    /// The steps of this function are the following:
-    ///
-    /// 1. Check if the logical plan is a DML/DDL operation. If it is, fall back
-    ///    to the datafusion planner.
-    /// 2. Convert the logical plan to an optd logical plan.
-    /// 3. Run the optd optimizer on the logical plan.
-    /// 4. Convert the physical plan to a physical plan that can be executed by
-    ///    datafusion.
-    ///
-    /// # Arguments
-    /// * `datafusion_logical_plan` - The logical plan in Datafusion's type system to optimize.
-    /// * `session_state` - The session state to use for creating the physical plan.
-    ///
-    /// # Returns
-    /// * `datafusion::common::Result<Arc<dyn ExecutionPlan>>` - The physical plan that can be executed by
-    ///   datafusion.
- /// - /// Also see [`OptdQueryPlanner::create_physical_plan`] - async fn create_physical_plan( - &self, - datafusion_logical_plan: &DFLogicalPlan, - session_state: &SessionState, - ) -> datafusion::common::Result> { - self.create_physical_plan_inner(datafusion_logical_plan, session_state) - .await - .map_err(|x| { - datafusion::error::DataFusionError::Execution(format!( - "Failed to create physical plan: {:?}", - x - )) - }) - } -} From e3472f466b4035a5195e13ff91aa40a8601085f8 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Sun, 2 Mar 2025 15:52:21 -0500 Subject: [PATCH 4/7] remove optd-datafusion-cli --- Cargo.lock | 1748 +---------------- Cargo.toml | 2 +- optd-datafusion-cli/Cargo.toml | 89 - optd-datafusion-cli/Dockerfile | 38 - optd-datafusion-cli/README.md | 48 - .../examples/cli-session-context.rs | 92 - optd-datafusion-cli/src/catalog.rs | 365 ---- optd-datafusion-cli/src/cli_context.rs | 92 - optd-datafusion-cli/src/command.rs | 222 --- optd-datafusion-cli/src/exec.rs | 625 ------ optd-datafusion-cli/src/functions.rs | 457 ----- optd-datafusion-cli/src/helper.rs | 378 ---- optd-datafusion-cli/src/highlighter.rs | 127 -- optd-datafusion-cli/src/lib.rs | 31 - optd-datafusion-cli/src/main.rs | 446 ----- optd-datafusion-cli/src/object_storage.rs | 632 ------ optd-datafusion-cli/src/pool_type.rs | 48 - optd-datafusion-cli/src/print_format.rs | 691 ------- optd-datafusion-cli/src/print_options.rs | 170 -- optd-datafusion-cli/tests/cli_integration.rs | 57 - optd-datafusion-cli/tests/data/sql.txt | 1 - 21 files changed, 41 insertions(+), 6318 deletions(-) delete mode 100644 optd-datafusion-cli/Cargo.toml delete mode 100644 optd-datafusion-cli/Dockerfile delete mode 100644 optd-datafusion-cli/README.md delete mode 100644 optd-datafusion-cli/examples/cli-session-context.rs delete mode 100644 optd-datafusion-cli/src/catalog.rs delete mode 100644 optd-datafusion-cli/src/cli_context.rs delete mode 100644 optd-datafusion-cli/src/command.rs delete mode 100644 optd-datafusion-cli/src/exec.rs delete mode 100644 optd-datafusion-cli/src/functions.rs delete mode 100644 optd-datafusion-cli/src/helper.rs delete mode 100644 optd-datafusion-cli/src/highlighter.rs delete mode 100644 optd-datafusion-cli/src/lib.rs delete mode 100644 optd-datafusion-cli/src/main.rs delete mode 100644 optd-datafusion-cli/src/object_storage.rs delete mode 100644 optd-datafusion-cli/src/pool_type.rs delete mode 100644 optd-datafusion-cli/src/print_format.rs delete mode 100644 optd-datafusion-cli/src/print_options.rs delete mode 100644 optd-datafusion-cli/tests/cli_integration.rs delete mode 100644 optd-datafusion-cli/tests/data/sql.txt diff --git a/Cargo.lock b/Cargo.lock index a82b5ea..fd2d5dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,6 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" -[[package]] -name = "adler32" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" - [[package]] name = "ahash" version = "0.8.11" @@ -138,35 +132,6 @@ version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b964d184e89d9b6b67dd2715bc8e74cf3107fb2b529990c90cf517326150bf4" -[[package]] -name = "apache-avro" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13" -dependencies = [ - "bigdecimal", - "bzip2 0.4.4", - "crc32fast", - "digest", - "libflate", - "log", - "num-bigint", - "quad-rand", - "rand", - "regex-lite", - "serde", - "serde_bytes", - "serde_json", - "snap", - "strum", - "strum_macros", - "thiserror 1.0.69", - "typed-builder", - "uuid", - "xz2", - "zstd", -] - [[package]] name = "ariadne" version = "0.5.0" @@ -264,7 +229,7 @@ dependencies = [ "arrow-schema", "arrow-select", "atoi", - "base64 0.22.1", + "base64", "chrono", "comfy-table", "half", @@ -398,29 +363,13 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "assert_cmd" -version = "2.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1835b7f27878de8525dc71410b5a31cdcc5f230aed5ba5df968e09c201b23d" -dependencies = [ - "anstyle", - "bstr", - "doc-comment", - "libc", - "predicates", - "predicates-core", - "predicates-tree", - "wait-timeout", -] - [[package]] name = "async-compression" version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" dependencies = [ - "bzip2 0.5.1", + "bzip2", "flate2", "futures-core", "memchr", @@ -462,318 +411,12 @@ dependencies = [ "num-traits", ] -[[package]] -name = "atomic-waker" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" - [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" -[[package]] -name = "aws-config" -version = "1.5.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "490aa7465ee685b2ced076bb87ef654a47724a7844e2c7d3af4e749ce5b875dd" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-sdk-sso", - "aws-sdk-ssooidc", - "aws-sdk-sts", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "hex", - "http 0.2.12", - "ring", - "time", - "tokio", - "tracing", - "url", - "zeroize", -] - -[[package]] -name = "aws-credential-types" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" -dependencies = [ - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "zeroize", -] - -[[package]] -name = "aws-runtime" -version = "1.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76dd04d39cc12844c0994f2c9c5a6f5184c22e9188ec1ff723de41910a21dcad" -dependencies = [ - "aws-credential-types", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "http 0.2.12", - "http-body 0.4.6", - "once_cell", - "percent-encoding", - "pin-project-lite", - "tracing", - "uuid", -] - -[[package]] -name = "aws-sdk-sso" -version = "1.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60186fab60b24376d3e33b9ff0a43485f99efd470e3b75a9160c849741d63d56" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - 
"aws-types", - "bytes", - "http 0.2.12", - "once_cell", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sdk-ssooidc" -version = "1.61.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7033130ce1ee13e6018905b7b976c915963755aef299c1521897679d6cd4f8ef" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http 0.2.12", - "once_cell", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sdk-sts" -version = "1.61.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5c1cac7677179d622b4448b0d31bcb359185295dc6fca891920cfb17e2b5156" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-query", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "http 0.2.12", - "once_cell", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sigv4" -version = "1.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bfe75fad52793ce6dec0dc3d4b1f388f038b5eb866c8d4d7f3a8e21b5ea5051" -dependencies = [ - "aws-credential-types", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "form_urlencoded", - "hex", - "hmac", - "http 0.2.12", - "http 1.2.0", - "once_cell", - "percent-encoding", - "sha2", - "time", - "tracing", -] - -[[package]] -name = "aws-smithy-async" -version = "1.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa59d1327d8b5053c54bf2eaae63bf629ba9e904434d0835a28ed3c0ed0a614e" -dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "aws-smithy-http" -version = "0.60.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7809c27ad8da6a6a68c454e651d4962479e81472aa19ae99e59f9aba1f9713cc" -dependencies = [ - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http-body 0.4.6", - "once_cell", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tracing", -] - -[[package]] -name = "aws-smithy-json" -version = "0.61.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "623a51127f24c30776c8b374295f2df78d92517386f77ba30773f15a30ce1422" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-query" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" -dependencies = [ - "aws-smithy-types", - "urlencoding", -] - -[[package]] -name = "aws-smithy-runtime" -version = "1.7.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d526a12d9ed61fadefda24abe2e682892ba288c2018bcb38b1b4c111d13f6d92" -dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "fastrand", - "h2 0.3.26", - "http 0.2.12", - "http-body 0.4.6", - "http-body 1.0.1", - "httparse", - "hyper 0.14.32", - "hyper-rustls 0.24.2", - "once_cell", - "pin-project-lite", - "pin-utils", - "rustls 0.21.12", - "tokio", - "tracing", -] - -[[package]] -name = "aws-smithy-runtime-api" -version = "1.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" -dependencies = [ - "aws-smithy-async", - "aws-smithy-types", - "bytes", - "http 0.2.12", - "http 1.2.0", - "pin-project-lite", - "tokio", - "tracing", - "zeroize", -] - -[[package]] -name = "aws-smithy-types" -version = "1.2.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7b8a53819e42f10d0821f56da995e1470b199686a1809168db6ca485665f042" -dependencies = [ - "base64-simd", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http 1.2.0", - "http-body 0.4.6", - "http-body 1.0.1", - "http-body-util", - "itoa", - "num-integer", - "pin-project-lite", - "pin-utils", - "ryu", - "serde", - "time", - "tokio", - "tokio-util", -] - -[[package]] -name = "aws-smithy-xml" -version = "0.60.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc" -dependencies = [ - "xmlparser", -] - -[[package]] -name = "aws-types" -version = "1.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbd0a668309ec1f66c0f6bda4840dd6d4796ae26d699ebc266d7cc95c6d040f" -dependencies = [ - "aws-credential-types", - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "rustc_version", - "tracing", -] - [[package]] name = "backtrace" version = "0.3.74" @@ -789,28 +432,12 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "base64-simd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" -dependencies = [ - "outref", - "vsimd", -] - [[package]] name = "base64ct" version = "1.6.0" @@ -828,7 +455,6 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", - "serde", ] [[package]] @@ -839,9 +465,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" dependencies = [ "serde", ] @@ -898,17 +524,6 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "bstr" -version = "1.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" -dependencies = [ - "memchr", - "regex-automata", - "serde", -] - [[package]] name = "bumpalo" version = "3.17.0" @@ -927,51 +542,30 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f61dac84819c6588b558454b194026eb1f09c293b9036ae9b159e74e73ab6cf9" -[[package]] -name = "bytes-utils" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" -dependencies = [ - "bytes", - "either", -] - -[[package]] -name = "bzip2" -version = "0.4.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - [[package]] name = "bzip2" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b89e7c29231c673a61a46e722602bcd138298f6b9e81e71119693534585f5c" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", ] [[package]] name = "bzip2-sys" -version = "0.1.12+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72ebc2f1a417f01e1da30ef264ee86ae31d2dcd2d603ea283d3c244a883ca2a9" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ "cc", - "libc", "pkg-config", ] [[package]] name = "cc" -version = "1.2.15" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c736e259eea577f443d5c86c304f9f4ae0295c43f3ba05c21f1d66b5f06001af" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ "jobserver", "libc", @@ -984,12 +578,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "cfg_aliases" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" - [[package]] name = "chrono" version = "0.4.39" @@ -999,7 +587,6 @@ dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", - "serde", "windows-targets 0.52.6", ] @@ -1074,15 +661,6 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" -[[package]] -name = "clipboard-win" -version = "5.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15efe7a882b08f34e38556b14f2fb3daa98769d06c7f0c1b076dfd0d983bc892" -dependencies = [ - "error-code", -] - [[package]] name = "colorchoice" version = "1.0.3" @@ -1140,41 +718,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "core-foundation" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "core-foundation-sys" version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core2" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" -dependencies = [ - "memchr", -] - [[package]] name = "cpufeatures" version = "0.2.17" @@ -1261,26 +810,10 @@ dependencies = [ ] [[package]] -name = "ctor" -version = "0.2.9" +name = "dashmap" +version = "6.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a2785755761f3ddc1492979ce1e48d2c00d09311c39e4466429188f3dd6501" -dependencies = [ - "quote", - "syn", -] - -[[package]] -name = "dary_heap" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" - -[[package]] -name = "dashmap" -version = "6.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1296,7 +829,6 @@ version = "45.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ - "apache-avro", "arrow", "arrow-array", "arrow-ipc", @@ -1304,7 +836,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2 0.5.1", + "bzip2", "chrono", "datafusion-catalog", "datafusion-common", @@ -1327,7 +859,6 @@ dependencies = [ "glob", "itertools 0.14.0", "log", - "num-traits", "object_store", "parking_lot", "parquet", @@ -1371,13 +902,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash", - "apache-avro", "arrow", "arrow-array", "arrow-buffer", "arrow-ipc", "arrow-schema", - "base64 0.22.1", + "base64", "half", "hashbrown 0.14.5", "indexmap", @@ -1468,7 +998,7 @@ checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", - "base64 0.22.1", + "base64", "blake2", "blake3", "chrono", @@ -1747,21 +1277,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - -[[package]] -name = "difflib" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" - [[package]] name = "digest" version = "0.10.7" @@ -1774,27 +1289,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "dirs" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" -dependencies = [ - "libc", - "option-ext", - "redox_users", - "windows-sys 0.59.0", -] - [[package]] name = "displaydoc" version = "0.2.5" @@ -1806,12 +1300,6 @@ dependencies = [ "syn", ] -[[package]] -name = "doc-comment" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" - [[package]] name = "dotenvy" version = "0.15.7" @@ -1827,12 +1315,6 @@ dependencies = [ "serde", ] -[[package]] -name = "endian-type" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" - [[package]] name = "enum_dispatch" version = "0.3.13" @@ 
-1845,29 +1327,6 @@ dependencies = [ "syn", ] -[[package]] -name = "env_filter" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "humantime", - "log", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -1884,12 +1343,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "error-code" -version = "3.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d9305ccc6942a704f4335694ecd3de2ea531b114ac2d51f5f843750787a92f" - [[package]] name = "etcetera" version = "0.8.0" @@ -1918,17 +1371,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "fd-lock" -version = "4.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e5768da2206272c81ef0b5e951a41862938a6070da63bcea197899942d3b947" -dependencies = [ - "cfg-if", - "rustix", - "windows-sys 0.52.0", -] - [[package]] name = "fixedbitset" version = "0.5.7" @@ -1955,15 +1397,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "float-cmp" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" -dependencies = [ - "num-traits", -] - [[package]] name = "flume" version = "0.11.1" @@ -1975,12 +1408,6 @@ dependencies = [ "spin", ] -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - [[package]] name = "foldhash" version = "0.1.4" @@ -2078,12 +1505,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" -[[package]] -name = "futures-timer" -version = "3.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" - [[package]] name = "futures-util" version = "0.3.31" @@ -2119,10 +1540,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", - "js-sys", "libc", "wasi 0.11.0+wasi-snapshot-preview1", - "wasm-bindgen", ] [[package]] @@ -2149,44 +1568,6 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" -[[package]] -name = "h2" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "h2" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" -dependencies = [ 
- "atomic-waker", - "bytes", - "fnv", - "futures-core", - "futures-sink", - "http 1.2.0", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - [[package]] name = "half" version = "2.4.1" @@ -2267,177 +1648,12 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http 0.2.12", - "pin-project-lite", -] - -[[package]] -name = "http-body" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" -dependencies = [ - "bytes", - "http 1.2.0", -] - -[[package]] -name = "http-body-util" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" -dependencies = [ - "bytes", - "futures-util", - "http 1.2.0", - "http-body 1.0.1", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" - -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.26", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "h2 0.4.8", - "http 1.2.0", - "http-body 1.0.1", - "httparse", - "itoa", - "pin-project-lite", - "smallvec", - "tokio", - "want", -] - -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "log", - "rustls 0.21.12", - "rustls-native-certs 0.6.3", - "tokio", - "tokio-rustls 0.24.1", -] - -[[package]] -name = "hyper-rustls" -version = "0.27.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" -dependencies = [ - "futures-util", - "http 1.2.0", - "hyper 1.6.0", - "hyper-util", - "rustls 0.23.23", - "rustls-native-certs 0.8.1", - "rustls-pki-types", - "tokio", - "tokio-rustls 0.26.1", - "tower-service", -] - -[[package]] -name = "hyper-util" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http 1.2.0", - "http-body 1.0.1", - "hyper 1.6.0", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", -] - [[package]] name = "iana-time-zone" version = "0.1.61" @@ -2616,12 +1832,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "ipnet" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" - [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -2750,56 +1960,12 @@ version = "0.2.170" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" -[[package]] -name = "libflate" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" -dependencies = [ - "adler32", - "core2", - "crc32fast", - "dary_heap", - "libflate_lz77", -] - -[[package]] -name = "libflate_lz77" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" -dependencies = [ - "core2", - "hashbrown 0.14.5", - "rle-decode-fast", -] - [[package]] name = "libm" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" -[[package]] -name = "libmimalloc-sys" -version = "0.1.39" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" -dependencies = [ - "cc", - "libc", -] - -[[package]] -name = "libredox" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" -dependencies = [ - "bitflags 2.8.0", - "libc", -] - [[package]] name = "libsqlite3-sys" version = "0.30.1" @@ -2875,21 +2041,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "mimalloc" -version = "0.1.43" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" -dependencies = [ - "libmimalloc-sys", -] - -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - [[package]] name = "miniz_oxide" version = "0.8.5" @@ -2910,33 +2061,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "nibble_vec" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" -dependencies = [ - "smallvec", -] - -[[package]] -name = "nix" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" -dependencies = [ - "bitflags 2.8.0", - "cfg-if", - "cfg_aliases", - "libc", -] - -[[package]] -name = "normalize-line-endings" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" - [[package]] name = "num" version = "0.4.3" @@ -2959,7 +2083,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "serde", ] [[package]] @@ -2988,12 +2111,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - [[package]] name = "num-integer" version = "0.1.46" @@ -3051,23 +2168,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", - "base64 0.22.1", "bytes", "chrono", "futures", "humantime", - "hyper 1.6.0", "itertools 0.13.0", - "md-5", "parking_lot", "percent-encoding", - "quick-xml", - "rand", - "reqwest", - "ring", - "rustls-pemfile 2.2.0", - "serde", - "serde_json", "snafu", "tokio", "tracing", @@ -3081,12 +2188,6 @@ version = "1.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" -[[package]] -name = "openssl-probe" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" - [[package]] name = "optd-core" version = "0.1.0" @@ -3119,39 +2220,6 @@ dependencies = [ "trait-variant", ] -[[package]] -name = "optd-datafusion-cli" -version = "45.0.0" -dependencies = [ - "arrow", - "assert_cmd", - "async-trait", - "aws-config", - "aws-credential-types", - "aws-sdk-sso", - "aws-sdk-ssooidc", - "aws-sdk-sts", - "clap", - "ctor", - "datafusion", - "datafusion-catalog", - "dirs", - "env_logger", - "futures", - "home", - "mimalloc", - "object_store", - "optd-datafusion", - "parking_lot", - "parquet", - "predicates", - "regex", - "rstest", - "rustyline", - "tokio", - "url", -] - [[package]] name = "optd-dsl" version = "0.1.0" @@ -3167,12 +2235,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "option-ext" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" - [[package]] name = "ordered-float" version = "2.10.1" @@ -3193,12 +2255,6 @@ dependencies = [ "serde", ] -[[package]] -name = "outref" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" - [[package]] name = "parking" version = "2.2.1" @@ -3242,7 +2298,7 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64 0.22.1", + "base64", "brotli", "bytes", "chrono", @@ -3382,12 +2438,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" 
-[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - [[package]] name = "ppv-lite86" version = "0.2.20" @@ -3397,45 +2447,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "predicates" -version = "3.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5d19ee57562043d37e82899fade9a22ebab7be9cef5026b07fda9cdd4293573" -dependencies = [ - "anstyle", - "difflib", - "float-cmp", - "normalize-line-endings", - "predicates-core", - "regex", -] - -[[package]] -name = "predicates-core" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727e462b119fe9c93fd0eb1429a5f7647394014cf3c04ab2c0350eeb09095ffa" - -[[package]] -name = "predicates-tree" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" -dependencies = [ - "predicates-core", - "termtree", -] - -[[package]] -name = "proc-macro-crate" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" -dependencies = [ - "toml_edit", -] - [[package]] name = "proc-macro2" version = "1.0.93" @@ -3454,74 +2465,6 @@ dependencies = [ "cc", ] -[[package]] -name = "quad-rand" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" - -[[package]] -name = "quick-xml" -version = "0.37.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" -dependencies = [ - "memchr", - "serde", -] - -[[package]] -name = "quinn" -version = "0.11.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" -dependencies = [ - "bytes", - "pin-project-lite", - "quinn-proto", - "quinn-udp", - "rustc-hash", - "rustls 0.23.23", - "socket2", - "thiserror 2.0.11", - "tokio", - "tracing", -] - -[[package]] -name = "quinn-proto" -version = "0.11.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" -dependencies = [ - "bytes", - "getrandom 0.2.15", - "rand", - "ring", - "rustc-hash", - "rustls 0.23.23", - "rustls-pki-types", - "slab", - "thiserror 2.0.11", - "tinyvec", - "tracing", - "web-time", -] - -[[package]] -name = "quinn-udp" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" -dependencies = [ - "cfg_aliases", - "libc", - "once_cell", - "socket2", - "tracing", - "windows-sys 0.59.0", -] - [[package]] name = "quote" version = "1.0.38" @@ -3531,16 +2474,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "radix_trie" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" -dependencies = [ - "endian-type", - "nibble_vec", -] - [[package]] name = "rand" version = "0.8.5" @@ -3599,18 +2532,7 @@ version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"82b568323e98e49e2a0899dcee453dd679fae22d69adf9b11dd508d1549b7e2f" dependencies = [ - "bitflags 2.8.0", -] - -[[package]] -name = "redox_users" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" -dependencies = [ - "getrandom 0.2.15", - "libredox", - "thiserror 2.0.11", + "bitflags 2.9.0", ] [[package]] @@ -3636,90 +2558,12 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-lite" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" - [[package]] name = "regex-syntax" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" -[[package]] -name = "relative-path" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" - -[[package]] -name = "reqwest" -version = "0.12.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" -dependencies = [ - "base64 0.22.1", - "bytes", - "futures-core", - "futures-util", - "h2 0.4.8", - "http 1.2.0", - "http-body 1.0.1", - "http-body-util", - "hyper 1.6.0", - "hyper-rustls 0.27.5", - "hyper-util", - "ipnet", - "js-sys", - "log", - "mime", - "once_cell", - "percent-encoding", - "pin-project-lite", - "quinn", - "rustls 0.23.23", - "rustls-native-certs 0.8.1", - "rustls-pemfile 2.2.0", - "rustls-pki-types", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tokio-rustls 0.26.1", - "tokio-util", - "tower", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-streams", - "web-sys", - "windows-registry", -] - -[[package]] -name = "ring" -version = "0.17.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5349ae27d3887ca812fb375b45a4fbb36d8d12d2df394968cd86e35683fe73" -dependencies = [ - "cc", - "cfg-if", - "getrandom 0.2.15", - "libc", - "untrusted", - "windows-sys 0.52.0", -] - -[[package]] -name = "rle-decode-fast" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" - [[package]] name = "rsa" version = "0.9.7" @@ -3740,166 +2584,32 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rstest" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e905296805ab93e13c1ec3a03f4b6c4f35e9498a3d5fa96dc626d22c03cd89" -dependencies = [ - "futures-timer", - "futures-util", - "rstest_macros", - "rustc_version", -] - -[[package]] -name = "rstest_macros" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef0053bbffce09062bee4bcc499b0fbe7a57b879f1efe088d6d8d4c7adcdef9b" -dependencies = [ - "cfg-if", - "glob", - "proc-macro-crate", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn", - "unicode-ident", -] - [[package]] name = "rustc-demangle" version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" - -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - -[[package]] -name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags 2.8.0", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.59.0", -] - -[[package]] -name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", -] - -[[package]] -name = "rustls" -version = "0.23.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" -dependencies = [ - "once_cell", - "ring", - "rustls-pki-types", - "rustls-webpki 0.102.8", - "subtle", - "zeroize", -] - -[[package]] -name = "rustls-native-certs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" -dependencies = [ - "openssl-probe", - "rustls-pemfile 1.0.4", - "schannel", - "security-framework 2.11.1", -] - -[[package]] -name = "rustls-native-certs" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" -dependencies = [ - "openssl-probe", - "rustls-pki-types", - "schannel", - "security-framework 3.2.0", -] - -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", -] - -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - -[[package]] -name = "rustls-pki-types" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" -dependencies = [ - "web-time", -] +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] -name = "rustls-webpki" -version = "0.101.7" +name = "rustc_version" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "ring", - "untrusted", + "semver", ] [[package]] -name = "rustls-webpki" -version = "0.102.8" +name = "rustix" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "ring", - "rustls-pki-types", - "untrusted", + "bitflags 2.9.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", ] [[package]] @@ -3908,28 +2618,6 @@ version = 
"1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" -[[package]] -name = "rustyline" -version = "15.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ee1e066dc922e513bda599c6ccb5f3bb2b0ea5870a579448f2622993f0a9a2f" -dependencies = [ - "bitflags 2.8.0", - "cfg-if", - "clipboard-win", - "fd-lock", - "home", - "libc", - "log", - "memchr", - "nix", - "radix_trie", - "unicode-segmentation", - "unicode-width 0.2.0", - "utf8parse", - "windows-sys 0.59.0", -] - [[package]] name = "ryu" version = "1.0.19" @@ -3945,67 +2633,12 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "schannel" -version = "0.1.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" -dependencies = [ - "windows-sys 0.59.0", -] - [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "security-framework" -version = "2.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" -dependencies = [ - "bitflags 2.8.0", - "core-foundation 0.9.4", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" -dependencies = [ - "bitflags 2.8.0", - "core-foundation 0.10.0", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "semver" version = "1.0.25" @@ -4027,15 +2660,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_bytes" -version = "0.11.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" -dependencies = [ - "serde", -] - [[package]] name = "serde_derive" version = "1.0.218" @@ -4264,7 +2888,7 @@ dependencies = [ "serde_json", "sha2", "smallvec", - "thiserror 2.0.11", + "thiserror", "tokio", "tokio-stream", "tracing", @@ -4317,8 +2941,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4560278f0e00ce64938540546f59f590d60beee33fffbd3b9cd47851e5fff233" dependencies = [ "atoi", - "base64 0.22.1", - "bitflags 2.8.0", + "base64", + "bitflags 2.9.0", "byteorder", "bytes", "crc", @@ -4347,7 +2971,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.11", + "thiserror", "tracing", "whoami", ] @@ -4359,8 +2983,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5b98a57f363ed6764d5b3a12bfedf62f07aa16e1856a7ddc2a0bb190a959613" dependencies = [ "atoi", - "base64 0.22.1", - "bitflags 2.8.0", + "base64", + "bitflags 2.9.0", 
"byteorder", "crc", "dotenvy", @@ -4384,7 +3008,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.11", + "thiserror", "tracing", "whoami", ] @@ -4454,25 +3078,6 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn", -] - [[package]] name = "subtle" version = "2.6.1" @@ -4490,15 +3095,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" -dependencies = [ - "futures-core", -] - [[package]] name = "synstructure" version = "0.13.1" @@ -4524,39 +3120,13 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "termtree" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - [[package]] name = "thiserror" version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl 2.0.11", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -4581,36 +3151,6 @@ dependencies = [ "ordered-float 2.10.1", ] -[[package]] -name = "time" -version = "0.3.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" -dependencies = [ - "deranged", - "num-conv", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" - -[[package]] -name = "time-macros" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" -dependencies = [ - "num-conv", - "time-core", -] - [[package]] name = "tiny-keccak" version = "2.0.2" @@ -4674,26 +3214,6 @@ dependencies = [ "syn", ] -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - -[[package]] -name = "tokio-rustls" -version = "0.26.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" -dependencies = [ - "rustls 0.23.23", - "tokio", -] - [[package]] name = "tokio-stream" version = "0.1.17" @@ -4718,50 +3238,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml_datetime" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" - -[[package]] -name = "toml_edit" -version = "0.22.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b4795ff5edd201c7cd6dca065ae59972ce77d1b80fa0a84d94950ece7d1474" -dependencies = [ - "indexmap", - "toml_datetime", - "winnow", -] - -[[package]] -name = "tower" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" -dependencies = [ - "futures-core", - "futures-util", - "pin-project-lite", - "sync_wrapper", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - -[[package]] -name = "tower-service" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" - [[package]] name = "tracing" version = "0.1.41" @@ -4805,12 +3281,6 @@ dependencies = [ "syn", ] -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - [[package]] name = "twox-hash" version = "1.6.3" @@ -4821,26 +3291,6 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "typed-builder" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600" -dependencies = [ - "typed-builder-macro", -] - -[[package]] -name = "typed-builder-macro" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "typenum" version = "1.18.0" @@ -4892,12 +3342,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" -[[package]] -name = "untrusted" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" - [[package]] name = "url" version = "2.5.4" @@ -4909,12 +3353,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "urlencoding" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" - [[package]] name = "utf16_iter" version = "1.0.5" @@ -4940,7 +3378,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" dependencies = [ "getrandom 0.3.1", - "serde", ] [[package]] @@ -4955,21 +3392,6 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" -[[package]] -name = "vsimd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" - -[[package]] -name = "wait-timeout" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" -dependencies = [ - "libc", -] - [[package]] name = "walkdir" version = "2.5.0" @@ -4980,15 +3402,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -5036,19 +3449,6 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.50" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" -dependencies = [ - "cfg-if", - "js-sys", - "once_cell", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "wasm-bindgen-macro" version = "0.2.100" @@ -5081,29 +3481,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-streams" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" -dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - -[[package]] -name = "web-sys" -version = "0.3.77" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "web-time" version = "1.1.0" @@ -5142,36 +3519,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-registry" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" -dependencies = [ - "windows-result", - "windows-strings", - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-result" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-strings" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" -dependencies = [ - "windows-result", - "windows-targets 0.52.6", -] - [[package]] name = "windows-sys" version = "0.48.0" @@ -5320,22 +3667,13 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winnow" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7f4ea97f6f78012141bcdb6a216b2609f0979ada50b20ca5b52dde2eac2bb1" -dependencies = [ - "memchr", -] - [[package]] name = "wit-bindgen-rt" version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" dependencies = [ - "bitflags 2.8.0", + "bitflags 2.9.0", ] [[package]] @@ -5350,12 +3688,6 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" -[[package]] -name = "xmlparser" -version = "0.13.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" - [[package]] name = "xz2" version = "0.1.7" diff --git a/Cargo.toml b/Cargo.toml index aeaeddd..8206ac3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["optd-core", "optd-dsl", "optd-datafusion", "optd-datafusion-cli"] +members = ["optd-core", "optd-dsl", "optd-datafusion"] resolver = "2" [workspace.package] diff --git a/optd-datafusion-cli/Cargo.toml b/optd-datafusion-cli/Cargo.toml deleted file mode 100644 index fe67c83..0000000 --- a/optd-datafusion-cli/Cargo.toml +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -[package] -name = "optd-datafusion-cli" -description = "Command Line Client for DataFusion query engine." -version = "45.0.0" -authors = ["Apache DataFusion "] -edition = "2021" -keywords = ["arrow", "datafusion", "query", "sql"] -license = "Apache-2.0" -homepage = "https://datafusion.apache.org" -repository = "https://github.com/apache/datafusion" -rust-version = "1.81.0" -readme = "README.md" - -[dependencies] -arrow = { version = "54.1.0" } -async-trait = "0.1.0" -aws-config = "1.5.0" -aws-credential-types = "1.2.0" -aws-sdk-sso = "1.57.0" -aws-sdk-ssooidc = "1.57.0" -aws-sdk-sts = "1.57.0" -clap = { version = "4.5.27", features = ["derive", "cargo"] } -datafusion = { workspace = true, features = [ - "avro", - "crypto_expressions", - "datetime_expressions", - "encoding_expressions", - "parquet", - "recursive_protection", - "regex_expressions", - "unicode_expressions", - "compression", -] } -datafusion-catalog = { version = "45.0.0" } -optd-datafusion = { path = "../optd-datafusion" } -dirs = "6.0.0" -env_logger = "0.11" -futures = "0.3" -# pin as home 0.5.11 has MSRV 1.81. 
Can remove this once we bump MSRV to 1.81 -home = "=0.5.11" -mimalloc = { version = "0.1", default-features = false } -object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] } -parking_lot = { version = "0.12" } -parquet = { version = "54.1.0", default-features = false } -regex = "1.8" -rustyline = "15.0" -tokio = { version = "1.24", features = [ - "macros", - "rt", - "rt-multi-thread", - "sync", - "parking_lot", - "signal", -] } -url = "2.5.4" - -[dev-dependencies] -assert_cmd = "2.0" -ctor = "0.2.9" -predicates = "3.0" -rstest = "0.24" - -# [profile.ci] -# inherits = "dev" -# incremental = false - -# # ci turns off debug info, etc for dependencies to allow for smaller binaries making caching more effective -# [profile.ci.package."*"] -# debug = false -# debug-assertions = false -# strip = "debuginfo" -# incremental = false diff --git a/optd-datafusion-cli/Dockerfile b/optd-datafusion-cli/Dockerfile deleted file mode 100644 index f73b76b..0000000 --- a/optd-datafusion-cli/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM rust:bookworm AS builder - -COPY . /usr/src/datafusion -COPY ./datafusion /usr/src/datafusion/datafusion -COPY ./datafusion-cli /usr/src/datafusion/datafusion-cli - -WORKDIR /usr/src/datafusion/datafusion-cli - -RUN rustup component add rustfmt - -RUN cargo build --release - -FROM debian:bookworm-slim - -COPY --from=builder /usr/src/datafusion/datafusion-cli/target/release/datafusion-cli /usr/local/bin - -RUN mkdir /data - -ENTRYPOINT ["datafusion-cli"] - -CMD ["--data-path", "/data"] diff --git a/optd-datafusion-cli/README.md b/optd-datafusion-cli/README.md deleted file mode 100644 index ce09c3b..0000000 --- a/optd-datafusion-cli/README.md +++ /dev/null @@ -1,48 +0,0 @@ - - - - -# DataFusion Command-line Interface - -[DataFusion](https://datafusion.apache.org/) is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format. - -DataFusion CLI (`datafusion-cli`) is a small command line utility that runs SQL queries using the DataFusion engine. - -# Frequently Asked Questions - -## Where can I find more information? - -See the [`datafusion-cli` documentation](https://datafusion.apache.org/user-guide/cli/index.html) for further information. - -## How do I make my IDE work with `datafusion-cli`? - -"open" the `datafusion/datafusion-cli` project as its own top level -project in my IDE (rather than opening `datafusion`) - -The reason `datafusion-cli` is not part of the main workspace in -[`datafusion Cargo.toml`] file is that `datafusion-cli` is a binary and has a -checked in `Cargo.lock` file to ensure reproducible builds. 
- -However, the `datafusion` and sub crates are intended for use as libraries and -thus do not have a `Cargo.lock` file checked in, as described in the [main -README] file. - -[`datafusion cargo.toml`]: https://github.com/apache/datafusion/blob/main/Cargo.toml -[main readme]: ../README.md diff --git a/optd-datafusion-cli/examples/cli-session-context.rs b/optd-datafusion-cli/examples/cli-session-context.rs deleted file mode 100644 index 080a589..0000000 --- a/optd-datafusion-cli/examples/cli-session-context.rs +++ /dev/null @@ -1,92 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Shows an example of a custom session context that unions the input plan with itself. -//! To run this example, use `cargo run --example cli-session-context` from within the `datafusion-cli` directory. - -use std::sync::Arc; - -use datafusion::{ - dataframe::DataFrame, - error::DataFusionError, - execution::{context::SessionState, TaskContext}, - logical_expr::{LogicalPlan, LogicalPlanBuilder}, - prelude::SessionContext, -}; -use object_store::ObjectStore; -use optd_datafusion_cli::{ - cli_context::CliSessionContext, exec::exec_from_repl, print_options::PrintOptions, -}; - -/// This is a toy example of a custom session context that unions the input plan with itself. -struct MyUnionerContext { - ctx: SessionContext, -} - -impl Default for MyUnionerContext { - fn default() -> Self { - Self { - ctx: SessionContext::new(), - } - } -} - -#[async_trait::async_trait] -impl CliSessionContext for MyUnionerContext { - fn task_ctx(&self) -> Arc { - self.ctx.task_ctx() - } - - fn session_state(&self) -> SessionState { - self.ctx.state() - } - - fn register_object_store( - &self, - url: &url::Url, - object_store: Arc, - ) -> Option> { - self.ctx.register_object_store(url, object_store) - } - - fn register_table_options_extension_from_scheme(&self, _scheme: &str) { - unimplemented!() - } - - async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result { - let new_plan = LogicalPlanBuilder::from(plan.clone()) - .union(plan.clone())? - .build()?; - - self.ctx.execute_logical_plan(new_plan).await - } -} - -#[tokio::main] -/// Runs the example. 
-pub async fn main() { - let my_ctx = MyUnionerContext::default(); - - let mut print_options = PrintOptions { - format: optd_datafusion_cli::print_format::PrintFormat::Automatic, - quiet: false, - maxrows: optd_datafusion_cli::print_options::MaxRows::Unlimited, - color: true, - }; - - exec_from_repl(&my_ctx, &mut print_options).await.unwrap(); -} diff --git a/optd-datafusion-cli/src/catalog.rs b/optd-datafusion-cli/src/catalog.rs deleted file mode 100644 index 3755571..0000000 --- a/optd-datafusion-cli/src/catalog.rs +++ /dev/null @@ -1,365 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::sync::{Arc, Weak}; - -use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; - -use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; - -use datafusion::common::plan_datafusion_err; -use datafusion::datasource::listing::ListingTableUrl; -use datafusion::datasource::TableProvider; -use datafusion::error::Result; -use datafusion::execution::context::SessionState; -use datafusion::execution::session_state::SessionStateBuilder; - -use async_trait::async_trait; -use dirs::home_dir; -use parking_lot::RwLock; - -/// Wraps another catalog, automatically register require object stores for the file locations -#[derive(Debug)] -pub struct DynamicObjectStoreCatalog { - inner: Arc, - state: Weak>, -} - -impl DynamicObjectStoreCatalog { - pub fn new(inner: Arc, state: Weak>) -> Self { - Self { inner, state } - } -} - -impl CatalogProviderList for DynamicObjectStoreCatalog { - fn as_any(&self) -> &dyn Any { - self - } - - fn register_catalog( - &self, - name: String, - catalog: Arc, - ) -> Option> { - self.inner.register_catalog(name, catalog) - } - - fn catalog_names(&self) -> Vec { - self.inner.catalog_names() - } - - fn catalog(&self, name: &str) -> Option> { - let state = self.state.clone(); - self.inner - .catalog(name) - .map(|catalog| Arc::new(DynamicObjectStoreCatalogProvider::new(catalog, state)) as _) - } -} - -/// Wraps another catalog provider -#[derive(Debug)] -struct DynamicObjectStoreCatalogProvider { - inner: Arc, - state: Weak>, -} - -impl DynamicObjectStoreCatalogProvider { - pub fn new(inner: Arc, state: Weak>) -> Self { - Self { inner, state } - } -} - -impl CatalogProvider for DynamicObjectStoreCatalogProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema_names(&self) -> Vec { - self.inner.schema_names() - } - - fn schema(&self, name: &str) -> Option> { - let state = self.state.clone(); - self.inner - .schema(name) - .map(|schema| Arc::new(DynamicObjectStoreSchemaProvider::new(schema, state)) as _) - } - - fn register_schema( - &self, - name: &str, - schema: Arc, - ) -> Result>> { - self.inner.register_schema(name, schema) - } -} - -/// Wraps another schema 
provider. [DynamicObjectStoreSchemaProvider] is responsible for registering the required
-/// object stores for the file locations.
-#[derive(Debug)]
-struct DynamicObjectStoreSchemaProvider {
-    inner: Arc<dyn SchemaProvider>,
-    state: Weak<RwLock<SessionState>>,
-}
-
-impl DynamicObjectStoreSchemaProvider {
-    pub fn new(inner: Arc<dyn SchemaProvider>, state: Weak<RwLock<SessionState>>) -> Self {
-        Self { inner, state }
-    }
-}
-
-#[async_trait]
-impl SchemaProvider for DynamicObjectStoreSchemaProvider {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn table_names(&self) -> Vec<String> {
-        self.inner.table_names()
-    }
-
-    fn register_table(
-        &self,
-        name: String,
-        table: Arc<dyn TableProvider>,
-    ) -> Result<Option<Arc<dyn TableProvider>>> {
-        self.inner.register_table(name, table)
-    }
-
-    async fn table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
-        let inner_table = self.inner.table(name).await;
-        if inner_table.is_ok() {
-            if let Some(inner_table) = inner_table? {
-                return Ok(Some(inner_table));
-            }
-        }
-
-        // If the inner schema provider didn't have a table by that name,
-        // try to treat it as a listing table.
-        let mut state = self
-            .state
-            .upgrade()
-            .ok_or_else(|| plan_datafusion_err!("locking error"))?
-            .read()
-            .clone();
-        let mut builder = SessionStateBuilder::from(state.clone());
-        let optimized_name = substitute_tilde(name.to_owned());
-        let table_url = ListingTableUrl::parse(optimized_name.as_str())?;
-        let scheme = table_url.scheme();
-        let url = table_url.as_ref();
-
-        // If the store is already registered for this URL then `get_store`
-        // will return `Ok`, which means we don't need to register it again. However,
-        // if `get_store` returns an `Err`, the corresponding store is not
-        // registered yet and we need to register it.
-        match state.runtime_env().object_store_registry.get_store(url) {
-            Ok(_) => { /* Nothing to do here, the store for this URL is already registered. */ }
-            Err(_) => {
-                // Register the store for this URL. Here we don't have access
-                // to any command options, so the only choice is to use an empty collection.
-                match scheme {
-                    "s3" | "oss" | "cos" => {
-                        if let Some(table_options) = builder.table_options() {
-                            table_options.extensions.insert(AwsOptions::default())
-                        }
-                    }
-                    "gs" | "gcs" => {
-                        if let Some(table_options) = builder.table_options() {
-                            table_options.extensions.insert(GcpOptions::default())
-                        }
-                    }
-                    _ => {}
-                };
-                state = builder.build();
-                let store = get_object_store(
-                    &state,
-                    table_url.scheme(),
-                    url,
-                    &state.default_table_options(),
-                )
-                .await?;
-                state.runtime_env().register_object_store(url, store);
-            }
-        }
-        self.inner.table(name).await
-    }
-
-    fn deregister_table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
-        self.inner.deregister_table(name)
-    }
-
-    fn table_exist(&self, name: &str) -> bool {
-        self.inner.table_exist(name)
-    }
-}
-
-pub fn substitute_tilde(cur: String) -> String {
-    if let Some(usr_dir_path) = home_dir() {
-        if let Some(usr_dir) = usr_dir_path.to_str() {
-            if cur.starts_with('~') && !usr_dir.is_empty() {
-                return cur.replacen('~', usr_dir, 1);
-            }
-        }
-    }
-    cur
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use datafusion::catalog::SchemaProvider;
-    use datafusion::prelude::SessionContext;
-
-    fn setup_context() -> (SessionContext, Arc<dyn SchemaProvider>) {
-        let ctx = SessionContext::new();
-        ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new(
-            ctx.state().catalog_list().clone(),
-            ctx.state_weak_ref(),
-        )));
-
-        let provider = &DynamicObjectStoreCatalog::new(
-            ctx.state().catalog_list().clone(),
-            ctx.state_weak_ref(),
-        ) as &dyn CatalogProviderList;
-        let catalog = provider
-            .catalog(provider.catalog_names().first().unwrap())
-            .unwrap();
-        let schema = catalog
-            .schema(catalog.schema_names().first().unwrap())
-            .unwrap();
-        (ctx, schema)
-    }
-
-    #[tokio::test]
-    async fn query_http_location_test() -> Result<()> {
-        // This is a unit test, so we don't expect a connection or a file to be
-        // available.
-        let domain = "example.com";
-        let location = format!("http://{domain}/file.parquet");
-
-        let (ctx, schema) = setup_context();
-
-        // That's a non-registered table, so we expect `None` here.
-        let table = schema.table(&location).await?;
-        assert!(table.is_none());
-
-        // It should still create an object store for the location in the SessionState.
-        let store = ctx
-            .runtime_env()
-            .object_store(ListingTableUrl::parse(location)?)?;
-
-        assert_eq!(format!("{store}"), "HttpStore");
-
-        // The store must be configured for this domain.
-        let expected_domain = format!("Domain(\"{domain}\")");
-        assert!(format!("{store:?}").contains(&expected_domain));
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn query_s3_location_test() -> Result<()> {
-        let bucket = "examples3bucket";
-        let location = format!("s3://{bucket}/file.parquet");
-
-        let (ctx, schema) = setup_context();
-
-        let table = schema.table(&location).await?;
-        assert!(table.is_none());
-
-        let store = ctx
-            .runtime_env()
-            .object_store(ListingTableUrl::parse(location)?)?;
-        assert_eq!(format!("{store}"), format!("AmazonS3({bucket})"));
-
-        // The store must be configured for this bucket.
-        let expected_bucket = format!("bucket: \"{bucket}\"");
-        assert!(format!("{store:?}").contains(&expected_bucket));
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn query_gs_location_test() -> Result<()> {
-        let bucket = "examplegsbucket";
-        let location = format!("gs://{bucket}/file.parquet");
-
-        let (ctx, schema) = setup_context();
-
-        let table = schema.table(&location).await?;
-        assert!(table.is_none());
-
-        let store = ctx
-            .runtime_env()
-            .object_store(ListingTableUrl::parse(location)?)?;
-        assert_eq!(format!("{store}"), format!("GoogleCloudStorage({bucket})"));
-
-        // The store must be configured for this bucket.
-        let expected_bucket = format!("bucket_name_encoded: \"{bucket}\"");
-        assert!(format!("{store:?}").contains(&expected_bucket));
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn query_invalid_location_test() {
-        let location = "ts://file.parquet";
-        let (_ctx, schema) = setup_context();
-
-        assert!(schema.table(location).await.is_err());
-    }
-
-    #[cfg(not(target_os = "windows"))]
-    #[test]
-    fn test_substitute_tilde() {
-        use std::env;
-        use std::path::MAIN_SEPARATOR;
-        let original_home = home_dir();
-        let test_home_path = if cfg!(windows) {
-            "C:\\Users\\user"
-        } else {
-            "/home/user"
-        };
-        env::set_var(
-            if cfg!(windows) { "USERPROFILE" } else { "HOME" },
-            test_home_path,
-        );
-        let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet";
-        let expected = format!(
-            "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet",
-            test_home_path,
-            MAIN_SEPARATOR,
-            MAIN_SEPARATOR,
-            MAIN_SEPARATOR,
-            MAIN_SEPARATOR,
-            MAIN_SEPARATOR,
-            MAIN_SEPARATOR,
-            MAIN_SEPARATOR
-        );
-        let actual = substitute_tilde(input.to_string());
-        assert_eq!(actual, expected);
-        match original_home {
-            Some(home_path) => env::set_var(
-                if cfg!(windows) { "USERPROFILE" } else { "HOME" },
-                home_path.to_str().unwrap(),
-            ),
-            None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }),
-        }
-    }
-}
diff --git a/optd-datafusion-cli/src/cli_context.rs b/optd-datafusion-cli/src/cli_context.rs
deleted file mode 100644
index d3c705e..0000000
--- a/optd-datafusion-cli/src/cli_context.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-
-use datafusion::{
-    dataframe::DataFrame,
-    error::DataFusionError,
-    execution::{context::SessionState, TaskContext},
-    logical_expr::LogicalPlan,
-    prelude::SessionContext,
-};
-use object_store::ObjectStore;
-
-use crate::object_storage::{AwsOptions, GcpOptions};
-
-#[async_trait::async_trait]
-/// The CLI session context trait provides a session context that can be used with datafusion's CLI code.
-pub trait CliSessionContext {
-    /// Get an atomic reference counted task context.
-    fn task_ctx(&self) -> Arc<TaskContext>;
-
-    /// Get the session state.
-    fn session_state(&self) -> SessionState;
-
-    /// Register an object store with the session context.
-    fn register_object_store(
-        &self,
-        url: &url::Url,
-        object_store: Arc<dyn ObjectStore>,
-    ) -> Option<Arc<dyn ObjectStore>>;
-
-    /// Register table options extension from scheme.
-    fn register_table_options_extension_from_scheme(&self, scheme: &str);
-
-    /// Execute a logical plan and return a DataFrame.
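// As an aside, a minimal sketch of why this trait exists: CLI routines can
// take it as a trait object and stay generic over the concrete session. The
// helper name below is hypothetical and not part of this patch:
//
//     async fn plan_and_run(
//         ctx: &dyn CliSessionContext,
//         plan: LogicalPlan,
//     ) -> Result<DataFrame, DataFusionError> {
//         // Dispatches to whichever session type implements the trait.
//         ctx.execute_logical_plan(plan).await
//     }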
-    async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result<DataFrame, DataFusionError>;
-}
-
-#[async_trait::async_trait]
-impl CliSessionContext for SessionContext {
-    fn task_ctx(&self) -> Arc<TaskContext> {
-        self.task_ctx()
-    }
-
-    fn session_state(&self) -> SessionState {
-        self.state()
-    }
-
-    fn register_object_store(
-        &self,
-        url: &url::Url,
-        object_store: Arc<dyn ObjectStore>,
-    ) -> Option<Arc<dyn ObjectStore>> {
-        self.register_object_store(url, object_store)
-    }
-
-    fn register_table_options_extension_from_scheme(&self, scheme: &str) {
-        match scheme {
-            // For Amazon S3 or Alibaba Cloud OSS
-            "s3" | "oss" | "cos" => {
-                // Register AWS specific table options in the session context:
-                self.register_table_options_extension(AwsOptions::default())
-            }
-            // For Google Cloud Storage
-            "gs" | "gcs" => {
-                // Register GCP specific table options in the session context:
-                self.register_table_options_extension(GcpOptions::default())
-            }
-            // For unsupported schemes, do nothing:
-            _ => {}
-        }
-    }
-
-    async fn execute_logical_plan(&self, plan: LogicalPlan) -> Result<DataFrame, DataFusionError> {
-        self.execute_logical_plan(plan).await
-    }
-}
diff --git a/optd-datafusion-cli/src/command.rs b/optd-datafusion-cli/src/command.rs
deleted file mode 100644
index 54942b9..0000000
--- a/optd-datafusion-cli/src/command.rs
+++ /dev/null
@@ -1,222 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//!
Command within CLI - -use crate::cli_context::CliSessionContext; -use crate::exec::{exec_and_print, exec_from_lines}; -use crate::functions::{display_all_functions, Function}; -use crate::print_format::PrintFormat; -use crate::print_options::PrintOptions; -use clap::ValueEnum; -use datafusion::arrow::array::{ArrayRef, StringArray}; -use datafusion::arrow::datatypes::{DataType, Field, Schema}; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::exec_err; -use datafusion::common::instant::Instant; -use datafusion::error::{DataFusionError, Result}; -use std::fs::File; -use std::io::BufReader; -use std::str::FromStr; -use std::sync::Arc; - -/// Command -#[derive(Debug)] -pub enum Command { - Quit, - Help, - ListTables, - DescribeTableStmt(String), - ListFunctions, - Include(Option), - SearchFunctions(String), - QuietMode(Option), - OutputFormat(Option), -} - -pub enum OutputFormat { - ChangeFormat(String), -} - -impl Command { - pub async fn execute( - &self, - ctx: &dyn CliSessionContext, - print_options: &mut PrintOptions, - ) -> Result<()> { - match self { - Self::Help => { - let now = Instant::now(); - let command_batch = all_commands_info(); - print_options.print_batches(command_batch.schema(), &[command_batch], now) - } - Self::ListTables => exec_and_print(ctx, print_options, "SHOW TABLES".into()).await, - Self::DescribeTableStmt(name) => { - exec_and_print(ctx, print_options, format!("SHOW COLUMNS FROM {}", name)).await - } - Self::Include(filename) => { - if let Some(filename) = filename { - let file = File::open(filename).map_err(|e| { - DataFusionError::Execution(format!("Error opening {:?} {}", filename, e)) - })?; - exec_from_lines(ctx, &mut BufReader::new(file), print_options).await?; - Ok(()) - } else { - exec_err!("Required filename argument is missing") - } - } - Self::QuietMode(quiet) => { - if let Some(quiet) = quiet { - print_options.quiet = *quiet; - println!( - "Quiet mode set to {}", - if print_options.quiet { "true" } else { "false" } - ); - } else { - println!( - "Quiet mode is {}", - if print_options.quiet { "true" } else { "false" } - ); - } - Ok(()) - } - Self::Quit => exec_err!("Unexpected quit, this should be handled outside"), - Self::ListFunctions => display_all_functions(), - Self::SearchFunctions(function) => { - if let Ok(func) = function.parse::() { - let details = func.function_details()?; - println!("{}", details); - Ok(()) - } else { - exec_err!("{function} is not a supported function") - } - } - Self::OutputFormat(_) => { - exec_err!("Unexpected change output format, this should be handled outside") - } - } - } - - fn get_name_and_description(&self) -> (&'static str, &'static str) { - match self { - Self::Quit => ("\\q", "quit datafusion-cli"), - Self::ListTables => ("\\d", "list tables"), - Self::DescribeTableStmt(_) => ("\\d name", "describe table"), - Self::Help => ("\\?", "help"), - Self::Include(_) => ("\\i filename", "reads input from the specified filename"), - Self::ListFunctions => ("\\h", "function list"), - Self::SearchFunctions(_) => ("\\h function", "search function"), - Self::QuietMode(_) => ("\\quiet (true|false)?", "print or set quiet mode"), - Self::OutputFormat(_) => ("\\pset [NAME [VALUE]]", "set table output option\n(format)"), - } - } -} - -const ALL_COMMANDS: [Command; 9] = [ - Command::ListTables, - Command::DescribeTableStmt(String::new()), - Command::Quit, - Command::Help, - Command::Include(Some(String::new())), - Command::ListFunctions, - Command::SearchFunctions(String::new()), - 
Command::QuietMode(None), - Command::OutputFormat(None), -]; - -fn all_commands_info() -> RecordBatch { - let schema = Arc::new(Schema::new(vec![ - Field::new("Command", DataType::Utf8, false), - Field::new("Description", DataType::Utf8, false), - ])); - let (names, description): (Vec<&str>, Vec<&str>) = ALL_COMMANDS - .into_iter() - .map(|c| c.get_name_and_description()) - .unzip(); - RecordBatch::try_new( - schema, - [names, description] - .into_iter() - .map(|i| Arc::new(StringArray::from(i)) as ArrayRef) - .collect::>(), - ) - .expect("This should not fail") -} - -impl FromStr for Command { - type Err = (); - - fn from_str(s: &str) -> Result { - let (c, arg) = if let Some((a, b)) = s.split_once(' ') { - (a, Some(b)) - } else { - (s, None) - }; - Ok(match (c, arg) { - ("q", None) => Self::Quit, - ("d", None) => Self::ListTables, - ("d", Some(name)) => Self::DescribeTableStmt(name.into()), - ("?", None) => Self::Help, - ("h", None) => Self::ListFunctions, - ("h", Some(function)) => Self::SearchFunctions(function.into()), - ("i", None) => Self::Include(None), - ("i", Some(filename)) => Self::Include(Some(filename.to_owned())), - ("quiet", Some("true" | "t" | "yes" | "y" | "on")) => Self::QuietMode(Some(true)), - ("quiet", Some("false" | "f" | "no" | "n" | "off")) => Self::QuietMode(Some(false)), - ("quiet", None) => Self::QuietMode(None), - ("pset", Some(subcommand)) => Self::OutputFormat(Some(subcommand.to_string())), - ("pset", None) => Self::OutputFormat(None), - _ => return Err(()), - }) - } -} - -impl FromStr for OutputFormat { - type Err = (); - - fn from_str(s: &str) -> Result { - let (c, arg) = if let Some((a, b)) = s.split_once(' ') { - (a, Some(b)) - } else { - (s, None) - }; - Ok(match (c, arg) { - ("format", Some(format)) => Self::ChangeFormat(format.to_string()), - _ => return Err(()), - }) - } -} - -impl OutputFormat { - pub async fn execute(&self, print_options: &mut PrintOptions) -> Result<()> { - match self { - Self::ChangeFormat(format) => { - if let Ok(format) = format.parse::() { - print_options.format = format; - println!("Output format is {:?}.", print_options.format); - Ok(()) - } else { - exec_err!( - "{:?} is not a valid format type [possible values: {:?}]", - format, - PrintFormat::value_variants() - ) - } - } - } - } -} diff --git a/optd-datafusion-cli/src/exec.rs b/optd-datafusion-cli/src/exec.rs deleted file mode 100644 index c724ba2..0000000 --- a/optd-datafusion-cli/src/exec.rs +++ /dev/null @@ -1,625 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Execution functions - -use std::collections::HashMap; -use std::fs::File; -use std::io::prelude::*; -use std::io::BufReader; - -use crate::cli_context::CliSessionContext; -use crate::helper::split_from_semicolon; -use crate::print_format::PrintFormat; -use crate::{ - command::{Command, OutputFormat}, - helper::{unescape_input, CliHelper}, - object_storage::get_object_store, - print_options::{MaxRows, PrintOptions}, -}; - -use datafusion::common::instant::Instant; -use datafusion::common::{plan_datafusion_err, plan_err}; -use datafusion::config::ConfigFileType; -use datafusion::datasource::listing::ListingTableUrl; -use datafusion::error::{DataFusionError, Result}; -use datafusion::logical_expr::{DdlStatement, LogicalPlan}; -use datafusion::physical_plan::execution_plan::EmissionType; -use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; -use datafusion::sql::parser::{DFParser, Statement}; -use datafusion::sql::sqlparser::dialect::dialect_from_str; - -use datafusion::sql::sqlparser; -use rustyline::error::ReadlineError; -use rustyline::Editor; -use tokio::signal; - -/// run and execute SQL statements and commands, against a context with the given print options -pub async fn exec_from_commands( - ctx: &dyn CliSessionContext, - commands: Vec, - print_options: &PrintOptions, -) -> Result<()> { - for sql in commands { - exec_and_print(ctx, print_options, sql).await?; - } - - Ok(()) -} - -/// run and execute SQL statements and commands from a file, against a context with the given print options -pub async fn exec_from_lines( - ctx: &dyn CliSessionContext, - reader: &mut BufReader, - print_options: &PrintOptions, -) -> Result<()> { - let mut query = "".to_owned(); - - for line in reader.lines() { - match line { - Ok(line) if line.starts_with("#!") => { - continue; - } - Ok(line) if line.starts_with("--") => { - continue; - } - Ok(line) => { - let line = line.trim_end(); - query.push_str(line); - if line.ends_with(';') { - match exec_and_print(ctx, print_options, query).await { - Ok(_) => {} - Err(err) => eprintln!("{err}"), - } - query = "".to_string(); - } else { - query.push('\n'); - } - } - _ => { - break; - } - } - } - - // run the left over query if the last statement doesn't contain ‘;’ - // ignore if it only consists of '\n' - if query.contains(|c| c != '\n') { - exec_and_print(ctx, print_options, query).await?; - } - - Ok(()) -} - -pub async fn exec_from_files( - ctx: &dyn CliSessionContext, - files: Vec, - print_options: &PrintOptions, -) -> Result<()> { - let files = files - .into_iter() - .map(|file_path| File::open(file_path).unwrap()) - .collect::>(); - - for file in files { - let mut reader = BufReader::new(file); - exec_from_lines(ctx, &mut reader, print_options).await?; - } - - Ok(()) -} - -/// run and execute SQL statements and commands against a context with the given print options -pub async fn exec_from_repl( - ctx: &dyn CliSessionContext, - print_options: &mut PrintOptions, -) -> rustyline::Result<()> { - let mut rl = Editor::new()?; - rl.set_helper(Some(CliHelper::new( - &ctx.task_ctx().session_config().options().sql_parser.dialect, - print_options.color, - ))); - rl.load_history(".history").ok(); - - loop { - match rl.readline("> ") { - Ok(line) if line.starts_with('\\') => { - rl.add_history_entry(line.trim_end())?; - let command = line.split_whitespace().collect::>().join(" "); - if let Ok(cmd) = &command[1..].parse::() { - match cmd { - Command::Quit => break, - Command::OutputFormat(subcommand) => { - if let Some(subcommand) = subcommand { 
- if let Ok(command) = subcommand.parse::() { - if let Err(e) = command.execute(print_options).await { - eprintln!("{e}") - } - } else { - eprintln!("'\\{}' is not a valid command", &line[1..]); - } - } else { - println!("Output format is {:?}.", print_options.format); - } - } - _ => { - if let Err(e) = cmd.execute(ctx, print_options).await { - eprintln!("{e}") - } - } - } - } else { - eprintln!("'\\{}' is not a valid command", &line[1..]); - } - } - Ok(line) => { - let lines = split_from_semicolon(line); - for line in lines { - rl.add_history_entry(line.trim_end())?; - tokio::select! { - res = exec_and_print(ctx, print_options, line) => match res { - Ok(_) => {} - Err(err) => eprintln!("{err}"), - }, - _ = signal::ctrl_c() => { - println!("^C"); - continue - }, - } - // dialect might have changed - rl.helper_mut() - .unwrap() - .set_dialect(&ctx.task_ctx().session_config().options().sql_parser.dialect); - } - } - Err(ReadlineError::Interrupted) => { - println!("^C"); - continue; - } - Err(ReadlineError::Eof) => { - println!("\\q"); - break; - } - Err(err) => { - eprintln!("Unknown error happened {:?}", err); - break; - } - } - } - - rl.save_history(".history") -} - -pub(super) async fn exec_and_print( - ctx: &dyn CliSessionContext, - print_options: &PrintOptions, - sql: String, -) -> Result<()> { - let now = Instant::now(); - let sql = unescape_input(&sql)?; - let task_ctx = ctx.task_ctx(); - let dialect = &task_ctx.session_config().options().sql_parser.dialect; - let dialect = dialect_from_str(dialect).ok_or_else(|| { - plan_datafusion_err!( - "Unsupported SQL dialect: {dialect}. Available dialects: \ - Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \ - MsSQL, ClickHouse, BigQuery, Ansi." - ) - })?; - - let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - for statement in statements { - let adjusted = AdjustedPrintOptions::new(print_options.clone()).with_statement(&statement); - - let plan = create_plan(ctx, statement).await?; - let adjusted = adjusted.with_plan(&plan); - - let df = ctx.execute_logical_plan(plan).await?; - let physical_plan = df.create_physical_plan().await?; - - if physical_plan.boundedness().is_unbounded() { - if physical_plan.pipeline_behavior() == EmissionType::Final { - return plan_err!( - "The given query can generate a valid result only once \ - the source finishes, but the source is unbounded" - ); - } - // As the input stream comes, we can generate results. - // However, memory safety is not guaranteed. - let stream = execute_stream(physical_plan, task_ctx.clone())?; - print_options.print_stream(stream, now).await?; - } else { - // Bounded stream; collected results are printed after all input consumed. - let schema = physical_plan.schema(); - let results = collect(physical_plan, task_ctx.clone()).await?; - adjusted.into_inner().print_batches(schema, &results, now)?; - } - } - - Ok(()) -} - -/// Track adjustments to the print options based on the plan / statement being executed -#[derive(Debug)] -struct AdjustedPrintOptions { - inner: PrintOptions, -} - -impl AdjustedPrintOptions { - fn new(inner: PrintOptions) -> Self { - Self { inner } - } - /// Adjust print options based on any statement specific requirements - fn with_statement(mut self, statement: &Statement) -> Self { - if let Statement::Statement(sql_stmt) = statement { - // SHOW / SHOW ALL - if let sqlparser::ast::Statement::ShowVariable { .. 
} = sql_stmt.as_ref() { - self.inner.maxrows = MaxRows::Unlimited - } - } - self - } - - /// Adjust print options based on any plan specific requirements - fn with_plan(mut self, plan: &LogicalPlan) -> Self { - // For plans like `Explain` ignore `MaxRows` option and always display - // all rows - if matches!( - plan, - LogicalPlan::Explain(_) | LogicalPlan::DescribeTable(_) | LogicalPlan::Analyze(_) - ) { - self.inner.maxrows = MaxRows::Unlimited; - } - self - } - - /// Finalize and return the inner `PrintOptions` - fn into_inner(mut self) -> PrintOptions { - if self.inner.format == PrintFormat::Automatic { - self.inner.format = PrintFormat::Table; - } - - self.inner - } -} - -fn config_file_type_from_str(ext: &str) -> Option { - match ext.to_lowercase().as_str() { - "csv" => Some(ConfigFileType::CSV), - "json" => Some(ConfigFileType::JSON), - "parquet" => Some(ConfigFileType::PARQUET), - _ => None, - } -} - -async fn create_plan( - ctx: &dyn CliSessionContext, - statement: Statement, -) -> Result { - let mut plan = ctx.session_state().statement_to_plan(statement).await?; - - // Note that cmd is a mutable reference so that create_external_table function can remove all - // datafusion-cli specific options before passing through to datafusion. Otherwise, datafusion - // will raise Configuration errors. - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { - // To support custom formats, treat error as None - let format = config_file_type_from_str(&cmd.file_type); - register_object_store_and_config_extensions(ctx, &cmd.location, &cmd.options, format) - .await?; - } - - if let LogicalPlan::Copy(copy_to) = &mut plan { - let format = config_file_type_from_str(©_to.file_type.get_ext()); - - register_object_store_and_config_extensions( - ctx, - ©_to.output_url, - ©_to.options, - format, - ) - .await?; - } - Ok(plan) -} - -/// Asynchronously registers an object store and its configuration extensions -/// to the session context. -/// -/// This function dynamically registers a cloud object store based on the given -/// location and options. It first parses the location to determine the scheme -/// and constructs the URL accordingly. Depending on the scheme, it also registers -/// relevant options. The function then alters the default table options with the -/// given custom options. Finally, it retrieves and registers the object store -/// in the session context. -/// -/// # Parameters -/// -/// * `ctx`: A reference to the `SessionContext` for registering the object store. -/// * `location`: A string reference representing the location of the object store. -/// * `options`: A reference to a hash map containing configuration options for -/// the object store. -/// -/// # Returns -/// -/// A `Result<()>` which is an Ok value indicating successful registration, or -/// an error upon failure. -/// -/// # Errors -/// -/// This function can return an error if the location parsing fails, options -/// alteration fails, or if the object store cannot be retrieved and registered -/// successfully. 
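// A hedged usage sketch of the function documented above; the location,
// options, and the surrounding `ctx` binding are illustrative assumptions:
//
//     let location = "s3://bucket/path/file.parquet".to_string();
//     let options = HashMap::from([
//         ("aws.region".to_string(), "us-east-2".to_string()),
//     ]);
//     register_object_store_and_config_extensions(
//         ctx, // &dyn CliSessionContext
//         &location,
//         &options,
//         Some(ConfigFileType::PARQUET),
//     )
//     .await?;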
-pub(crate) async fn register_object_store_and_config_extensions( - ctx: &dyn CliSessionContext, - location: &String, - options: &HashMap, - format: Option, -) -> Result<()> { - // Parse the location URL to extract the scheme and other components - let table_path = ListingTableUrl::parse(location)?; - - // Extract the scheme (e.g., "s3", "gcs") from the parsed URL - let scheme = table_path.scheme(); - - // Obtain a reference to the URL - let url = table_path.as_ref(); - - // Register the options based on the scheme extracted from the location - ctx.register_table_options_extension_from_scheme(scheme); - - // Clone and modify the default table options based on the provided options - let mut table_options = ctx.session_state().default_table_options(); - if let Some(format) = format { - table_options.set_config_format(format); - } - table_options.alter_with_string_hash_map(options)?; - - // Retrieve the appropriate object store based on the scheme, URL, and modified table options - let store = get_object_store(&ctx.session_state(), scheme, url, &table_options).await?; - - // Register the retrieved object store in the session context's runtime environment - ctx.register_object_store(url, store); - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - use datafusion::common::plan_err; - - use datafusion::prelude::SessionContext; - use url::Url; - - async fn create_external_table_test(location: &str, sql: &str) -> Result<()> { - let ctx = SessionContext::new(); - let plan = ctx.state().create_logical_plan(sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &plan { - let format = config_file_type_from_str(&cmd.file_type); - register_object_store_and_config_extensions(&ctx, &cmd.location, &cmd.options, format) - .await?; - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - // Ensure the URL is supported by the object store - ctx.runtime_env() - .object_store(ListingTableUrl::parse(location)?)?; - - Ok(()) - } - - async fn copy_to_table_test(location: &str, sql: &str) -> Result<()> { - let ctx = SessionContext::new(); - // AWS CONFIG register. - - let plan = ctx.state().create_logical_plan(sql).await?; - - if let LogicalPlan::Copy(cmd) = &plan { - let format = config_file_type_from_str(&cmd.file_type.get_ext()); - register_object_store_and_config_extensions( - &ctx, - &cmd.output_url, - &cmd.options, - format, - ) - .await?; - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - // Ensure the URL is supported by the object store - ctx.runtime_env() - .object_store(ListingTableUrl::parse(location)?)?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_http() -> Result<()> { - // Should be OK - let location = "http://example.com/file.parquet"; - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - #[tokio::test] - async fn copy_to_external_object_store_test() -> Result<()> { - let locations = vec![ - "s3://bucket/path/file.parquet", - "oss://bucket/path/file.parquet", - "cos://bucket/path/file.parquet", - "gcs://bucket/path/file.parquet", - ]; - let ctx = SessionContext::new(); - let task_ctx = ctx.task_ctx(); - let dialect = &task_ctx.session_config().options().sql_parser.dialect; - let dialect = dialect_from_str(dialect).ok_or_else(|| { - plan_datafusion_err!( - "Unsupported SQL dialect: {dialect}. 
Available dialects: \ - Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, \ - MsSQL, ClickHouse, BigQuery, Ansi." - ) - })?; - for location in locations { - let sql = format!("copy (values (1,2)) to '{}' STORED AS PARQUET;", location); - let statements = DFParser::parse_sql_with_dialect(&sql, dialect.as_ref())?; - for statement in statements { - //Should not fail - let mut plan = create_plan(&ctx, statement).await?; - if let LogicalPlan::Copy(copy_to) = &mut plan { - assert_eq!(copy_to.output_url, location); - assert_eq!(copy_to.file_type.get_ext(), "parquet".to_string()); - ctx.runtime_env() - .object_store_registry - .get_store(&Url::parse(©_to.output_url).unwrap())?; - } else { - return plan_err!("LogicalPlan is not a CopyTo"); - } - } - } - Ok(()) - } - - #[tokio::test] - async fn copy_to_object_store_table_s3() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let location = "s3://bucket/path/file.parquet"; - - // Missing region, use object_store defaults - let sql = format!("COPY (values (1,2)) TO '{location}' STORED AS PARQUET - OPTIONS ('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}')"); - copy_to_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_s3() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let region = "fake_us-east-2"; - let session_token = "fake_session_token"; - let location = "s3://bucket/path/file.parquet"; - - // Missing region, use object_store defaults - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - // Should be OK - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.region' '{region}', 'aws.session_token' '{session_token}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_oss() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let endpoint = "fake_endpoint"; - let location = "oss://bucket/path/file.parquet"; - - // Should be OK - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_cos() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let endpoint = "fake_endpoint"; - let location = "cos://bucket/path/file.parquet"; - - // Should be OK - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.cos.endpoint' '{endpoint}') LOCATION '{location}'"); - create_external_table_test(location, &sql).await?; - - Ok(()) - } - - #[tokio::test] - async fn create_object_store_table_gcs() -> Result<()> { - let service_account_path = "fake_service_account_path"; - let service_account_key = - "{\"private_key\": 
\"fake_private_key.pem\",\"client_email\":\"fake_client_email\", \"private_key_id\":\"id\"}"; - let application_credentials_path = "fake_application_credentials_path"; - let location = "gcs://bucket/path/file.parquet"; - - // for service_account_path - let sql = format!( - "CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('gcp.service_account_path' '{service_account_path}') LOCATION '{location}'" - ); - let err = create_external_table_test(location, &sql) - .await - .unwrap_err(); - assert!(err.to_string().contains("os error 2")); - - // for service_account_key - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_key' '{service_account_key}') LOCATION '{location}'"); - let err = create_external_table_test(location, &sql) - .await - .unwrap_err() - .to_string(); - assert!(err.contains("No RSA key found in pem file"), "{err}"); - - // for application_credentials_path - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET - OPTIONS('gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); - let err = create_external_table_test(location, &sql) - .await - .unwrap_err(); - assert!(err.to_string().contains("os error 2")); - - Ok(()) - } - - #[tokio::test] - async fn create_external_table_local_file() -> Result<()> { - let location = "path/to/file.parquet"; - - // Ensure that local files are also registered - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); - create_external_table_test(location, &sql).await.unwrap(); - - Ok(()) - } - - #[tokio::test] - async fn create_external_table_format_option() -> Result<()> { - let location = "path/to/file.cvs"; - - // Test with format options - let sql = - format!("CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '{location}' OPTIONS('format.has_header' 'true')"); - create_external_table_test(location, &sql).await.unwrap(); - - Ok(()) - } -} diff --git a/optd-datafusion-cli/src/functions.rs b/optd-datafusion-cli/src/functions.rs deleted file mode 100644 index 6bb3cee..0000000 --- a/optd-datafusion-cli/src/functions.rs +++ /dev/null @@ -1,457 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Functions that are query-able and searchable via the `\h` command -use arrow::array::{Int64Array, StringArray}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow::record_batch::RecordBatch; -use arrow::util::pretty::pretty_format_batches; -use async_trait::async_trait; - -use datafusion::catalog::Session; -use datafusion::common::{plan_err, Column}; -use datafusion::datasource::TableProvider; -use datafusion::error::Result; -use datafusion::logical_expr::Expr; -use datafusion::physical_plan::memory::MemoryExec; -use datafusion::physical_plan::ExecutionPlan; -use datafusion::scalar::ScalarValue; -use datafusion_catalog::TableFunctionImpl; -use parquet::basic::ConvertedType; -use parquet::data_type::{ByteArray, FixedLenByteArray}; -use parquet::file::reader::FileReader; -use parquet::file::serialized_reader::SerializedFileReader; -use parquet::file::statistics::Statistics; -use std::fmt; -use std::fs::File; -use std::str::FromStr; -use std::sync::Arc; - -#[derive(Debug)] -pub enum Function { - Select, - Explain, - Show, - CreateTable, - CreateTableAs, - Insert, - DropTable, -} - -const ALL_FUNCTIONS: [Function; 7] = [ - Function::CreateTable, - Function::CreateTableAs, - Function::DropTable, - Function::Explain, - Function::Insert, - Function::Select, - Function::Show, -]; - -impl Function { - pub fn function_details(&self) -> Result<&str> { - let details = match self { - Function::Select => { - r#" -Command: SELECT -Description: retrieve rows from a table or view -Syntax: -SELECT [ ALL | DISTINCT [ ON ( expression [, ...] ) ] ] - [ * | expression [ [ AS ] output_name ] [, ...] ] - [ FROM from_item [, ...] ] - [ WHERE condition ] - [ GROUP BY [ ALL | DISTINCT ] grouping_element [, ...] ] - [ HAVING condition ] - [ WINDOW window_name AS ( window_definition ) [, ...] ] - [ { UNION | INTERSECT | EXCEPT } [ ALL | DISTINCT ] select ] - [ ORDER BY expression [ ASC | DESC | USING operator ] [ NULLS { FIRST | LAST } ] [, ...] ] - [ LIMIT { count | ALL } ] - [ OFFSET start [ ROW | ROWS ] ] - -where from_item can be one of: - - [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] - [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] - with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - [ LATERAL ] function_name ( [ argument [, ...] ] ) - [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - [ LATERAL ] function_name ( [ argument [, ...] ] ) [ AS ] alias ( column_definition [, ...] ) - [ LATERAL ] function_name ( [ argument [, ...] ] ) AS ( column_definition [, ...] ) - [ LATERAL ] ROWS FROM( function_name ( [ argument [, ...] ] ) [ AS ( column_definition [, ...] ) ] [, ...] ) - [ WITH ORDINALITY ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] - from_item [ NATURAL ] join_type from_item [ ON join_condition | USING ( join_column [, ...] ) [ AS join_using_alias ] ] - -and grouping_element can be one of: - - ( ) - expression - ( expression [, ...] ) - -and with_query is: - - with_query_name [ ( column_name [, ...] 
) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) - -TABLE [ ONLY ] table_name [ * ]"# - } - Function::Explain => { - r#" -Command: EXPLAIN -Description: show the execution plan of a statement -Syntax: -EXPLAIN [ ANALYZE ] statement -"# - } - Function::Show => { - r#" -Command: SHOW -Description: show the value of a run-time parameter -Syntax: -SHOW name -"# - } - Function::CreateTable => { - r#" -Command: CREATE TABLE -Description: define a new table -Syntax: -CREATE [ EXTERNAL ] TABLE table_name ( [ - { column_name data_type } - [, ... ] -] ) -"# - } - Function::CreateTableAs => { - r#" -Command: CREATE TABLE AS -Description: define a new table from the results of a query -Syntax: -CREATE TABLE table_name - [ (column_name [, ...] ) ] - AS query - [ WITH [ NO ] DATA ] -"# - } - Function::Insert => { - r#" -Command: INSERT -Description: create new rows in a table -Syntax: -INSERT INTO table_name [ ( column_name [, ...] ) ] - { VALUES ( { expression } [, ...] ) [, ...] } -"# - } - Function::DropTable => { - r#" -Command: DROP TABLE -Description: remove a table -Syntax: -DROP TABLE [ IF EXISTS ] name [, ...] -"# - } - }; - Ok(details) - } -} - -impl FromStr for Function { - type Err = (); - - fn from_str(s: &str) -> Result { - Ok(match s.trim().to_uppercase().as_str() { - "SELECT" => Self::Select, - "EXPLAIN" => Self::Explain, - "SHOW" => Self::Show, - "CREATE TABLE" => Self::CreateTable, - "CREATE TABLE AS" => Self::CreateTableAs, - "INSERT" => Self::Insert, - "DROP TABLE" => Self::DropTable, - _ => return Err(()), - }) - } -} - -impl fmt::Display for Function { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - Function::Select => write!(f, "SELECT"), - Function::Explain => write!(f, "EXPLAIN"), - Function::Show => write!(f, "SHOW"), - Function::CreateTable => write!(f, "CREATE TABLE"), - Function::CreateTableAs => write!(f, "CREATE TABLE AS"), - Function::Insert => write!(f, "INSERT"), - Function::DropTable => write!(f, "DROP TABLE"), - } - } -} - -pub fn display_all_functions() -> Result<()> { - println!("Available help:"); - let array = StringArray::from( - ALL_FUNCTIONS - .iter() - .map(|f| format!("{}", f)) - .collect::>(), - ); - let schema = Schema::new(vec![Field::new("Function", DataType::Utf8, false)]); - let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)])?; - println!("{}", pretty_format_batches(&[batch]).unwrap()); - Ok(()) -} - -/// PARQUET_META table function -#[derive(Debug)] -struct ParquetMetadataTable { - schema: SchemaRef, - batch: RecordBatch, -} - -#[async_trait] -impl TableProvider for ParquetMetadataTable { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn schema(&self) -> arrow::datatypes::SchemaRef { - self.schema.clone() - } - - fn table_type(&self) -> datafusion::logical_expr::TableType { - datafusion::logical_expr::TableType::Base - } - - async fn scan( - &self, - _state: &dyn Session, - projection: Option<&Vec>, - _filters: &[Expr], - _limit: Option, - ) -> Result> { - Ok(Arc::new(MemoryExec::try_new( - &[vec![self.batch.clone()]], - TableProvider::schema(self), - projection.cloned(), - )?)) - } -} - -fn convert_parquet_statistics( - value: &Statistics, - converted_type: ConvertedType, -) -> (Option, Option) { - match (value, converted_type) { - (Statistics::Boolean(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Int32(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), 
- ), - (Statistics::Int64(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Int96(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Float(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::Double(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::ByteArray(val), ConvertedType::UTF8) => ( - byte_array_to_string(val.min_opt()), - byte_array_to_string(val.max_opt()), - ), - (Statistics::ByteArray(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => ( - fixed_len_byte_array_to_string(val.min_opt()), - fixed_len_byte_array_to_string(val.max_opt()), - ), - (Statistics::FixedLenByteArray(val), _) => ( - val.min_opt().map(|v| v.to_string()), - val.max_opt().map(|v| v.to_string()), - ), - } -} - -/// Convert to a string if it has utf8 encoding, otherwise print bytes directly -fn byte_array_to_string(val: Option<&ByteArray>) -> Option { - val.map(|v| { - v.as_utf8() - .map(|s| s.to_string()) - .unwrap_or_else(|_e| v.to_string()) - }) -} - -/// Convert to a string if it has utf8 encoding, otherwise print bytes directly -fn fixed_len_byte_array_to_string(val: Option<&FixedLenByteArray>) -> Option { - val.map(|v| { - v.as_utf8() - .map(|s| s.to_string()) - .unwrap_or_else(|_e| v.to_string()) - }) -} - -#[derive(Debug)] -pub struct ParquetMetadataFunc {} - -impl TableFunctionImpl for ParquetMetadataFunc { - fn call(&self, exprs: &[Expr]) -> Result> { - let filename = match exprs.first() { - Some(Expr::Literal(ScalarValue::Utf8(Some(s)))) => s, // single quote: parquet_metadata('x.parquet') - Some(Expr::Column(Column { name, .. 
})) => name, // double quote: parquet_metadata("x.parquet") - _ => { - return plan_err!("parquet_metadata requires string argument as its input"); - } - }; - - let file = File::open(filename.clone())?; - let reader = SerializedFileReader::new(file)?; - let metadata = reader.metadata(); - - let schema = Arc::new(Schema::new(vec![ - Field::new("filename", DataType::Utf8, true), - Field::new("row_group_id", DataType::Int64, true), - Field::new("row_group_num_rows", DataType::Int64, true), - Field::new("row_group_num_columns", DataType::Int64, true), - Field::new("row_group_bytes", DataType::Int64, true), - Field::new("column_id", DataType::Int64, true), - Field::new("file_offset", DataType::Int64, true), - Field::new("num_values", DataType::Int64, true), - Field::new("path_in_schema", DataType::Utf8, true), - Field::new("type", DataType::Utf8, true), - Field::new("stats_min", DataType::Utf8, true), - Field::new("stats_max", DataType::Utf8, true), - Field::new("stats_null_count", DataType::Int64, true), - Field::new("stats_distinct_count", DataType::Int64, true), - Field::new("stats_min_value", DataType::Utf8, true), - Field::new("stats_max_value", DataType::Utf8, true), - Field::new("compression", DataType::Utf8, true), - Field::new("encodings", DataType::Utf8, true), - Field::new("index_page_offset", DataType::Int64, true), - Field::new("dictionary_page_offset", DataType::Int64, true), - Field::new("data_page_offset", DataType::Int64, true), - Field::new("total_compressed_size", DataType::Int64, true), - Field::new("total_uncompressed_size", DataType::Int64, true), - ])); - - // construct record batch from metadata - let mut filename_arr = vec![]; - let mut row_group_id_arr = vec![]; - let mut row_group_num_rows_arr = vec![]; - let mut row_group_num_columns_arr = vec![]; - let mut row_group_bytes_arr = vec![]; - let mut column_id_arr = vec![]; - let mut file_offset_arr = vec![]; - let mut num_values_arr = vec![]; - let mut path_in_schema_arr = vec![]; - let mut type_arr = vec![]; - let mut stats_min_arr = vec![]; - let mut stats_max_arr = vec![]; - let mut stats_null_count_arr = vec![]; - let mut stats_distinct_count_arr = vec![]; - let mut stats_min_value_arr = vec![]; - let mut stats_max_value_arr = vec![]; - let mut compression_arr = vec![]; - let mut encodings_arr = vec![]; - let mut index_page_offset_arr = vec![]; - let mut dictionary_page_offset_arr = vec![]; - let mut data_page_offset_arr = vec![]; - let mut total_compressed_size_arr = vec![]; - let mut total_uncompressed_size_arr = vec![]; - for (rg_idx, row_group) in metadata.row_groups().iter().enumerate() { - for (col_idx, column) in row_group.columns().iter().enumerate() { - filename_arr.push(filename.clone()); - row_group_id_arr.push(rg_idx as i64); - row_group_num_rows_arr.push(row_group.num_rows()); - row_group_num_columns_arr.push(row_group.num_columns() as i64); - row_group_bytes_arr.push(row_group.total_byte_size()); - column_id_arr.push(col_idx as i64); - file_offset_arr.push(column.file_offset()); - num_values_arr.push(column.num_values()); - path_in_schema_arr.push(column.column_path().to_string()); - type_arr.push(column.column_type().to_string()); - let converted_type = column.column_descr().converted_type(); - - if let Some(s) = column.statistics() { - let (min_val, max_val) = convert_parquet_statistics(s, converted_type); - stats_min_arr.push(min_val.clone()); - stats_max_arr.push(max_val.clone()); - stats_null_count_arr.push(s.null_count_opt().map(|c| c as i64)); - 
stats_distinct_count_arr.push(s.distinct_count_opt().map(|c| c as i64)); - stats_min_value_arr.push(min_val); - stats_max_value_arr.push(max_val); - } else { - stats_min_arr.push(None); - stats_max_arr.push(None); - stats_null_count_arr.push(None); - stats_distinct_count_arr.push(None); - stats_min_value_arr.push(None); - stats_max_value_arr.push(None); - }; - compression_arr.push(format!("{:?}", column.compression())); - encodings_arr.push(format!("{:?}", column.encodings())); - index_page_offset_arr.push(column.index_page_offset()); - dictionary_page_offset_arr.push(column.dictionary_page_offset()); - data_page_offset_arr.push(column.data_page_offset()); - total_compressed_size_arr.push(column.compressed_size()); - total_uncompressed_size_arr.push(column.uncompressed_size()); - } - } - - let rb = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(StringArray::from(filename_arr)), - Arc::new(Int64Array::from(row_group_id_arr)), - Arc::new(Int64Array::from(row_group_num_rows_arr)), - Arc::new(Int64Array::from(row_group_num_columns_arr)), - Arc::new(Int64Array::from(row_group_bytes_arr)), - Arc::new(Int64Array::from(column_id_arr)), - Arc::new(Int64Array::from(file_offset_arr)), - Arc::new(Int64Array::from(num_values_arr)), - Arc::new(StringArray::from(path_in_schema_arr)), - Arc::new(StringArray::from(type_arr)), - Arc::new(StringArray::from(stats_min_arr)), - Arc::new(StringArray::from(stats_max_arr)), - Arc::new(Int64Array::from(stats_null_count_arr)), - Arc::new(Int64Array::from(stats_distinct_count_arr)), - Arc::new(StringArray::from(stats_min_value_arr)), - Arc::new(StringArray::from(stats_max_value_arr)), - Arc::new(StringArray::from(compression_arr)), - Arc::new(StringArray::from(encodings_arr)), - Arc::new(Int64Array::from(index_page_offset_arr)), - Arc::new(Int64Array::from(dictionary_page_offset_arr)), - Arc::new(Int64Array::from(data_page_offset_arr)), - Arc::new(Int64Array::from(total_compressed_size_arr)), - Arc::new(Int64Array::from(total_uncompressed_size_arr)), - ], - )?; - - let parquet_metadata = ParquetMetadataTable { schema, batch: rb }; - Ok(Arc::new(parquet_metadata)) - } -} diff --git a/optd-datafusion-cli/src/helper.rs b/optd-datafusion-cli/src/helper.rs deleted file mode 100644 index a5542ee..0000000 --- a/optd-datafusion-cli/src/helper.rs +++ /dev/null @@ -1,378 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Helper that helps with interactive editing, including multi-line parsing and validation, -//! and auto-completion for file name during creating external table. 
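// A minimal wiring sketch for this helper, mirroring how the exec module
// above installs it into rustyline (the dialect string and color flag are
// assumed values rather than part of this file):
//
//     let mut rl = Editor::new()?;
//     rl.set_helper(Some(CliHelper::new("generic", /* color: */ false)));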
- -use std::borrow::Cow; - -use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; - -use datafusion::common::sql_datafusion_err; -use datafusion::error::DataFusionError; -use datafusion::sql::parser::{DFParser, Statement}; -use datafusion::sql::sqlparser::dialect::dialect_from_str; -use datafusion::sql::sqlparser::parser::ParserError; - -use rustyline::completion::{Completer, FilenameCompleter, Pair}; -use rustyline::error::ReadlineError; -use rustyline::highlight::{CmdKind, Highlighter}; -use rustyline::hint::Hinter; -use rustyline::validate::{ValidationContext, ValidationResult, Validator}; -use rustyline::{Context, Helper, Result}; - -pub struct CliHelper { - completer: FilenameCompleter, - dialect: String, - highlighter: Box, -} - -impl CliHelper { - pub fn new(dialect: &str, color: bool) -> Self { - let highlighter: Box = if !color { - Box::new(NoSyntaxHighlighter {}) - } else { - Box::new(SyntaxHighlighter::new(dialect)) - }; - Self { - completer: FilenameCompleter::new(), - dialect: dialect.into(), - highlighter, - } - } - - pub fn set_dialect(&mut self, dialect: &str) { - if dialect != self.dialect { - self.dialect = dialect.to_string(); - } - } - - fn validate_input(&self, input: &str) -> Result { - if let Some(sql) = input.strip_suffix(';') { - let sql = match unescape_input(sql) { - Ok(sql) => sql, - Err(err) => { - return Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid statement: {err}", - )))) - } - }; - - let dialect = match dialect_from_str(&self.dialect) { - Some(dialect) => dialect, - None => { - return Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid dialect: {}", - self.dialect - )))) - } - }; - let lines = split_from_semicolon(sql); - for line in lines { - match DFParser::parse_sql_with_dialect(&line, dialect.as_ref()) { - Ok(statements) if statements.is_empty() => { - return Ok(ValidationResult::Invalid(Some( - " 🤔 You entered an empty statement".to_string(), - ))); - } - Ok(_statements) => {} - Err(err) => { - return Ok(ValidationResult::Invalid(Some(format!( - " 🤔 Invalid statement: {err}", - )))); - } - } - } - Ok(ValidationResult::Valid(None)) - } else if input.starts_with('\\') { - // command - Ok(ValidationResult::Valid(None)) - } else { - Ok(ValidationResult::Incomplete) - } - } -} - -impl Default for CliHelper { - fn default() -> Self { - Self::new("generic", false) - } -} - -impl Highlighter for CliHelper { - fn highlight<'l>(&self, line: &'l str, pos: usize) -> Cow<'l, str> { - self.highlighter.highlight(line, pos) - } - - fn highlight_char(&self, line: &str, pos: usize, kind: CmdKind) -> bool { - self.highlighter.highlight_char(line, pos, kind) - } -} - -impl Hinter for CliHelper { - type Hint = String; -} - -/// returns true if the current position is after the open quote for -/// creating an external table. 
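// Hedged illustration of the predicate below: with the cursor just past the
// opening quote of a LOCATION clause, filename completion should activate.
// The input string is an assumed example, not a tested case from this file:
//
//     let line = "CREATE EXTERNAL TABLE t STORED AS CSV LOCATION '";
//     assert!(is_open_quote_for_location(line, line.len()));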
-fn is_open_quote_for_location(line: &str, pos: usize) -> bool { - let mut sql = line[..pos].to_string(); - sql.push('\''); - if let Ok(stmts) = DFParser::parse_sql(&sql) { - if let Some(Statement::CreateExternalTable(_)) = stmts.back() { - return true; - } - } - false -} - -impl Completer for CliHelper { - type Candidate = Pair; - - fn complete( - &self, - line: &str, - pos: usize, - ctx: &Context<'_>, - ) -> std::result::Result<(usize, Vec), ReadlineError> { - if is_open_quote_for_location(line, pos) { - self.completer.complete(line, pos, ctx) - } else { - Ok((0, Vec::with_capacity(0))) - } - } -} - -impl Validator for CliHelper { - fn validate(&self, ctx: &mut ValidationContext<'_>) -> Result { - let input = ctx.input().trim_end(); - self.validate_input(input) - } -} - -impl Helper for CliHelper {} - -/// Unescape input string from readline. -/// -/// The data read from stdio will be escaped, so we need to unescape the input before executing the input -pub fn unescape_input(input: &str) -> datafusion::error::Result { - let mut chars = input.chars(); - - let mut result = String::with_capacity(input.len()); - while let Some(char) = chars.next() { - if char == '\\' { - if let Some(next_char) = chars.next() { - // https://static.rust-lang.org/doc/master/reference.html#literals - result.push(match next_char { - '0' => '\0', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - _ => { - return Err(sql_datafusion_err!(ParserError::TokenizerError(format!( - "unsupported escape char: '\\{}'", - next_char - )))) - } - }); - } - } else { - result.push(char); - } - } - - Ok(result) -} - -/// Splits a string which consists of multiple queries. -pub(crate) fn split_from_semicolon(sql: String) -> Vec { - let mut commands = Vec::new(); - let mut current_command = String::new(); - let mut in_single_quote = false; - let mut in_double_quote = false; - - for c in sql.chars() { - if c == '\'' && !in_double_quote { - in_single_quote = !in_single_quote; - } else if c == '"' && !in_single_quote { - in_double_quote = !in_double_quote; - } - - if c == ';' && !in_single_quote && !in_double_quote { - if !current_command.trim().is_empty() { - commands.push(format!("{};", current_command.trim())); - current_command.clear(); - } - } else { - current_command.push(c); - } - } - - if !current_command.trim().is_empty() { - commands.push(format!("{};", current_command.trim())); - } - - commands -} - -#[cfg(test)] -mod tests { - use std::io::{BufRead, Cursor}; - - use super::*; - - fn readline_direct( - mut reader: impl BufRead, - validator: &CliHelper, - ) -> Result { - let mut input = String::new(); - - if reader.read_line(&mut input)? 
== 0 { - return Err(ReadlineError::Eof); - } - - validator.validate_input(&input) - } - - #[test] - fn unescape_readline_input() -> Result<()> { - let validator = CliHelper::default(); - - // should be valid - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',');" - .as_bytes(), - ), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\0');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\n');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\r');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\t');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\\');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',,');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Valid(None))); - - // should be invalid - let result = readline_direct( - Cursor::new( - r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\u{07}');" - .as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Invalid(Some(_)))); - - Ok(()) - } - - #[test] - fn sql_dialect() -> Result<()> { - let mut validator = CliHelper::default(); - - // should be invalid in generic dialect - let result = readline_direct(Cursor::new(r"select 1 # 2;".as_bytes()), &validator)?; - assert!( - matches!(result, ValidationResult::Invalid(Some(e)) if e.contains("Invalid statement")) - ); - - // valid in postgresql dialect - validator.set_dialect("postgresql"); - let result = readline_direct(Cursor::new(r"select 1 # 2;".as_bytes()), &validator)?; - assert!(matches!(result, ValidationResult::Valid(None))); - - Ok(()) - } - - #[test] - fn test_split_from_semicolon() { - let sql = "SELECT 1; SELECT 2;"; - let expected = vec!["SELECT 1;", "SELECT 2;"]; - assert_eq!(split_from_semicolon(sql.to_string()), expected); - - let sql = r#"SELECT ";";"#; - let expected = vec![r#"SELECT ";";"#]; - assert_eq!(split_from_semicolon(sql.to_string()), expected); - - let sql = "SELECT ';';"; - let expected = vec!["SELECT ';';"]; - assert_eq!(split_from_semicolon(sql.to_string()), expected); - - let sql = r#"SELECT 1; SELECT 'value;value'; SELECT 1 as "text;text";"#; - let expected = vec![ - "SELECT 1;", - "SELECT 'value;value';", - r#"SELECT 1 as "text;text";"#, - ]; - assert_eq!(split_from_semicolon(sql.to_string()), expected); - - let sql = ""; - let expected: Vec = Vec::new(); - 
assert_eq!(split_from_semicolon(sql.to_string()), expected); - - let sql = "SELECT 1"; - let expected = vec!["SELECT 1;"]; - assert_eq!(split_from_semicolon(sql.to_string()), expected); - - let sql = "SELECT 1; "; - let expected = vec!["SELECT 1;"]; - assert_eq!(split_from_semicolon(sql.to_string()), expected); - } -} diff --git a/optd-datafusion-cli/src/highlighter.rs b/optd-datafusion-cli/src/highlighter.rs deleted file mode 100644 index f3e13ed..0000000 --- a/optd-datafusion-cli/src/highlighter.rs +++ /dev/null @@ -1,127 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! The syntax highlighter. - -use std::{ - borrow::Cow::{self, Borrowed}, - fmt::Display, -}; - -use datafusion::sql::sqlparser::{ - dialect::{dialect_from_str, Dialect, GenericDialect}, - keywords::Keyword, - tokenizer::{Token, Tokenizer}, -}; -use rustyline::highlight::{CmdKind, Highlighter}; - -/// The syntax highlighter. -#[derive(Debug)] -pub struct SyntaxHighlighter { - dialect: Box, -} - -impl SyntaxHighlighter { - pub fn new(dialect: &str) -> Self { - let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {})); - Self { dialect } - } -} - -pub struct NoSyntaxHighlighter {} - -impl Highlighter for NoSyntaxHighlighter {} - -impl Highlighter for SyntaxHighlighter { - fn highlight<'l>(&self, line: &'l str, _: usize) -> Cow<'l, str> { - let mut out_line = String::new(); - - // `with_unescape(false)` since we want to rebuild the original string. - let mut tokenizer = Tokenizer::new(self.dialect.as_ref(), line).with_unescape(false); - let tokens = tokenizer.tokenize(); - match tokens { - Ok(tokens) => { - for token in tokens.iter() { - match token { - Token::Word(w) if w.keyword != Keyword::NoKeyword => { - out_line.push_str(&Color::red(token)); - } - Token::SingleQuotedString(_) => { - out_line.push_str(&Color::green(token)); - } - other => out_line.push_str(&format!("{other}")), - } - } - out_line.into() - } - Err(_) => Borrowed(line), - } - } - - fn highlight_char(&self, line: &str, _pos: usize, _cmd: CmdKind) -> bool { - !line.is_empty() - } -} - -/// Convenient utility to return strings with [ANSI color](https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124). 
-struct Color {} - -impl Color { - fn green(s: impl Display) -> String { - format!("\x1b[92m{s}\x1b[0m") - } - - fn red(s: impl Display) -> String { - format!("\x1b[91m{s}\x1b[0m") - } -} - -#[cfg(test)] -mod tests { - use super::SyntaxHighlighter; - use rustyline::highlight::Highlighter; - - #[test] - fn highlighter_valid() { - let s = "SElect col_a from tab_1;"; - let highlighter = SyntaxHighlighter::new("generic"); - let out = highlighter.highlight(s, s.len()); - assert_eq!( - "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1;", - out - ); - } - - #[test] - fn highlighter_valid_with_new_line() { - let s = "SElect col_a from tab_1\n WHERE col_b = 'なにか';"; - let highlighter = SyntaxHighlighter::new("generic"); - let out = highlighter.highlight(s, s.len()); - assert_eq!( - "\u{1b}[91mSElect\u{1b}[0m col_a \u{1b}[91mfrom\u{1b}[0m tab_1\n \u{1b}[91mWHERE\u{1b}[0m col_b = \u{1b}[92m'なにか'\u{1b}[0m;", - out - ); - } - - #[test] - fn highlighter_invalid() { - let s = "SElect col_a from tab_1 WHERE col_b = ';"; - let highlighter = SyntaxHighlighter::new("generic"); - let out = highlighter.highlight(s, s.len()); - assert_eq!("SElect col_a from tab_1 WHERE col_b = ';", out); - } -} diff --git a/optd-datafusion-cli/src/lib.rs b/optd-datafusion-cli/src/lib.rs deleted file mode 100644 index fbfc924..0000000 --- a/optd-datafusion-cli/src/lib.rs +++ /dev/null @@ -1,31 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#![doc = include_str!("../README.md")] -pub const DATAFUSION_CLI_VERSION: &str = env!("CARGO_PKG_VERSION"); - -pub mod catalog; -pub mod cli_context; -pub mod command; -pub mod exec; -pub mod functions; -pub mod helper; -pub mod highlighter; -pub mod object_storage; -pub mod pool_type; -pub mod print_format; -pub mod print_options; diff --git a/optd-datafusion-cli/src/main.rs b/optd-datafusion-cli/src/main.rs deleted file mode 100644 index 8da3dfc..0000000 --- a/optd-datafusion-cli/src/main.rs +++ /dev/null @@ -1,446 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
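The highlighter deleted above colors tokens with bare ANSI SGR escapes: code 91 (bright red) for keywords, 92 (bright green) for single-quoted strings, and `\x1b[0m` to reset, exactly as exercised by the `highlighter_valid` tests. A minimal standalone sketch of that scheme (the helper names and `main` here are illustrative, not part of the deleted file):

```rust
// A sketch of the deleted highlighter's coloring scheme; SGR 91 is bright
// red (keywords), SGR 92 is bright green (string literals), 0 resets.
fn red(s: &str) -> String {
    format!("\x1b[91m{s}\x1b[0m")
}

fn green(s: &str) -> String {
    format!("\x1b[92m{s}\x1b[0m")
}

fn main() {
    // Mirrors the shape of the `highlighter_valid_with_new_line` expectation:
    // keywords in red, the quoted literal in green, everything else plain.
    println!("{} col_a {} tab_1;", red("SElect"), green("'なにか'"));
}
```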
- -use std::collections::HashMap; -use std::env; -use std::path::Path; -use std::process::ExitCode; -use std::sync::{Arc, LazyLock}; - -use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::context::SessionConfig; -use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, MemoryPool}; -use datafusion::execution::runtime_env::RuntimeEnvBuilder; -use datafusion::prelude::SessionContext; -use optd_datafusion_cli::catalog::DynamicObjectStoreCatalog; -use optd_datafusion_cli::functions::ParquetMetadataFunc; -use optd_datafusion_cli::{ - exec, - pool_type::PoolType, - print_format::PrintFormat, - print_options::{MaxRows, PrintOptions}, - DATAFUSION_CLI_VERSION, -}; - -use clap::Parser; -use mimalloc::MiMalloc; - -#[global_allocator] -static GLOBAL: MiMalloc = MiMalloc; - -#[derive(Debug, Parser, PartialEq)] -#[clap(author, version, about, long_about= None)] -struct Args { - #[clap( - short = 'p', - long, - help = "Path to your data, default to current directory", - value_parser(parse_valid_data_dir) - )] - data_path: Option, - - #[clap( - short = 'b', - long, - help = "The batch size of each query, or use DataFusion default", - value_parser(parse_batch_size) - )] - batch_size: Option, - - #[clap( - short = 'c', - long, - num_args = 0.., - help = "Execute the given command string(s), then exit. Commands are expected to be non empty.", - value_parser(parse_command) - )] - command: Vec, - - #[clap( - short = 'm', - long, - help = "The memory pool limitation (e.g. '10g'), default to None (no limit)", - value_parser(extract_memory_pool_size) - )] - memory_limit: Option, - - #[clap( - short, - long, - num_args = 0.., - help = "Execute commands from file(s), then exit", - value_parser(parse_valid_file) - )] - file: Vec, - - #[clap( - short = 'r', - long, - num_args = 0.., - help = "Run the provided files on startup instead of ~/.datafusionrc", - value_parser(parse_valid_file), - conflicts_with = "file" - )] - rc: Option>, - - #[clap(long, value_enum, default_value_t = PrintFormat::Automatic)] - format: PrintFormat, - - #[clap( - short, - long, - help = "Reduce printing other than the results and work quietly" - )] - quiet: bool, - - #[clap( - long, - help = "Specify the memory pool type 'greedy' or 'fair'", - default_value_t = PoolType::Greedy - )] - mem_pool_type: PoolType, - - #[clap( - long, - help = "The max number of rows to display for 'Table' format\n[possible values: numbers(0/10/...), inf(no limit)]", - default_value = "40" - )] - maxrows: MaxRows, - - #[clap(long, help = "Enables console syntax highlighting")] - color: bool, - - #[clap(long, help = "Disable the optd optimizer")] - disable_optd: bool, -} - -#[tokio::main] -/// Calls [`main_inner`], then handles printing errors and returning the correct exit code -pub async fn main() -> ExitCode { - if let Err(e) = main_inner().await { - println!("Error: {e}"); - return ExitCode::FAILURE; - } - - ExitCode::SUCCESS -} - -/// Main CLI entrypoint -async fn main_inner() -> Result<()> { - env_logger::init(); - let args = Args::parse(); - - if !args.quiet { - println!("DataFusion CLI v{}", DATAFUSION_CLI_VERSION); - } - - if let Some(ref path) = args.data_path { - let p = Path::new(path); - env::set_current_dir(p).unwrap(); - }; - - let mut session_config = SessionConfig::from_env()?.with_information_schema(true); - - if let Some(batch_size) = args.batch_size { - session_config = session_config.with_batch_size(batch_size); - }; - - let mut rt_builder = RuntimeEnvBuilder::new(); - // set memory pool size - if 
let Some(memory_limit) = args.memory_limit { - // set memory pool type - let pool: Arc = match args.mem_pool_type { - PoolType::Fair => Arc::new(FairSpillPool::new(memory_limit)), - PoolType::Greedy => Arc::new(GreedyMemoryPool::new(memory_limit)), - }; - rt_builder = rt_builder.with_memory_pool(pool) - } - - let runtime_env = rt_builder.build_arc()?; - - // enable dynamic file query - let ctx = if args.disable_optd { - SessionContext::new_with_config_rt(session_config, runtime_env).enable_url_table() - } else { - optd_datafusion::create_df_context(Some(session_config), Some(runtime_env), None) - .await - .map_err(|e| DataFusionError::External(e.into()))? - }; - - ctx.refresh_catalogs().await?; - // install dynamic catalog provider that can register required object stores - ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( - ctx.state().catalog_list().clone(), - ctx.state_weak_ref(), - ))); - // register `parquet_metadata` table function to get metadata from parquet files - ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - - let mut print_options = PrintOptions { - format: args.format, - quiet: args.quiet, - maxrows: args.maxrows, - color: args.color, - }; - - let commands = args.command; - let files = args.file; - let rc = match args.rc { - Some(file) => file, - None => { - let mut files = Vec::new(); - let home = dirs::home_dir(); - if let Some(p) = home { - let home_rc = p.join(".datafusionrc"); - if home_rc.exists() { - files.push(home_rc.into_os_string().into_string().unwrap()); - } - } - files - } - }; - - if commands.is_empty() && files.is_empty() { - if !rc.is_empty() { - exec::exec_from_files(&ctx, rc, &print_options).await?; - } - // TODO maybe we can have thiserror for cli but for now let's keep it simple - return exec::exec_from_repl(&ctx, &mut print_options) - .await - .map_err(|e| DataFusionError::External(Box::new(e))); - } - - if !files.is_empty() { - exec::exec_from_files(&ctx, files, &print_options).await?; - } - - if !commands.is_empty() { - exec::exec_from_commands(&ctx, commands, &print_options).await?; - } - - Ok(()) -} - -fn parse_valid_file(dir: &str) -> Result { - if Path::new(dir).is_file() { - Ok(dir.to_string()) - } else { - Err(format!("Invalid file '{}'", dir)) - } -} - -fn parse_valid_data_dir(dir: &str) -> Result { - if Path::new(dir).is_dir() { - Ok(dir.to_string()) - } else { - Err(format!("Invalid data directory '{}'", dir)) - } -} - -fn parse_batch_size(size: &str) -> Result { - match size.parse::() { - Ok(size) if size > 0 => Ok(size), - _ => Err(format!("Invalid batch size '{}'", size)), - } -} - -fn parse_command(command: &str) -> Result { - if !command.is_empty() { - Ok(command.to_string()) - } else { - Err("-c flag expects only non empty commands".to_string()) - } -} - -#[derive(Debug, Clone, Copy)] -enum ByteUnit { - Byte, - KiB, - MiB, - GiB, - TiB, -} - -impl ByteUnit { - fn multiplier(&self) -> u64 { - match self { - ByteUnit::Byte => 1, - ByteUnit::KiB => 1 << 10, - ByteUnit::MiB => 1 << 20, - ByteUnit::GiB => 1 << 30, - ByteUnit::TiB => 1 << 40, - } - } -} - -fn extract_memory_pool_size(size: &str) -> Result { - static BYTE_SUFFIXES: LazyLock> = LazyLock::new(|| { - let mut m = HashMap::new(); - m.insert("b", ByteUnit::Byte); - m.insert("k", ByteUnit::KiB); - m.insert("kb", ByteUnit::KiB); - m.insert("m", ByteUnit::MiB); - m.insert("mb", ByteUnit::MiB); - m.insert("g", ByteUnit::GiB); - m.insert("gb", ByteUnit::GiB); - m.insert("t", ByteUnit::TiB); - m.insert("tb", ByteUnit::TiB); - m - }); - - static 
SUFFIX_REGEX: LazyLock = - LazyLock::new(|| regex::Regex::new(r"^(-?[0-9]+)([a-z]+)?$").unwrap()); - - let lower = size.to_lowercase(); - if let Some(caps) = SUFFIX_REGEX.captures(&lower) { - let num_str = caps.get(1).unwrap().as_str(); - let num = num_str - .parse::() - .map_err(|_| format!("Invalid numeric value in memory pool size '{}'", size))?; - - let suffix = caps.get(2).map(|m| m.as_str()).unwrap_or("b"); - let unit = &BYTE_SUFFIXES - .get(suffix) - .ok_or_else(|| format!("Invalid memory pool size '{}'", size))?; - let memory_pool_size = usize::try_from(unit.multiplier()) - .ok() - .and_then(|multiplier| num.checked_mul(multiplier)) - .ok_or_else(|| format!("Memory pool size '{}' is too large", size))?; - - Ok(memory_pool_size) - } else { - Err(format!("Invalid memory pool size '{}'", size)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use datafusion::{assert_batches_eq, prelude::SessionContext}; - - fn assert_conversion(input: &str, expected: Result) { - let result = extract_memory_pool_size(input); - match expected { - Ok(v) => assert_eq!(result.unwrap(), v), - Err(e) => assert_eq!(result.unwrap_err(), e), - } - } - - #[test] - fn memory_pool_size() -> Result<(), String> { - // Test basic sizes without suffix, assumed to be bytes - assert_conversion("5", Ok(5)); - assert_conversion("100", Ok(100)); - - // Test various units - assert_conversion("5b", Ok(5)); - assert_conversion("4k", Ok(4 * 1024)); - assert_conversion("4kb", Ok(4 * 1024)); - assert_conversion("20m", Ok(20 * 1024 * 1024)); - assert_conversion("20mb", Ok(20 * 1024 * 1024)); - assert_conversion("2g", Ok(2 * 1024 * 1024 * 1024)); - assert_conversion("2gb", Ok(2 * 1024 * 1024 * 1024)); - assert_conversion("3t", Ok(3 * 1024 * 1024 * 1024 * 1024)); - assert_conversion("4tb", Ok(4 * 1024 * 1024 * 1024 * 1024)); - - // Test case insensitivity - assert_conversion("4K", Ok(4 * 1024)); - assert_conversion("4KB", Ok(4 * 1024)); - assert_conversion("20M", Ok(20 * 1024 * 1024)); - assert_conversion("20MB", Ok(20 * 1024 * 1024)); - assert_conversion("2G", Ok(2 * 1024 * 1024 * 1024)); - assert_conversion("2GB", Ok(2 * 1024 * 1024 * 1024)); - assert_conversion("2T", Ok(2 * 1024 * 1024 * 1024 * 1024)); - - // Test invalid input - assert_conversion( - "invalid", - Err("Invalid memory pool size 'invalid'".to_string()), - ); - assert_conversion("4kbx", Err("Invalid memory pool size '4kbx'".to_string())); - assert_conversion( - "-20mb", - Err("Invalid numeric value in memory pool size '-20mb'".to_string()), - ); - assert_conversion( - "-100", - Err("Invalid numeric value in memory pool size '-100'".to_string()), - ); - assert_conversion( - "12k12k", - Err("Invalid memory pool size '12k12k'".to_string()), - ); - - Ok(()) - } - - // #[tokio::test] - #[allow(dead_code)] - async fn test_parquet_metadata_works() -> Result<(), DataFusionError> { - let ctx = SessionContext::new(); - ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - - // input with single quote - let sql = - "SELECT * FROM parquet_metadata('../datafusion/core/tests/data/fixed_size_list_array.parquet')"; - let df = ctx.sql(sql).await?; - let rbs = df.collect().await?; - - let excepted = [ - 
"+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", - "| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |", - "+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", - "| ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | \"f0.list.item\" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 |", - "+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", - ]; - assert_batches_eq!(excepted, &rbs); - - // input with double quote - let sql = - "SELECT * FROM parquet_metadata(\"../datafusion/core/tests/data/fixed_size_list_array.parquet\")"; - let df = ctx.sql(sql).await?; - let rbs = df.collect().await?; - assert_batches_eq!(excepted, &rbs); - - Ok(()) - } - - // #[tokio::test] - #[allow(dead_code)] - async fn test_parquet_metadata_works_with_strings() -> Result<(), DataFusionError> { - let ctx = SessionContext::new(); - ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); - - // input with string columns - let sql = - "SELECT * FROM parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')"; - let df = ctx.sql(sql).await?; - let rbs = df.collect().await?; - - let excepted = [ - -"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", -"| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | 
compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |", -"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", -"| ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | \"String\" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 |", -"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+" - ]; - assert_batches_eq!(excepted, &rbs); - - Ok(()) - } -} diff --git a/optd-datafusion-cli/src/object_storage.rs b/optd-datafusion-cli/src/object_storage.rs deleted file mode 100644 index fc4b446..0000000 --- a/optd-datafusion-cli/src/object_storage.rs +++ /dev/null @@ -1,632 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
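The `--memory-limit` flag deleted in `main.rs` above accepts human-readable sizes such as `10g` or `512mb`, with binary multipliers (each suffix step is a factor of `1 << 10`). A simplified re-sketch of that parsing, dropping the regex and negative-number handling of the original (the function name is mine, not the deleted file's):

```rust
// Simplified sketch of the deleted `extract_memory_pool_size` logic:
// split the string into a leading digit run and a unit suffix, then
// scale by a power-of-two multiplier.
fn parse_pool_size(size: &str) -> Result<usize, String> {
    let lower = size.to_lowercase();
    let digits = lower.chars().take_while(|c| c.is_ascii_digit()).count();
    let (num, suffix) = lower.split_at(digits);
    let num: usize = num
        .parse()
        .map_err(|_| format!("Invalid numeric value in memory pool size '{size}'"))?;
    // Same binary multipliers as the deleted ByteUnit enum.
    let shift = match suffix {
        "" | "b" => 0,
        "k" | "kb" => 10,
        "m" | "mb" => 20,
        "g" | "gb" => 30,
        "t" | "tb" => 40, // assumes a 64-bit usize
        _ => return Err(format!("Invalid memory pool size '{size}'")),
    };
    num.checked_mul(1usize << shift)
        .ok_or_else(|| format!("Memory pool size '{size}' is too large"))
}

fn main() {
    assert_eq!(parse_pool_size("2gb"), Ok(2 * 1024 * 1024 * 1024));
    assert!(parse_pool_size("12k12k").is_err());
}
```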
- -use std::any::Any; -use std::fmt::{Debug, Display}; -use std::sync::Arc; - -use datafusion::common::config::{ - ConfigEntry, ConfigExtension, ConfigField, ExtensionOptions, TableOptions, Visit, -}; -use datafusion::common::{config_err, exec_datafusion_err, exec_err}; -use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::context::SessionState; - -use async_trait::async_trait; -use aws_config::BehaviorVersion; -use aws_credential_types::provider::ProvideCredentials; -use object_store::aws::{AmazonS3Builder, AwsCredential}; -use object_store::gcp::GoogleCloudStorageBuilder; -use object_store::http::HttpBuilder; -use object_store::{ClientOptions, CredentialProvider, ObjectStore}; -use url::Url; - -pub async fn get_s3_object_store_builder( - url: &Url, - aws_options: &AwsOptions, -) -> Result { - let AwsOptions { - access_key_id, - secret_access_key, - session_token, - region, - endpoint, - allow_http, - } = aws_options; - - let bucket_name = get_bucket_name(url)?; - let mut builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - - if let (Some(access_key_id), Some(secret_access_key)) = (access_key_id, secret_access_key) { - builder = builder - .with_access_key_id(access_key_id) - .with_secret_access_key(secret_access_key); - - if let Some(session_token) = session_token { - builder = builder.with_token(session_token); - } - } else { - let config = aws_config::defaults(BehaviorVersion::latest()).load().await; - if let Some(region) = config.region() { - builder = builder.with_region(region.to_string()); - } - - let credentials = config - .credentials_provider() - .ok_or_else(|| { - DataFusionError::ObjectStore(object_store::Error::Generic { - store: "S3", - source: "Failed to get S3 credentials from the environment".into(), - }) - })? - .clone(); - - let credentials = Arc::new(S3CredentialProvider { credentials }); - builder = builder.with_credentials(credentials); - } - - if let Some(region) = region { - builder = builder.with_region(region); - } - - if let Some(endpoint) = endpoint { - // Make a nicer error if the user hasn't allowed http and the endpoint - // is http as the default message is "URL scheme is not allowed" - if let Ok(endpoint_url) = Url::try_from(endpoint.as_str()) { - if !matches!(allow_http, Some(true)) && endpoint_url.scheme() == "http" { - return config_err!( - "Invalid endpoint: {endpoint}. \ - HTTP is not allowed for S3 endpoints. 
\ - To allow HTTP, set 'aws.allow_http' to true" - ); - } - } - - builder = builder.with_endpoint(endpoint); - } - - if let Some(allow_http) = allow_http { - builder = builder.with_allow_http(*allow_http); - } - - Ok(builder) -} - -#[derive(Debug)] -struct S3CredentialProvider { - credentials: aws_credential_types::provider::SharedCredentialsProvider, -} - -#[async_trait] -impl CredentialProvider for S3CredentialProvider { - type Credential = AwsCredential; - - async fn get_credential(&self) -> object_store::Result> { - let creds = self.credentials.provide_credentials().await.map_err(|e| { - object_store::Error::Generic { - store: "S3", - source: Box::new(e), - } - })?; - Ok(Arc::new(AwsCredential { - key_id: creds.access_key_id().to_string(), - secret_key: creds.secret_access_key().to_string(), - token: creds.session_token().map(ToString::to_string), - })) - } -} - -pub fn get_oss_object_store_builder( - url: &Url, - aws_options: &AwsOptions, -) -> Result { - get_object_store_builder(url, aws_options, true) -} - -pub fn get_cos_object_store_builder( - url: &Url, - aws_options: &AwsOptions, -) -> Result { - get_object_store_builder(url, aws_options, false) -} - -fn get_object_store_builder( - url: &Url, - aws_options: &AwsOptions, - virtual_hosted_style_request: bool, -) -> Result { - let bucket_name = get_bucket_name(url)?; - let mut builder = AmazonS3Builder::from_env() - .with_virtual_hosted_style_request(virtual_hosted_style_request) - .with_bucket_name(bucket_name) - // oss/cos don't care about the "region" field - .with_region("do_not_care"); - - if let (Some(access_key_id), Some(secret_access_key)) = - (&aws_options.access_key_id, &aws_options.secret_access_key) - { - builder = builder - .with_access_key_id(access_key_id) - .with_secret_access_key(secret_access_key); - } - - if let Some(endpoint) = &aws_options.endpoint { - builder = builder.with_endpoint(endpoint); - } - - Ok(builder) -} - -pub fn get_gcs_object_store_builder( - url: &Url, - gs_options: &GcpOptions, -) -> Result { - let bucket_name = get_bucket_name(url)?; - let mut builder = GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name); - - if let Some(service_account_path) = &gs_options.service_account_path { - builder = builder.with_service_account_path(service_account_path); - } - - if let Some(service_account_key) = &gs_options.service_account_key { - builder = builder.with_service_account_key(service_account_key); - } - - if let Some(application_credentials_path) = &gs_options.application_credentials_path { - builder = builder.with_application_credentials(application_credentials_path); - } - - Ok(builder) -} - -fn get_bucket_name(url: &Url) -> Result<&str> { - url.host_str().ok_or_else(|| { - DataFusionError::Execution(format!( - "Not able to parse bucket name from url: {}", - url.as_str() - )) - }) -} - -/// This struct encapsulates AWS options one uses when setting up object storage. 
-#[derive(Default, Debug, Clone)]
-pub struct AwsOptions {
-    /// Access Key ID
-    pub access_key_id: Option<String>,
-    /// Secret Access Key
-    pub secret_access_key: Option<String>,
-    /// Session token
-    pub session_token: Option<String>,
-    /// AWS Region
-    pub region: Option<String>,
-    /// OSS or COS Endpoint
-    pub endpoint: Option<String>,
-    /// Allow HTTP (otherwise will always use https)
-    pub allow_http: Option<bool>,
-}
-
-impl ExtensionOptions for AwsOptions {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-
-    fn as_any_mut(&mut self) -> &mut dyn Any {
-        self
-    }
-
-    fn cloned(&self) -> Box<dyn ExtensionOptions> {
-        Box::new(self.clone())
-    }
-
-    fn set(&mut self, key: &str, value: &str) -> Result<()> {
-        let (_key, aws_key) = key.split_once('.').unwrap_or((key, ""));
-        let (key, rem) = aws_key.split_once('.').unwrap_or((aws_key, ""));
-        match key {
-            "access_key_id" => {
-                self.access_key_id.set(rem, value)?;
-            }
-            "secret_access_key" => {
-                self.secret_access_key.set(rem, value)?;
-            }
-            "session_token" => {
-                self.session_token.set(rem, value)?;
-            }
-            "region" => {
-                self.region.set(rem, value)?;
-            }
-            "oss" | "cos" | "endpoint" => {
-                self.endpoint.set(rem, value)?;
-            }
-            "allow_http" => {
-                self.allow_http.set(rem, value)?;
-            }
-            _ => {
-                return config_err!("Config value \"{}\" not found on AwsOptions", rem);
-            }
-        }
-        Ok(())
-    }
-
-    fn entries(&self) -> Vec<ConfigEntry> {
-        struct Visitor(Vec<ConfigEntry>);
-
-        impl Visit for Visitor {
-            fn some<V: Display>(&mut self, key: &str, value: V, description: &'static str) {
-                self.0.push(ConfigEntry {
-                    key: key.to_string(),
-                    value: Some(value.to_string()),
-                    description,
-                })
-            }
-
-            fn none(&mut self, key: &str, description: &'static str) {
-                self.0.push(ConfigEntry {
-                    key: key.to_string(),
-                    value: None,
-                    description,
-                })
-            }
-        }
-
-        let mut v = Visitor(vec![]);
-        self.access_key_id.visit(&mut v, "access_key_id", "");
-        self.secret_access_key
-            .visit(&mut v, "secret_access_key", "");
-        self.session_token.visit(&mut v, "session_token", "");
-        self.region.visit(&mut v, "region", "");
-        self.endpoint.visit(&mut v, "endpoint", "");
-        self.allow_http.visit(&mut v, "allow_http", "");
-        v.0
-    }
-}
-
-impl ConfigExtension for AwsOptions {
-    const PREFIX: &'static str = "aws";
-}
-
-/// This struct encapsulates GCP options one uses when setting up object storage.
-#[derive(Debug, Clone, Default)] -pub struct GcpOptions { - /// Service account path - pub service_account_path: Option, - /// Service account key - pub service_account_key: Option, - /// Application credentials path - pub application_credentials_path: Option, -} - -impl ExtensionOptions for GcpOptions { - fn as_any(&self) -> &dyn Any { - self - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn cloned(&self) -> Box { - Box::new(self.clone()) - } - - fn set(&mut self, key: &str, value: &str) -> Result<()> { - let (_key, rem) = key.split_once('.').unwrap_or((key, "")); - match rem { - "service_account_path" => { - self.service_account_path.set(rem, value)?; - } - "service_account_key" => { - self.service_account_key.set(rem, value)?; - } - "application_credentials_path" => { - self.application_credentials_path.set(rem, value)?; - } - _ => { - return config_err!("Config value \"{}\" not found on GcpOptions", rem); - } - } - Ok(()) - } - - fn entries(&self) -> Vec { - struct Visitor(Vec); - - impl Visit for Visitor { - fn some(&mut self, key: &str, value: V, description: &'static str) { - self.0.push(ConfigEntry { - key: key.to_string(), - value: Some(value.to_string()), - description, - }) - } - - fn none(&mut self, key: &str, description: &'static str) { - self.0.push(ConfigEntry { - key: key.to_string(), - value: None, - description, - }) - } - } - - let mut v = Visitor(vec![]); - self.service_account_path - .visit(&mut v, "service_account_path", ""); - self.service_account_key - .visit(&mut v, "service_account_key", ""); - self.application_credentials_path - .visit(&mut v, "application_credentials_path", ""); - v.0 - } -} - -impl ConfigExtension for GcpOptions { - const PREFIX: &'static str = "gcp"; -} - -pub(crate) async fn get_object_store( - state: &SessionState, - scheme: &str, - url: &Url, - table_options: &TableOptions, -) -> Result, DataFusionError> { - let store: Arc = match scheme { - "s3" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 's3' scheme"); - }; - let builder = get_s3_object_store_builder(url, options).await?; - Arc::new(builder.build()?) - } - "oss" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 'oss' scheme"); - }; - let builder = get_oss_object_store_builder(url, options)?; - Arc::new(builder.build()?) - } - "cos" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 'cos' scheme"); - }; - let builder = get_cos_object_store_builder(url, options)?; - Arc::new(builder.build()?) - } - "gs" | "gcs" => { - let Some(options) = table_options.extensions.get::() else { - return exec_err!("Given table options incompatible with the 'gs'/'gcs' scheme"); - }; - let builder = get_gcs_object_store_builder(url, options)?; - Arc::new(builder.build()?) - } - "http" | "https" => Arc::new( - HttpBuilder::new() - .with_client_options(ClientOptions::new().with_allow_http(true)) - .with_url(url.origin().ascii_serialization()) - .build()?, - ), - _ => { - // For other types, try to get from `object_store_registry`: - state - .runtime_env() - .object_store_registry - .get_store(url) - .map_err(|_| exec_datafusion_err!("Unsupported object store scheme: {}", scheme))? 
- } - }; - Ok(store) -} - -#[cfg(test)] -mod tests { - use crate::cli_context::CliSessionContext; - - use super::*; - - use datafusion::common::plan_err; - use datafusion::{ - datasource::listing::ListingTableUrl, - logical_expr::{DdlStatement, LogicalPlan}, - prelude::SessionContext, - }; - - use object_store::{aws::AmazonS3ConfigKey, gcp::GoogleConfigKey}; - - #[tokio::test] - async fn s3_object_store_builder() -> Result<()> { - // "fake" is uppercase to ensure the values are not lowercased when parsed - let access_key_id = "FAKE_access_key_id"; - let secret_access_key = "FAKE_secret_access_key"; - let region = "fake_us-east-2"; - let endpoint = "endpoint33"; - let session_token = "FAKE_session_token"; - let location = "s3://bucket/path/FAKE/file.parquet"; - - let table_url = ListingTableUrl::parse(location)?; - let scheme = table_url.scheme(); - let sql = format!( - "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS\ - ('aws.access_key_id' '{access_key_id}', \ - 'aws.secret_access_key' '{secret_access_key}', \ - 'aws.region' '{region}', \ - 'aws.session_token' {session_token}, \ - 'aws.endpoint' '{endpoint}'\ - ) LOCATION '{location}'" - ); - - let ctx = SessionContext::new(); - let mut plan = ctx.state().create_logical_plan(&sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - ctx.register_table_options_extension_from_scheme(scheme); - let mut table_options = ctx.state().default_table_options(); - table_options.alter_with_string_hash_map(&cmd.options)?; - let aws_options = table_options.extensions.get::().unwrap(); - let builder = get_s3_object_store_builder(table_url.as_ref(), aws_options).await?; - // get the actual configuration information, then assert_eq! - let config = [ - (AmazonS3ConfigKey::AccessKeyId, access_key_id), - (AmazonS3ConfigKey::SecretAccessKey, secret_access_key), - (AmazonS3ConfigKey::Region, region), - (AmazonS3ConfigKey::Endpoint, endpoint), - (AmazonS3ConfigKey::Token, session_token), - ]; - for (key, value) in config { - assert_eq!(value, builder.get_config_value(&key).unwrap()); - } - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - Ok(()) - } - - #[tokio::test] - async fn s3_object_store_builder_allow_http_error() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let endpoint = "http://endpoint33"; - let location = "s3://bucket/path/file.parquet"; - - let table_url = ListingTableUrl::parse(location)?; - let scheme = table_url.scheme(); - let sql = format!( - "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS\ - ('aws.access_key_id' '{access_key_id}', \ - 'aws.secret_access_key' '{secret_access_key}', \ - 'aws.endpoint' '{endpoint}'\ - ) LOCATION '{location}'" - ); - - let ctx = SessionContext::new(); - let mut plan = ctx.state().create_logical_plan(&sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - ctx.register_table_options_extension_from_scheme(scheme); - let mut table_options = ctx.state().default_table_options(); - table_options.alter_with_string_hash_map(&cmd.options)?; - let aws_options = table_options.extensions.get::().unwrap(); - let err = get_s3_object_store_builder(table_url.as_ref(), aws_options) - .await - .unwrap_err(); - - assert_eq!(err.to_string(), "Invalid or Unsupported Configuration: Invalid endpoint: http://endpoint33. HTTP is not allowed for S3 endpoints. 
To allow HTTP, set 'aws.allow_http' to true"); - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - // Now add `allow_http` to the options and check if it works - let sql = format!( - "CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS\ - ('aws.access_key_id' '{access_key_id}', \ - 'aws.secret_access_key' '{secret_access_key}', \ - 'aws.endpoint' '{endpoint}',\ - 'aws.allow_http' 'true'\ - ) LOCATION '{location}'" - ); - - let mut plan = ctx.state().create_logical_plan(&sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - ctx.register_table_options_extension_from_scheme(scheme); - let mut table_options = ctx.state().default_table_options(); - table_options.alter_with_string_hash_map(&cmd.options)?; - let aws_options = table_options.extensions.get::().unwrap(); - // ensure this isn't an error - get_s3_object_store_builder(table_url.as_ref(), aws_options).await?; - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - Ok(()) - } - - #[tokio::test] - async fn oss_object_store_builder() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; - let endpoint = "fake_endpoint"; - let location = "oss://bucket/path/file.parquet"; - - let table_url = ListingTableUrl::parse(location)?; - let scheme = table_url.scheme(); - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('aws.access_key_id' '{access_key_id}', 'aws.secret_access_key' '{secret_access_key}', 'aws.oss.endpoint' '{endpoint}') LOCATION '{location}'"); - - let ctx = SessionContext::new(); - let mut plan = ctx.state().create_logical_plan(&sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - ctx.register_table_options_extension_from_scheme(scheme); - let mut table_options = ctx.state().default_table_options(); - table_options.alter_with_string_hash_map(&cmd.options)?; - let aws_options = table_options.extensions.get::().unwrap(); - let builder = get_oss_object_store_builder(table_url.as_ref(), aws_options)?; - // get the actual configuration information, then assert_eq! 
- let config = [ - (AmazonS3ConfigKey::AccessKeyId, access_key_id), - (AmazonS3ConfigKey::SecretAccessKey, secret_access_key), - (AmazonS3ConfigKey::Endpoint, endpoint), - ]; - for (key, value) in config { - assert_eq!(value, builder.get_config_value(&key).unwrap()); - } - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - Ok(()) - } - - #[tokio::test] - async fn gcs_object_store_builder() -> Result<()> { - let service_account_path = "fake_service_account_path"; - let service_account_key = - "{\"private_key\": \"fake_private_key.pem\",\"client_email\":\"fake_client_email\"}"; - let application_credentials_path = "fake_application_credentials_path"; - let location = "gcs://bucket/path/file.parquet"; - - let table_url = ListingTableUrl::parse(location)?; - let scheme = table_url.scheme(); - let sql = format!("CREATE EXTERNAL TABLE test STORED AS PARQUET OPTIONS('gcp.service_account_path' '{service_account_path}', 'gcp.service_account_key' '{service_account_key}', 'gcp.application_credentials_path' '{application_credentials_path}') LOCATION '{location}'"); - - let ctx = SessionContext::new(); - let mut plan = ctx.state().create_logical_plan(&sql).await?; - - if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = &mut plan { - ctx.register_table_options_extension_from_scheme(scheme); - let mut table_options = ctx.state().default_table_options(); - table_options.alter_with_string_hash_map(&cmd.options)?; - let gcp_options = table_options.extensions.get::().unwrap(); - let builder = get_gcs_object_store_builder(table_url.as_ref(), gcp_options)?; - // get the actual configuration information, then assert_eq! - let config = [ - (GoogleConfigKey::ServiceAccount, service_account_path), - (GoogleConfigKey::ServiceAccountKey, service_account_key), - ( - GoogleConfigKey::ApplicationCredentials, - application_credentials_path, - ), - ]; - for (key, value) in config { - assert_eq!(value, builder.get_config_value(&key).unwrap()); - } - } else { - return plan_err!("LogicalPlan is not a CreateExternalTable"); - } - - Ok(()) - } -} diff --git a/optd-datafusion-cli/src/pool_type.rs b/optd-datafusion-cli/src/pool_type.rs deleted file mode 100644 index 269790b..0000000 --- a/optd-datafusion-cli/src/pool_type.rs +++ /dev/null @@ -1,48 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
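The `PoolType` removed just below is a plain `FromStr`/`Display` pair that clap wires in through `value_parser` and `default_value_t`. A hypothetical round-trip through those impls (the asserts are mine; `PoolType` is the type defined in the deleted file, assumed in scope):

```rust
// Assumes the PoolType enum and its FromStr/Display impls from the
// deleted pool_type.rs are in scope.
fn main() {
    let pool: PoolType = "fair".parse().expect("valid pool type");
    assert_eq!(pool, PoolType::Fair);
    assert_eq!(pool.to_string(), "fair");

    // Capitalized spellings parse, but Display always renders lowercase.
    assert_eq!("Greedy".parse::<PoolType>().unwrap().to_string(), "greedy");
}
```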
-
-use std::{
-    fmt::{self, Display, Formatter},
-    str::FromStr,
-};
-
-#[derive(PartialEq, Debug, Clone)]
-pub enum PoolType {
-    Greedy,
-    Fair,
-}
-
-impl FromStr for PoolType {
-    type Err = String;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "Greedy" | "greedy" => Ok(PoolType::Greedy),
-            "Fair" | "fair" => Ok(PoolType::Fair),
-            _ => Err(format!("Invalid memory pool type '{}'", s)),
-        }
-    }
-}
-
-impl Display for PoolType {
-    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
-        match self {
-            PoolType::Greedy => write!(f, "greedy"),
-            PoolType::Fair => write!(f, "fair"),
-        }
-    }
-}
diff --git a/optd-datafusion-cli/src/print_format.rs b/optd-datafusion-cli/src/print_format.rs
deleted file mode 100644
index bd5c678..0000000
--- a/optd-datafusion-cli/src/print_format.rs
+++ /dev/null
@@ -1,691 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Print format variants
-
-use std::str::FromStr;
-
-use crate::print_options::MaxRows;
-
-use arrow::csv::writer::WriterBuilder;
-use arrow::datatypes::SchemaRef;
-use arrow::json::{ArrayWriter, LineDelimitedWriter};
-use arrow::record_batch::RecordBatch;
-use arrow::util::pretty::pretty_format_batches_with_options;
-use datafusion::common::format::DEFAULT_CLI_FORMAT_OPTIONS;
-use datafusion::error::Result;
-
-/// Allow records to be printed in different formats
-#[derive(Debug, PartialEq, Eq, clap::ValueEnum, Clone, Copy)]
-pub enum PrintFormat {
-    Csv,
-    Tsv,
-    Table,
-    Json,
-    NdJson,
-    Automatic,
-}
-
-impl FromStr for PrintFormat {
-    type Err = String;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        clap::ValueEnum::from_str(s, true)
-    }
-}
-
-macro_rules! batches_to_json {
-    ($WRITER: ident, $writer: expr, $batches: expr) => {{
-        {
-            if !$batches.is_empty() {
-                let mut json_writer = $WRITER::new(&mut *$writer);
-                for batch in $batches {
-                    json_writer.write(batch)?;
-                }
-                json_writer.finish()?;
-                json_finish!($WRITER, $writer);
-            }
-        }
-        Ok(()) as Result<()>
-    }};
-}
-
-macro_rules! json_finish {
-    (ArrayWriter, $writer: expr) => {{
-        writeln!($writer)?;
-    }};
-    (LineDelimitedWriter, $writer: expr) => {{}};
-}
-
-fn print_batches_with_sep<W: std::io::Write>(
-    writer: &mut W,
-    batches: &[RecordBatch],
-    delimiter: u8,
-    with_header: bool,
-) -> Result<()> {
-    let builder = WriterBuilder::new()
-        .with_header(with_header)
-        .with_delimiter(delimiter);
-    let mut csv_writer = builder.build(writer);
-
-    for batch in batches {
-        csv_writer.write(batch)?;
-    }
-
-    Ok(())
-}
-
-fn keep_only_maxrows(s: &str, maxrows: usize) -> String {
-    let lines: Vec<String> = s.lines().map(String::from).collect();
-
-    assert!(lines.len() >= maxrows + 4); // 4 lines for top and bottom border
-
-    let last_line = &lines[lines.len() - 1]; // bottom border line
-
-    let spaces = last_line.len().saturating_sub(4);
-    let dotted_line = format!("| .{:<width$}|", "", width = spaces);
-
-    let mut result = lines[0..(maxrows + 3)].to_vec(); // keep the top border, header, and `maxrows` rows
-    result.extend(vec![dotted_line; 3]); // stand-in for the truncated rows
-    result.push(last_line.clone()); // keep the bottom border
-
-    result.join("\n")
-}
-
-fn format_batches_with_maxrows<W: std::io::Write>(
-    writer: &mut W,
-    batches: &[RecordBatch],
-    maxrows: MaxRows,
-) -> Result<()> {
-    match maxrows {
-        MaxRows::Limited(maxrows) => {
-            // Filter batches to meet the maxrows condition
-            let mut filtered_batches = Vec::new();
-            let mut row_count: usize = 0;
-            let mut over_limit = false;
-            for batch in batches {
-                if row_count + batch.num_rows() > maxrows {
-                    // If adding this batch exceeds maxrows, slice the batch
-                    let limit = maxrows - row_count;
-                    let sliced_batch = batch.slice(0, limit);
-                    filtered_batches.push(sliced_batch);
-                    over_limit = true;
-                    break;
-                } else {
-                    filtered_batches.push(batch.clone());
-                    row_count += batch.num_rows();
-                }
-            }
-
-            let formatted =
-                pretty_format_batches_with_options(&filtered_batches, &DEFAULT_CLI_FORMAT_OPTIONS)?;
-            if over_limit {
-                let mut formatted_str = format!("{}", formatted);
-                formatted_str = keep_only_maxrows(&formatted_str, maxrows);
-                writeln!(writer, "{}", formatted_str)?;
-            } else {
-                writeln!(writer, "{}", formatted)?;
-            }
-        }
-        MaxRows::Unlimited => {
-            let formatted =
-                pretty_format_batches_with_options(batches, &DEFAULT_CLI_FORMAT_OPTIONS)?;
-            writeln!(writer, "{}", formatted)?;
-        }
-    }
-
-    Ok(())
-}
-
-impl PrintFormat {
-    /// Print the batches to a writer using the specified format
-    pub fn print_batches<W: std::io::Write>(
-        &self,
-        writer: &mut W,
-        schema: SchemaRef,
-        batches: &[RecordBatch],
-        maxrows: MaxRows,
-        with_header: bool,
-    ) -> Result<()> {
-        // filter out any empty batches
-        let batches: Vec<_> = batches
-            .iter()
-            .filter(|b| b.num_rows() > 0)
-            .cloned()
-            .collect();
-        if batches.is_empty() {
-            return self.print_empty(writer, schema);
-        }
-
-        match self {
-            Self::Csv | Self::Automatic => {
-                print_batches_with_sep(writer, &batches, b',', with_header)
-            }
-            Self::Tsv => print_batches_with_sep(writer, &batches, b'\t', with_header),
-            Self::Table => {
-                if maxrows == MaxRows::Limited(0) {
-                    return Ok(());
-                }
-                format_batches_with_maxrows(writer, &batches, maxrows)
-            }
-            Self::Json => batches_to_json!(ArrayWriter, writer, &batches),
-            Self::NdJson => batches_to_json!(LineDelimitedWriter, writer, &batches),
-        }
-    }
-
-    /// Print when the result batches contain no rows
-    fn print_empty<W: std::io::Write>(&self, writer: &mut W, schema: SchemaRef) -> Result<()> {
-        match self {
-            // Print column headers for Table format
-            Self::Table if !schema.fields().is_empty() => {
-                let empty_batch = RecordBatch::new_empty(schema);
-                let formatted = pretty_format_batches_with_options(
-                    &[empty_batch],
-                    &DEFAULT_CLI_FORMAT_OPTIONS,
-                )?;
-                writeln!(writer, "{}", formatted)?;
-            }
-            _ => {}
-        }
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::sync::Arc;
-
-    use arrow::array::Int32Array;
-    use arrow::datatypes::{DataType, Field,
Schema}; - - #[test] - fn print_empty() { - for format in [ - PrintFormat::Csv, - PrintFormat::Tsv, - PrintFormat::Json, - PrintFormat::NdJson, - PrintFormat::Automatic, - ] { - // no output for empty batches, even with header set - PrintBatchesTest::new() - .with_format(format) - .with_schema(three_column_schema()) - .with_batches(vec![]) - .with_expected(&[""]) - .run(); - } - - // output column headers for empty batches when format is Table - #[rustfmt::skip] - let expected = &[ - "+---+---+---+", - "| a | b | c |", - "+---+---+---+", - "+---+---+---+", - ]; - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_schema(three_column_schema()) - .with_batches(vec![]) - .with_expected(expected) - .run(); - } - - #[test] - fn print_csv_no_header() { - #[rustfmt::skip] - let expected = &[ - "1,4,7", - "2,5,8", - "3,6,9", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Csv) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::No) - .with_expected(expected) - .run(); - } - - #[test] - fn print_csv_with_header() { - #[rustfmt::skip] - let expected = &[ - "a,b,c", - "1,4,7", - "2,5,8", - "3,6,9", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Csv) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::Yes) - .with_expected(expected) - .run(); - } - - #[test] - fn print_tsv_no_header() { - #[rustfmt::skip] - let expected = &[ - "1\t4\t7", - "2\t5\t8", - "3\t6\t9", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Tsv) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::No) - .with_expected(expected) - .run(); - } - - #[test] - fn print_tsv_with_header() { - #[rustfmt::skip] - let expected = &[ - "a\tb\tc", - "1\t4\t7", - "2\t5\t8", - "3\t6\t9", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Tsv) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::Yes) - .with_expected(expected) - .run(); - } - - #[test] - fn print_table() { - let expected = &[ - "+---+---+---+", - "| a | b | c |", - "+---+---+---+", - "| 1 | 4 | 7 |", - "| 2 | 5 | 8 |", - "| 3 | 6 | 9 |", - "+---+---+---+", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::Ignored) - .with_expected(expected) - .run(); - } - #[test] - fn print_json() { - let expected = &[r#"[{"a":1,"b":4,"c":7},{"a":2,"b":5,"c":8},{"a":3,"b":6,"c":9}]"#]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Json) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::Ignored) - .with_expected(expected) - .run(); - } - - #[test] - fn print_ndjson() { - let expected = &[ - r#"{"a":1,"b":4,"c":7}"#, - r#"{"a":2,"b":5,"c":8}"#, - r#"{"a":3,"b":6,"c":9}"#, - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::NdJson) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::Ignored) - .with_expected(expected) - .run(); - } - - #[test] - fn print_automatic_no_header() { - #[rustfmt::skip] - let expected = &[ - "1,4,7", - "2,5,8", - "3,6,9", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Automatic) - .with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::No) - .with_expected(expected) - .run(); - } - #[test] - fn print_automatic_with_header() { - #[rustfmt::skip] - let expected = &[ - "a,b,c", - "1,4,7", - "2,5,8", - "3,6,9", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Automatic) - 
.with_batches(split_batch(three_column_batch())) - .with_header(WithHeader::Yes) - .with_expected(expected) - .run(); - } - - #[test] - fn print_maxrows_unlimited() { - #[rustfmt::skip] - let expected = &[ - "+---+", - "| a |", - "+---+", - "| 1 |", - "| 2 |", - "| 3 |", - "+---+", - ]; - - // should print out entire output with no truncation if unlimited or - // limit greater than number of batches or equal to the number of batches - for max_rows in [MaxRows::Unlimited, MaxRows::Limited(5), MaxRows::Limited(3)] { - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_schema(one_column_schema()) - .with_batches(vec![one_column_batch()]) - .with_maxrows(max_rows) - .with_expected(expected) - .run(); - } - } - - #[test] - fn print_maxrows_limited_one_batch() { - #[rustfmt::skip] - let expected = &[ - "+---+", - "| a |", - "+---+", - "| 1 |", - "| . |", - "| . |", - "| . |", - "+---+", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_batches(vec![one_column_batch()]) - .with_maxrows(MaxRows::Limited(1)) - .with_expected(expected) - .run(); - } - - #[test] - fn print_maxrows_limited_multi_batched() { - #[rustfmt::skip] - let expected = &[ - "+---+", - "| a |", - "+---+", - "| 1 |", - "| 2 |", - "| 3 |", - "| 1 |", - "| 2 |", - "| . |", - "| . |", - "| . |", - "+---+", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_batches(vec![ - one_column_batch(), - one_column_batch(), - one_column_batch(), - ]) - .with_maxrows(MaxRows::Limited(5)) - .with_expected(expected) - .run(); - } - - #[test] - fn test_print_batches_empty_batches() { - let batch = one_column_batch(); - let empty_batch = RecordBatch::new_empty(batch.schema()); - - #[rustfmt::skip] - let expected =&[ - "+---+", - "| a |", - "+---+", - "| 1 |", - "| 2 |", - "| 3 |", - "+---+", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_batches(vec![empty_batch.clone(), batch, empty_batch]) - .with_expected(expected) - .run(); - } - - #[test] - fn test_print_batches_empty_batch() { - let empty_batch = RecordBatch::new_empty(one_column_batch().schema()); - - // Print column headers for empty batch when format is Table - #[rustfmt::skip] - let expected =&[ - "+---+", - "| a |", - "+---+", - "+---+", - ]; - - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_schema(one_column_schema()) - .with_batches(vec![empty_batch]) - .with_header(WithHeader::Yes) - .with_expected(expected) - .run(); - - // No output for empty batch when schema contains no columns - let empty_batch = RecordBatch::new_empty(Arc::new(Schema::empty())); - let expected = &[""]; - PrintBatchesTest::new() - .with_format(PrintFormat::Table) - .with_schema(Arc::new(Schema::empty())) - .with_batches(vec![empty_batch]) - .with_header(WithHeader::Yes) - .with_expected(expected) - .run(); - } - - #[derive(Debug)] - struct PrintBatchesTest { - format: PrintFormat, - schema: SchemaRef, - batches: Vec, - maxrows: MaxRows, - with_header: WithHeader, - expected: Vec<&'static str>, - } - - /// How to test with_header - #[derive(Debug, Clone)] - enum WithHeader { - Yes, - No, - /// output should be the same with or without header - Ignored, - } - - impl PrintBatchesTest { - fn new() -> Self { - Self { - format: PrintFormat::Table, - schema: Arc::new(Schema::empty()), - batches: vec![], - maxrows: MaxRows::Unlimited, - with_header: WithHeader::Ignored, - expected: vec![], - } - } - - /// set the format - fn with_format(mut self, format: PrintFormat) -> Self { - self.format = format; - 
-            self
-        }
-
-        // set the schema
-        fn with_schema(mut self, schema: SchemaRef) -> Self {
-            self.schema = schema;
-            self
-        }
-
-        /// set the batches to convert
-        fn with_batches(mut self, batches: Vec<RecordBatch>) -> Self {
-            self.batches = batches;
-            self
-        }
-
-        /// set maxrows
-        fn with_maxrows(mut self, maxrows: MaxRows) -> Self {
-            self.maxrows = maxrows;
-            self
-        }
-
-        /// set with_header
-        fn with_header(mut self, with_header: WithHeader) -> Self {
-            self.with_header = with_header;
-            self
-        }
-
-        /// set expected output
-        fn with_expected(mut self, expected: &[&'static str]) -> Self {
-            self.expected = expected.to_vec();
-            self
-        }
-
-        /// run the test
-        fn run(self) {
-            let actual = self.output();
-            let actual: Vec<_> = actual.trim_end().split('\n').collect();
-            let expected = self.expected;
-            assert_eq!(
-                actual, expected,
-                "\n\nactual:\n{actual:#?}\n\nexpected:\n{expected:#?}"
-            );
-        }
-
-        /// formats batches using parameters and returns the resulting output
-        fn output(&self) -> String {
-            match self.with_header {
-                WithHeader::Yes => self.output_with_header(true),
-                WithHeader::No => self.output_with_header(false),
-                WithHeader::Ignored => {
-                    let output = self.output_with_header(true);
-                    // ensure the output is the same without header
-                    let output_without_header = self.output_with_header(false);
-                    assert_eq!(
-                        output, output_without_header,
-                        "Expected output to be the same with or without header"
-                    );
-                    output
-                }
-            }
-        }
-
-        fn output_with_header(&self, with_header: bool) -> String {
-            let mut buffer: Vec<u8> = vec![];
-            self.format
-                .print_batches(
-                    &mut buffer,
-                    self.schema.clone(),
-                    &self.batches,
-                    self.maxrows,
-                    with_header,
-                )
-                .unwrap();
-            String::from_utf8(buffer).unwrap()
-        }
-    }
-
-    /// Return a schema with three columns
-    fn three_column_schema() -> SchemaRef {
-        Arc::new(Schema::new(vec![
-            Field::new("a", DataType::Int32, false),
-            Field::new("b", DataType::Int32, false),
-            Field::new("c", DataType::Int32, false),
-        ]))
-    }
-
-    /// Return a batch with three columns and three rows
-    fn three_column_batch() -> RecordBatch {
-        RecordBatch::try_new(
-            three_column_schema(),
-            vec![
-                Arc::new(Int32Array::from(vec![1, 2, 3])),
-                Arc::new(Int32Array::from(vec![4, 5, 6])),
-                Arc::new(Int32Array::from(vec![7, 8, 9])),
-            ],
-        )
-        .unwrap()
-    }
-
-    /// Return a schema with one column
-    fn one_column_schema() -> SchemaRef {
-        Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]))
-    }
-
-    /// return a batch with one column and three rows
-    fn one_column_batch() -> RecordBatch {
-        RecordBatch::try_new(
-            one_column_schema(),
-            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
-        )
-        .unwrap()
-    }
-
-    /// Slice the record batch into 2 batches
-    fn split_batch(batch: RecordBatch) -> Vec<RecordBatch> {
-        assert!(batch.num_rows() > 1);
-        let split = batch.num_rows() / 2;
-        vec![
-            batch.slice(0, split),
-            batch.slice(split, batch.num_rows() - split),
-        ]
-    }
-}
diff --git a/optd-datafusion-cli/src/print_options.rs b/optd-datafusion-cli/src/print_options.rs
deleted file mode 100644
index 9218d2b..0000000
--- a/optd-datafusion-cli/src/print_options.rs
+++ /dev/null
@@ -1,170 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::fmt::{Display, Formatter};
-use std::io::Write;
-use std::pin::Pin;
-use std::str::FromStr;
-
-use crate::print_format::PrintFormat;
-
-use arrow::datatypes::SchemaRef;
-use arrow::record_batch::RecordBatch;
-use datafusion::common::instant::Instant;
-use datafusion::common::DataFusionError;
-use datafusion::error::Result;
-use datafusion::physical_plan::RecordBatchStream;
-
-use futures::StreamExt;
-
-#[derive(Debug, Clone, PartialEq, Copy)]
-pub enum MaxRows {
-    /// show all rows in the output
-    Unlimited,
-    /// Only show n rows
-    Limited(usize),
-}
-
-impl FromStr for MaxRows {
-    type Err = String;
-
-    fn from_str(maxrows: &str) -> Result<Self, Self::Err> {
-        if maxrows.to_lowercase() == "inf"
-            || maxrows.to_lowercase() == "infinite"
-            || maxrows.to_lowercase() == "none"
-        {
-            Ok(Self::Unlimited)
-        } else {
-            match maxrows.parse::<usize>() {
-                Ok(nrows) => Ok(Self::Limited(nrows)),
-                _ => Err(format!("Invalid maxrows {}. Valid inputs are natural numbers or \'none\', \'inf\', or \'infinite\' for no limit.", maxrows)),
-            }
-        }
-    }
-}
-
-impl Display for MaxRows {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::Unlimited => write!(f, "unlimited"),
-            Self::Limited(max_rows) => write!(f, "at most {max_rows}"),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct PrintOptions {
-    pub format: PrintFormat,
-    pub quiet: bool,
-    pub maxrows: MaxRows,
-    pub color: bool,
-}
-
-// Returns the query execution details formatted
-fn get_execution_details_formatted(
-    row_count: usize,
-    maxrows: MaxRows,
-    query_start_time: Instant,
-) -> String {
-    let nrows_shown_msg = match maxrows {
-        MaxRows::Limited(nrows) if nrows < row_count => {
-            format!("(First {nrows} displayed. Use --maxrows to adjust)")
-        }
-        _ => String::new(),
-    };
-
-    format!(
-        "{} row(s) fetched. {}\nElapsed {:.3} seconds.\n",
-        row_count,
-        nrows_shown_msg,
-        query_start_time.elapsed().as_secs_f64()
-    )
-}
-
-impl PrintOptions {
-    /// Print the batches to stdout using the specified format
-    pub fn print_batches(
-        &self,
-        schema: SchemaRef,
-        batches: &[RecordBatch],
-        query_start_time: Instant,
-    ) -> Result<()> {
-        let stdout = std::io::stdout();
-        let mut writer = stdout.lock();
-
-        self.format
-            .print_batches(&mut writer, schema, batches, self.maxrows, true)?;
-
-        let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
-        let formatted_exec_details = get_execution_details_formatted(
-            row_count,
-            if self.format == PrintFormat::Table {
-                self.maxrows
-            } else {
-                MaxRows::Unlimited
-            },
-            query_start_time,
-        );
-
-        if !self.quiet {
-            writeln!(writer, "{formatted_exec_details}")?;
-        }
-
-        Ok(())
-    }
-
-    /// Print the stream to stdout using the specified format
-    pub async fn print_stream(
-        &self,
-        mut stream: Pin<Box<dyn RecordBatchStream>>,
-        query_start_time: Instant,
-    ) -> Result<()> {
-        if self.format == PrintFormat::Table {
-            return Err(DataFusionError::External(
-                "PrintFormat::Table is not implemented".to_string().into(),
-            ));
-        };
-
-        let stdout = std::io::stdout();
-        let mut writer = stdout.lock();
-
-        let mut row_count = 0_usize;
-        let mut with_header = true;
-
-        while let Some(maybe_batch) = stream.next().await {
-            let batch = maybe_batch?;
-            row_count += batch.num_rows();
-            self.format.print_batches(
-                &mut writer,
-                batch.schema(),
-                &[batch],
-                MaxRows::Unlimited,
-                with_header,
-            )?;
-            with_header = false;
-        }
-
-        let formatted_exec_details =
-            get_execution_details_formatted(row_count, MaxRows::Unlimited, query_start_time);
-
-        if !self.quiet {
-            writeln!(writer, "{formatted_exec_details}")?;
-        }
-
-        Ok(())
-    }
-}
diff --git a/optd-datafusion-cli/tests/cli_integration.rs b/optd-datafusion-cli/tests/cli_integration.rs
deleted file mode 100644
index 43f3949..0000000
--- a/optd-datafusion-cli/tests/cli_integration.rs
+++ /dev/null
@@ -1,57 +0,0 @@
-#![allow(unused)]
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::process::Command;
-
-use assert_cmd::prelude::{CommandCargoExt, OutputAssertExt};
-use predicates::prelude::predicate;
-use rstest::rstest;
-
-#[cfg(test)]
-#[ctor::ctor]
-fn init() {
-    // Enable RUST_LOG logging configuration for tests
-    let _ = env_logger::try_init();
-}
-
-// Disabled due to https://github.com/apache/datafusion/issues/10793
-// #[cfg(not(target_family = "windows"))]
-// #[rstest]
-// #[case::exec_from_commands(
-//     ["--command", "select 1", "--format", "json", "-q"],
-//     "[{\"Int64(1)\":1}]\n"
-// )]
-// #[case::exec_multiple_statements(
-//     ["--command", "select 1; select 2;", "--format", "json", "-q"],
-//     "[{\"Int64(1)\":1}]\n[{\"Int64(2)\":2}]\n"
-// )]
-// #[case::exec_from_files(
-//     ["--file", "tests/data/sql.txt", "--format", "json", "-q"],
-//     "[{\"Int64(1)\":1}]\n"
-// )]
-// #[case::set_batch_size(
-//     ["--command", "show datafusion.execution.batch_size", "--format", "json", "-q", "-b", "1"],
-//     "[{\"name\":\"datafusion.execution.batch_size\",\"value\":\"1\"}]\n"
-// )]
-// #[test]
-// fn cli_quick_test<'a>(#[case] args: impl IntoIterator<Item = &'a str>, #[case] expected: &str) {
-//     let mut cmd = Command::cargo_bin("optd-datafusion-cli").unwrap();
-//     cmd.args(args);
-//     cmd.assert().stdout(predicate::eq(expected));
-// }
diff --git a/optd-datafusion-cli/tests/data/sql.txt b/optd-datafusion-cli/tests/data/sql.txt
deleted file mode 100644
index 9e13a3e..0000000
--- a/optd-datafusion-cli/tests/data/sql.txt
+++ /dev/null
@@ -1 +0,0 @@
-select 1;
\ No newline at end of file

From e4087ae281c1039f74d45cd04fa3209f42d9a95e Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Sun, 2 Mar 2025 16:16:02 -0500
Subject: [PATCH 5/7] clean up conversion layer

---
 optd-datafusion/src/converter/from_optd.rs |  59 +++---
 optd-datafusion/src/converter/into_optd.rs | 221 +++++++++++----------
 optd-datafusion/src/converter/mod.rs       |  22 +-
 optd-datafusion/src/mock.rs                |   8 +-
 4 files changed, 157 insertions(+), 153 deletions(-)

diff --git a/optd-datafusion/src/converter/from_optd.rs b/optd-datafusion/src/converter/from_optd.rs
index ea64353..dc7cce8 100644
--- a/optd-datafusion/src/converter/from_optd.rs
+++ b/optd-datafusion/src/converter/from_optd.rs
@@ -1,5 +1,3 @@
-use std::{collections::HashMap, str::FromStr, sync::Arc};
-
 use anyhow::bail;
 use async_recursion::async_recursion;
 use datafusion::{
@@ -20,12 +18,13 @@ use optd_core::{
     operators::{relational::physical::PhysicalOperator, scalar::ScalarOperator},
     plans::{physical::PhysicalPlan, scalar::ScalarPlan},
 };
+use std::{collections::HashMap, str::FromStr, sync::Arc};
 
-use super::OptdDFContext;
+use super::OptdDataFusionContext;
 
-impl OptdDFContext<'_> {
+impl OptdDataFusionContext<'_> {
     #[async_recursion]
-    pub(crate) async fn conv_optd_to_df_relational(
+    pub(crate) async fn optd_to_df_relational(
         &self,
         optimized_plan: &PhysicalPlan,
     ) -> anyhow::Result<Arc<dyn ExecutionPlan>> {
@@ -33,7 +32,12 @@ impl OptdDFContext<'_> {
             PhysicalOperator::TableScan(table_scan) => {
                 let source = self
                     .tables
-                    .get(table_scan.table_name.as_str().unwrap())
+                    .get(
+                        table_scan
+                            .table_name
+                            .as_str()
+                            .expect("Table name is not valid"),
+                    )
                     .ok_or_else(|| anyhow::anyhow!("Table not found"))?;
                 let provider = source_as_provider(source)?;
@@ -42,12 +46,14 @@ impl OptdDFContext<'_> {
                 let plan = provider
                     .scan(self.session_state, None, &filters, None)
                     .await?;
+
                 Ok(plan)
             }
             PhysicalOperator::Filter(filter) => {
-                let input_exec = self.conv_optd_to_df_relational(&filter.child).await?;
+                let input_exec = self.optd_to_df_relational(&filter.child).await?;
                 let physical_expr =
-                    Self::conv_optd_to_df_scalar(&filter.predicate, &input_exec.schema())?;
+                    Self::optd_to_df_scalar(&filter.predicate, &input_exec.schema())?;
+
                 Ok(
                     Arc::new(datafusion::physical_plan::filter::FilterExec::try_new(
                         physical_expr,
@@ -56,14 +62,12 @@ impl OptdDFContext<'_> {
                 )
             }
             PhysicalOperator::Project(project) => {
-                let input_exec = self.conv_optd_to_df_relational(&project.child).await?;
+                let input_exec = self.optd_to_df_relational(&project.child).await?;
                 let physical_exprs = project
                     .fields
                     .iter()
                     .cloned()
-                    .filter_map(|field| {
-                        Self::conv_optd_to_df_scalar(&field, &input_exec.schema()).ok()
-                    })
+                    .filter_map(|field| Self::optd_to_df_scalar(&field, &input_exec.schema()).ok())
                     .enumerate()
                     .map(|(idx, expr)| (expr, format!("col{}", idx)))
                     .collect::<Vec<(Arc<dyn PhysicalExpr>, String)>>();
@@ -74,8 +78,8 @@ impl OptdDFContext<'_> {
                 )
             }
             PhysicalOperator::NestedLoopJoin(join) => {
-                let left_exec = self.conv_optd_to_df_relational(&join.outer).await?;
-                let right_exec = self.conv_optd_to_df_relational(&join.inner).await?;
+                let left_exec = self.optd_to_df_relational(&join.outer).await?;
+                let right_exec = self.optd_to_df_relational(&join.inner).await?;
                 let filter_schema = {
                     let fields = left_exec
                         .schema()
@@ -87,12 +91,11 @@ impl OptdDFContext<'_> {
                     Schema::new_with_metadata(fields, HashMap::new())
                 };
 
-                let physical_expr = Self::conv_optd_to_df_scalar(
-                    &join.condition,
-                    &Arc::new(filter_schema.clone()),
-                )?;
+                let physical_expr =
+                    Self::optd_to_df_scalar(&join.condition, &Arc::new(filter_schema.clone()))?;
 
-                let join_type = JoinType::from_str(join.join_type.as_str().unwrap())?;
+                let join_type =
+                    JoinType::from_str(join.join_type.as_str().expect("Invalid join type"))?;
 
                 let mut column_idxs = vec![];
                 for i in 0..left_exec.schema().fields().len() {
@@ -127,7 +130,7 @@ impl OptdDFContext<'_> {
         }
     }
 
-    pub(crate) fn conv_optd_to_df_scalar(
+    pub(crate) fn optd_to_df_scalar(
        pred: &ScalarPlan,
        context: &SchemaRef,
    ) -> anyhow::Result<Arc<dyn PhysicalExpr>> {
@@ -135,8 +138,8 @@ impl OptdDFContext<'_> {
             ScalarOperator::ColumnRef(column_ref) => {
                 let idx = column_ref.column_index.as_i64().unwrap() as usize;
                 Ok(Arc::new(
-                    // Datafusion checks if col expr name matches the schema, so we have to supply the name inferred by datafusion,
-                    // instead of using out own logical properties
+                    // Datafusion checks if col expr name matches the schema, so we have to supply
+                    // the name inferred by datafusion, instead of using our own logical properties.
                     Column::new(context.fields()[idx].name(), idx),
                 ))
             }
@@ -151,11 +154,12 @@ impl OptdDFContext<'_> {
                     OperatorData::Struct(..) => todo!(),
                     OperatorData::Array(_) => todo!(),
                 };
+
                 Ok(Arc::new(Literal::new(value)))
             }
             ScalarOperator::BinaryOp(binary_op) => {
-                let left = Self::conv_optd_to_df_scalar(&binary_op.left, context)?;
-                let right = Self::conv_optd_to_df_scalar(&binary_op.right, context)?;
+                let left = Self::optd_to_df_scalar(&binary_op.left, context)?;
+                let right = Self::optd_to_df_scalar(&binary_op.right, context)?;
                 // TODO(yuchen): really need the enums!
                 let op = match binary_op.kind.as_str().unwrap() {
                     "add" => Operator::Plus,
                     "minus" => Operator::Minus,
                     "equal" => Operator::Eq,
                     s => panic!("Unsupported binary operator: {}", s),
                 };
+
                 Ok(Arc::new(BinaryExpr::new(left, op, right)) as Arc<dyn PhysicalExpr>)
             }
             ScalarOperator::UnaryOp(unary_op) => {
-                let child = Self::conv_optd_to_df_scalar(&unary_op.child, context)?;
+                let child = Self::optd_to_df_scalar(&unary_op.child, context)?;
                 // TODO(yuchen): really need the enums!
                match unary_op.kind.as_str().unwrap() {
                    "not" => Ok(Arc::new(NotExpr::new(child)) as Arc<dyn PhysicalExpr>),
@@ -181,14 +186,14 @@ impl OptdDFContext<'_> {
                     s => bail!("Unsupported logic operator: {}", s),
                 };
                 let mut children = logic_op.children.iter();
-                let first_child = Self::conv_optd_to_df_scalar(
+                let first_child = Self::optd_to_df_scalar(
                     children
                         .next()
                         .expect("LogicOp should have at least one child"),
                     context,
                 )?;
                 children.try_fold(first_child, |acc, expr| {
-                    let expr = Self::conv_optd_to_df_scalar(expr, context)?;
+                    let expr = Self::optd_to_df_scalar(expr, context)?;
                     Ok(Arc::new(BinaryExpr::new(acc, op, expr)) as Arc<dyn PhysicalExpr>)
                 })
             }
diff --git a/optd-datafusion/src/converter/into_optd.rs b/optd-datafusion/src/converter/into_optd.rs
index bceff54..1be9f7d 100644
--- a/optd-datafusion/src/converter/into_optd.rs
+++ b/optd-datafusion/src/converter/into_optd.rs
@@ -1,9 +1,7 @@
-use std::sync::Arc;
-
 use anyhow::bail;
 use datafusion::{
     common::DFSchema,
-    logical_expr::{utils::conjunction, LogicalPlan as DFLogicalPlan, Operator},
+    logical_expr::{utils::conjunction, LogicalPlan as DataFusionLogicalPlan, Operator},
     prelude::Expr,
 };
 use optd_core::{
@@ -21,12 +19,106 @@ use optd_core::{
     },
     plans::{logical::LogicalPlan, scalar::ScalarPlan},
 };
+use std::sync::Arc;
+
+use super::OptdDataFusionContext;
+
+impl OptdDataFusionContext<'_> {
+    /// Given a DataFusion logical plan, returns an `optd` [`LogicalPlan`].
+    pub(crate) fn df_to_optd_relational(
+        &mut self,
+        df_logical_plan: &DataFusionLogicalPlan,
+    ) -> anyhow::Result<Arc<LogicalPlan>> {
+        let operator = match df_logical_plan {
+            DataFusionLogicalPlan::TableScan(table_scan) => {
+                let table_name = table_scan.table_name.to_quoted_string();
+
+                // Record the table name and source into the context.
+                self.tables.insert(table_name, table_scan.source.clone());
+
+                let combine_filters = conjunction(table_scan.filters.to_vec());
+                let predicate = match combine_filters {
+                    Some(df_expr) => {
+                        let schema = DFSchema::try_from(table_scan.source.schema()).unwrap();
+                        Self::df_to_optd_scalar(&df_expr, &schema, 0)?
+                    }
+                    None => Arc::new(ScalarPlan {
+                        operator: ScalarOperator::Constant(Constant {
+                            value: OperatorData::Bool(true),
+                        }),
+                    }),
+                };
+
+                LogicalOperator::Scan(Scan::new(
+                    &table_scan.table_name.to_quoted_string(),
+                    predicate,
+                ))
+            }
+            DataFusionLogicalPlan::Projection(projection) => {
+                let child = self.df_to_optd_relational(projection.input.as_ref())?;
+
+                let exprs = projection
+                    .expr
+                    .iter()
+                    .map(|expr| Self::df_to_optd_scalar(expr, projection.input.schema(), 0))
+                    .collect::<anyhow::Result<Vec<_>>>()?;
+
+                LogicalOperator::Project(Project {
+                    child,
+                    fields: exprs,
+                })
+            }
+            DataFusionLogicalPlan::Filter(df_filter) => LogicalOperator::Filter(Filter {
+                child: self.df_to_optd_relational(&df_filter.input)?,
+                predicate: Self::df_to_optd_scalar(
+                    &df_filter.predicate,
+                    df_filter.input.schema(),
+                    0,
+                )?,
+            }),
+            DataFusionLogicalPlan::Join(join) => {
+                let mut join_cond = join
+                    .on
+                    .iter()
+                    .map(|(left, right)| {
+                        let left = Self::df_to_optd_scalar(left, join.left.schema(), 0)?;
+                        let offset = join.left.schema().fields().len();
+                        let right = Self::df_to_optd_scalar(right, join.right.schema(), offset)?;
+                        Ok(Arc::new(ScalarPlan {
+                            operator: binary_op::equal(left, right),
+                        }))
+                    })
+                    .collect::<anyhow::Result<Vec<_>>>()?;
 
-use super::OptdDFContext;
+                if let Some(filter) = &join.filter {
+                    let filter =
+                        Self::df_to_optd_scalar(filter, df_logical_plan.schema().as_ref(), 0)?;
+                    join_cond.push(filter);
+                }
+
+                if join_cond.is_empty() {
+                    join_cond.push(Arc::new(ScalarPlan {
+                        operator: constants::boolean(true),
+                    }));
+                }
 
-impl OptdDFContext<'_> {
-    /// The col_offset is an offset added to the column index for all column references. It is useful for joins.
-    pub(crate) fn conv_df_to_optd_scalar(
+                LogicalOperator::Join(Join::new(
+                    &join.join_type.to_string(),
+                    self.df_to_optd_relational(&join.left)?,
+                    self.df_to_optd_relational(&join.right)?,
+                    Self::flatten_scalar_as_conjunction(&join_cond, 0),
+                ))
+            }
+            logical_plan => bail!("optd does not support this operator {:?}", logical_plan),
+        };
+        Ok(Arc::new(LogicalPlan { operator }))
+    }
+
+    /// Given a DataFusion [`Expr`], returns an `optd` [`ScalarPlan`].
+    ///
+    /// The `col_offset` input is an offset added to the column index for all column references,
+    /// which is useful for joins.
+    pub(crate) fn df_to_optd_scalar(
         df_expr: &Expr,
         context: &DFSchema,
         col_offset: usize,
@@ -34,7 +126,7 @@ impl OptdDFContext<'_> {
         let operator = match df_expr {
             Expr::Column(column) => ScalarOperator::ColumnRef(ColumnRef {
                 column_index: OperatorData::Int64(
-                    (context.index_of_column(column).unwrap() + col_offset) as i64,
+                    (context.index_of_column(column)? + col_offset) as i64,
                 ),
             }),
             Expr::Literal(scalar_value) => match scalar_value {
@@ -52,8 +144,8 @@ impl OptdDFContext<'_> {
                 _ => panic!("optd only supports a limited number of literals"),
             },
             Expr::BinaryExpr(binary_expr) => {
-                let left = Self::conv_df_to_optd_scalar(&binary_expr.left, context, col_offset)?;
-                let right = Self::conv_df_to_optd_scalar(&binary_expr.right, context, col_offset)?;
+                let left = Self::df_to_optd_scalar(&binary_expr.left, context, col_offset)?;
+                let right = Self::df_to_optd_scalar(&binary_expr.right, context, col_offset)?;
                 match binary_expr.op {
                     Operator::Plus => binary_op::add(left, right),
                     Operator::Minus => binary_op::minus(left, right),
@@ -64,13 +156,11 @@ impl OptdDFContext<'_> {
                     _ => todo!(),
                 }
             }
-            Expr::Not(expr) => unary_op::not(Self::conv_df_to_optd_scalar(
-                expr.as_ref(),
-                context,
-                col_offset,
-            )?),
+            Expr::Not(expr) => {
+                unary_op::not(Self::df_to_optd_scalar(expr.as_ref(), context, col_offset)?)
+            }
             Expr::Cast(cast) => {
-                return Self::conv_df_to_optd_scalar(&cast.expr, context, col_offset);
+                return Self::df_to_optd_scalar(&cast.expr, context, col_offset);
             }
             _ => panic!(
                 "optd does not support this scalar expression: {:#?}",
@@ -81,104 +171,21 @@ impl OptdDFContext<'_> {
         Ok(Arc::new(ScalarPlan { operator }))
     }
 
+    /// Flattens a vector of scalar plans into a single scalar conjunction tree. The `left_index`
+    /// parameter specifies the index of the left side of the conjunction.
     fn flatten_scalar_as_conjunction(
-        join_cond: Vec<Arc<ScalarPlan>>,
-        idx: usize,
+        join_cond: &[Arc<ScalarPlan>],
+        left_index: usize,
     ) -> Arc<ScalarPlan> {
-        if idx == join_cond.len() - 1 {
-            join_cond[idx].clone()
+        if left_index == join_cond.len() - 1 {
+            join_cond[left_index].clone()
         } else {
             Arc::new(ScalarPlan {
                 operator: logic_op::and(vec![
-                    join_cond[idx].clone(),
-                    Self::flatten_scalar_as_conjunction(join_cond.clone(), idx + 1),
+                    join_cond[left_index].clone(),
+                    Self::flatten_scalar_as_conjunction(join_cond, left_index + 1),
                 ]),
             })
         }
     }
-
-    pub(crate) fn conv_df_to_optd_relational(
-        &mut self,
-        df_logical_plan: &DFLogicalPlan,
-    ) -> anyhow::Result<Arc<LogicalPlan>> {
-        let operator = match df_logical_plan {
-            DFLogicalPlan::Filter(df_filter) => LogicalOperator::Filter(Filter {
-                child: self.conv_df_to_optd_relational(&df_filter.input)?,
-                predicate: Self::conv_df_to_optd_scalar(
-                    &df_filter.predicate,
-                    df_filter.input.schema(),
-                    0,
-                )?,
-            }),
-            DFLogicalPlan::Join(join) => {
-                let mut join_cond = Vec::new();
-                for (left, right) in &join.on {
-                    let left = Self::conv_df_to_optd_scalar(left, join.left.schema(), 0)?;
-                    let offset = join.left.schema().fields().len();
-                    let right = Self::conv_df_to_optd_scalar(right, join.right.schema(), offset)?;
-                    join_cond.push(Arc::new(ScalarPlan {
-                        operator: binary_op::equal(left, right),
-                    }));
-                }
-                if let Some(filter) = &join.filter {
-                    let filter =
-                        Self::conv_df_to_optd_scalar(filter, df_logical_plan.schema().as_ref(), 0)?;
-                    join_cond.push(filter);
-                }
-                if join_cond.is_empty() {
-                    join_cond.push(Arc::new(ScalarPlan {
-                        operator: constants::boolean(true),
-                    }));
-                }
-
-                LogicalOperator::Join(Join::new(
-                    &join.join_type.to_string(),
-                    self.conv_df_to_optd_relational(&join.left)?,
-                    self.conv_df_to_optd_relational(&join.right)?,
-                    Self::flatten_scalar_as_conjunction(join_cond, 0),
-                ))
-            }
-            DFLogicalPlan::TableScan(table_scan) => {
-                let table_name = table_scan.table_name.to_quoted_string();
-
-                let combine_filters = conjunction(table_scan.filters.to_vec());
-                let scan = LogicalOperator::Scan(Scan::new(
-                    &table_scan.table_name.to_quoted_string(),
-                    match combine_filters {
-                        Some(df_expr) => {
-                            let schema = DFSchema::try_from(table_scan.source.schema()).unwrap();
-                            Self::conv_df_to_optd_scalar(&df_expr, &schema, 0)?
-                        }
-                        None => Arc::new(ScalarPlan {
-                            operator: ScalarOperator::Constant(Constant {
-                                value: OperatorData::Bool(true),
-                            }),
-                        }),
-                    },
-                ));
-
-                self.tables.insert(table_name, table_scan.source.clone());
-
-                scan
-            }
-            DFLogicalPlan::Projection(projection) => {
-                let input = self.conv_df_to_optd_relational(projection.input.as_ref())?;
-                let mut exprs = Vec::new();
-                for expr in &projection.expr {
-                    exprs.push(Self::conv_df_to_optd_scalar(
-                        expr,
-                        projection.input.schema(),
-                        0,
-                    )?);
-                }
-
-                LogicalOperator::Project(Project {
-                    child: input,
-                    fields: exprs,
-                })
-            }
-            logical_plan => bail!("optd does not support this operator {:?}", logical_plan),
-        };
-        Ok(Arc::new(LogicalPlan { operator }))
-    }
 }
diff --git a/optd-datafusion/src/converter/mod.rs b/optd-datafusion/src/converter/mod.rs
index 6349f51..04cfa42 100644
--- a/optd-datafusion/src/converter/mod.rs
+++ b/optd-datafusion/src/converter/mod.rs
@@ -1,30 +1,22 @@
-use std::{collections::HashMap, sync::Arc};
-
 use datafusion::{execution::SessionState, logical_expr::TableSource};
+use std::{collections::HashMap, sync::Arc};
 
 pub mod from_optd;
 pub mod into_optd;
 
 /// A context for converting between optd and datafusion.
-/// The map is used to lookup table sources when converting TableScan operators from optd to datafusion.
-pub(crate) struct OptdDFContext<'a> {
+/// The map is used to lookup table sources when converting TableScan operators from optd to
+/// datafusion.
+pub(crate) struct OptdDataFusionContext<'a> {
     /// Maps table names to table sources.
     pub tables: HashMap<String, Arc<dyn TableSource>>,
+    /// DataFusion session state.
     pub session_state: &'a SessionState,
 }
 
-impl OptdDFContext<'_> {
-    /// Creates a new `OptdDFContext` with the provided session state.
-    ///
-    /// # Arguments
-    ///
-    /// * `session_state` - A reference to the `SessionState` used for conversions.
-    ///
-    /// # Returns
-    ///
-    /// A `OptdDFContext` containing an empty table map and the provided session state.
-    pub(crate) fn new(session_state: &SessionState) -> OptdDFContext {
-        OptdDFContext {
+impl OptdDataFusionContext<'_> {
+    /// Creates a new empty `OptdDataFusionContext` with the provided session state.
+    pub(crate) fn new(session_state: &SessionState) -> OptdDataFusionContext {
+        OptdDataFusionContext {
             tables: HashMap::new(),
             session_state,
         }
diff --git a/optd-datafusion/src/mock.rs b/optd-datafusion/src/mock.rs
index f9e245b..1e3574d 100644
--- a/optd-datafusion/src/mock.rs
+++ b/optd-datafusion/src/mock.rs
@@ -1,4 +1,4 @@
-use crate::converter::OptdDFContext;
+use crate::converter::OptdDataFusionContext;
 use async_trait::async_trait;
 use datafusion::{
     common::Result as DataFusionResult,
@@ -111,11 +111,11 @@ impl QueryPlanner for MockOptdOptimizer {
             ));
         }
 
-        let mut converter = OptdDFContext::new(session_state);
+        let mut converter = OptdDataFusionContext::new(session_state);
 
         // convert the DataFusion logical plan to `optd`'s version of a `LogicalPlan`.
         let logical_plan = converter
-            .conv_df_to_optd_relational(datafusion_logical_plan)
+            .df_to_optd_relational(datafusion_logical_plan)
            .expect("TODO FIX ERROR HANDLING");
 
         // Run the `optd` optimizer on the `LogicalPlan`.
@@ -126,7 +126,7 @@ impl QueryPlanner for MockOptdOptimizer {
             .expect("TODO FIX ERROR HANDLING");
 
         // Convert the output `optd` `PhysicalPlan` to DataFusion's `ExecutionPlan`.
         let physical_plan = converter
-            .conv_optd_to_df_relational(&optd_optimized_physical_plan)
+            .optd_to_df_relational(&optd_optimized_physical_plan)
             .await
             .expect("TODO FIX ERROR HANDLING");

From b333d4b0cfba57c433b02889f14a3094e1e5cbd8 Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Sun, 2 Mar 2025 16:18:18 -0500
Subject: [PATCH 6/7] add readme

---
 optd-datafusion/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 optd-datafusion/README.md

diff --git a/optd-datafusion/README.md b/optd-datafusion/README.md
new file mode 100644
index 0000000..4330c71
--- /dev/null
+++ b/optd-datafusion/README.md
@@ -0,0 +1,8 @@
+# Demo
+
+To run the demo, execute the following command:
+
+```sh
+$ cargo run -p optd-datafusion --example demo -- <file>.sql
+$ cargo run -p optd-datafusion --example demo -- optd-datafusion/sql/test_join.sql
+```

From e321bd5419c646c2d0798a53ce7e0e7c7ded9a45 Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Sun, 2 Mar 2025 16:32:32 -0500
Subject: [PATCH 7/7] rename to OptdContext

---
 optd-datafusion/src/converter/from_optd.rs |  6 ++---
 optd-datafusion/src/converter/into_optd.rs |  4 ++--
 optd-datafusion/src/converter/mod.rs       | 27 +++++++++++++++-------
 optd-datafusion/src/mock.rs                |  8 +++----
 4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/optd-datafusion/src/converter/from_optd.rs b/optd-datafusion/src/converter/from_optd.rs
index dc7cce8..9de171c 100644
--- a/optd-datafusion/src/converter/from_optd.rs
+++ b/optd-datafusion/src/converter/from_optd.rs
@@ -20,9 +20,9 @@ use optd_core::{
 };
 use std::{collections::HashMap, str::FromStr, sync::Arc};
 
-use super::OptdDataFusionContext;
+use super::OptdContext;
 
-impl OptdDataFusionContext<'_> {
+impl OptdContext {
     #[async_recursion]
     pub(crate) async fn optd_to_df_relational(
         &self,
@@ -44,7 +44,7 @@ impl OptdContext {
                 // TODO(yuchen): support filters inside table scan.
                 let filters = vec![];
                 let plan = provider
-                    .scan(self.session_state, None, &filters, None)
+                    .scan(&self.session_state, None, &filters, None)
                     .await?;
 
                 Ok(plan)
diff --git a/optd-datafusion/src/converter/into_optd.rs b/optd-datafusion/src/converter/into_optd.rs
index 1be9f7d..5bce3fa 100644
--- a/optd-datafusion/src/converter/into_optd.rs
+++ b/optd-datafusion/src/converter/into_optd.rs
@@ -21,9 +21,9 @@ use optd_core::{
 };
 use std::sync::Arc;
 
-use super::OptdDataFusionContext;
+use super::OptdContext;
 
-impl OptdDataFusionContext<'_> {
+impl OptdContext {
     /// Given a DataFusion logical plan, returns an `optd` [`LogicalPlan`].
     pub(crate) fn df_to_optd_relational(
         &mut self,
diff --git a/optd-datafusion/src/converter/mod.rs b/optd-datafusion/src/converter/mod.rs
index 04cfa42..ba07afa 100644
--- a/optd-datafusion/src/converter/mod.rs
+++ b/optd-datafusion/src/converter/mod.rs
@@ -1,24 +1,35 @@
 use datafusion::{execution::SessionState, logical_expr::TableSource};
+use std::fmt::Debug;
 use std::{collections::HashMap, sync::Arc};
 
 pub mod from_optd;
 pub mod into_optd;
 
 /// A context for converting between optd and datafusion.
-/// The map is used to lookup table sources when converting TableScan operators from optd to
-/// datafusion.
-pub(crate) struct OptdDataFusionContext<'a> {
+/// The map is used to lookup table sources when converting TableScan operators from optd to
+/// datafusion.
+pub(crate) struct OptdContext {
     /// Maps table names to table sources.
-    pub tables: HashMap<String, Arc<dyn TableSource>>,
+    tables: HashMap<String, Arc<dyn TableSource>>,
     /// DataFusion session state.
-    pub session_state: &'a SessionState,
+    session_state: SessionState,
 }
 
-impl OptdDataFusionContext<'_> {
-    /// Creates a new empty `OptdDataFusionContext` with the provided session state.
-    pub(crate) fn new(session_state: &SessionState) -> OptdDataFusionContext {
-        OptdDataFusionContext {
+impl OptdContext {
+    /// Creates a new empty `OptdContext` with the provided session state.
+    pub(crate) fn new(session_state: &SessionState) -> OptdContext {
+        OptdContext {
             tables: HashMap::new(),
-            session_state,
+            session_state: session_state.clone(),
         }
     }
 }
+
+impl Debug for OptdContext {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("OptdContext")
+            .field("tables", &self.tables.keys())
+            .field("session_state", &self.session_state)
+            .finish()
+    }
+}
diff --git a/optd-datafusion/src/mock.rs b/optd-datafusion/src/mock.rs
index 1e3574d..656ffba 100644
--- a/optd-datafusion/src/mock.rs
+++ b/optd-datafusion/src/mock.rs
@@ -1,4 +1,4 @@
-use crate::converter::OptdDataFusionContext;
+use crate::converter::OptdContext;
 use async_trait::async_trait;
 use datafusion::{
     common::Result as DataFusionResult,
@@ -111,11 +111,11 @@ impl QueryPlanner for MockOptdOptimizer {
             ));
         }
 
-        let mut converter = OptdDataFusionContext::new(session_state);
+        let mut optd_ctx = OptdContext::new(session_state);
 
         // convert the DataFusion logical plan to `optd`'s version of a `LogicalPlan`.
-        let logical_plan = converter
+        let logical_plan = optd_ctx
             .df_to_optd_relational(datafusion_logical_plan)
             .expect("TODO FIX ERROR HANDLING");
 
@@ -125,7 +125,7 @@ impl QueryPlanner for MockOptdOptimizer {
             .expect("TODO FIX ERROR HANDLING");
 
         // Convert the output `optd` `PhysicalPlan` to DataFusion's `ExecutionPlan`.
-        let physical_plan = converter
+        let physical_plan = optd_ctx
             .optd_to_df_relational(&optd_optimized_physical_plan)
             .await
             .expect("TODO FIX ERROR HANDLING");
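
Taken together, the conversion-layer patches above compose into a single planning pipeline inside `MockOptdOptimizer::plan`. The following is a minimal sketch of that round trip restated outside diff form; it assumes the post-patch-7 `OptdContext` API shown above, and `mock_optimize` is a hypothetical stand-in for whatever optimizer entry point `optd_core` actually exposes (the real call lives on `MockOptdOptimizer` and is elided here):

```rust
use std::sync::Arc;

use datafusion::execution::SessionState;
use datafusion::logical_expr::LogicalPlan as DataFusionLogicalPlan;
use datafusion::physical_plan::ExecutionPlan;
use optd_core::plans::{logical::LogicalPlan, physical::PhysicalPlan};

use crate::converter::OptdContext;

/// Hypothetical stand-in for the `optd` optimizer call; the real
/// implementation is provided by `optd-core`, not shown in these patches.
async fn mock_optimize(_plan: Arc<LogicalPlan>) -> anyhow::Result<Arc<PhysicalPlan>> {
    unimplemented!("replace with the actual optd optimizer entry point")
}

/// The round trip performed by `MockOptdOptimizer::plan` after patch 7/7:
/// DataFusion logical plan -> optd logical plan -> optimized optd physical
/// plan -> DataFusion `ExecutionPlan`.
async fn plan_with_optd(
    session_state: &SessionState,
    df_plan: &DataFusionLogicalPlan,
) -> anyhow::Result<Arc<dyn ExecutionPlan>> {
    // The context records table sources during the into-optd pass so that the
    // from-optd pass can resolve `TableScan` operators against them again.
    let mut optd_ctx = OptdContext::new(session_state);

    let logical_plan = optd_ctx.df_to_optd_relational(df_plan)?;
    let physical_plan = mock_optimize(logical_plan).await?;
    optd_ctx.optd_to_df_relational(&physical_plan).await
}
```

Note the design choice this sketch makes visible: because patch 7/7 gives `OptdContext` an owned `SessionState` (cloned in `new`), the context no longer borrows from the planner and both conversion passes can share one mutable context value.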