Skip to content

Commit

Permalink
Fix file names with splitting and no-empty-files
Browse files Browse the repository at this point in the history
  • Loading branch information
pacman82 committed Feb 21, 2024
1 parent 9f46209 commit cb28bc3
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "odbc2parquet"
version = "5.1.0"
version = "5.1.1"
authors = ["Markus Klein"]
edition = "2021"
repository = "https://github.com/pacman82/odbc2parquet"
Expand Down
4 changes: 4 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## 5.1.1

* Fix: 5.1.0 introduced a regression, which caused output file enumeration to be start with a suffix of `2` instead of `1` if in addition to file splitting the `--no-empty-file` flag had also been set.

## 5.1.0

* Additional log message with info level emitting the number of total number of rows written, the file size and the path for each file.
Expand Down
10 changes: 5 additions & 5 deletions src/query/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,22 +114,22 @@ impl FileWriter {

let current_path = Self::current_path(&path, suffix)?;

let current_file = if !options.no_empty_file {
Some(CurrentFile::new(
let (current_file, num_file) = if !options.no_empty_file {
(Some(CurrentFile::new(
current_path,
schema.clone(),
properties.clone(),
)?)
)?), 1)
} else {
None
(None, 0)
};

Ok(Self {
base_path: path,
schema,
properties,
file_size: options.file_size,
num_file: 1,
num_file,
suffix_length: options.suffix_length,
current_file,
})
Expand Down
80 changes: 80 additions & 0 deletions tests/integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,37 @@ fn no_empty_file_works_with_split_files() {
);
}

/// If the query does not come back empty, of course a file should be created, even if the
/// `--no-empty-file` flag is set.
#[test]
fn emit_file_despite_no_empty_file_set() {
// A temporary directory, to be removed at the end of the test.
let out_dir = tempdir().unwrap();
// The name of the output parquet file we are going to write. Since it is in a temporary
// directory it will not outlive the end of the test.
let out_path = out_dir.path().join("out.par");
// We need to pass the output path as a string argument.
let out_str = out_path.to_str().expect("Temporary file path must be utf8");

let query = "SELECT 42 as a";

Command::cargo_bin("odbc2parquet")
.unwrap()
.args([
"-vvvv",
"query",
"--connection-string",
MSSQL,
out_str,
query,
])
.assert()
.success();

let expected = "{a: 42}\n";
parquet_read_out(out_str).stdout(eq(expected));
}

/// Should read query from standard input if "-" is provided as query text.
#[test]
fn read_query_from_stdin() {
Expand Down Expand Up @@ -1387,6 +1418,55 @@ fn split_files_on_num_row_groups() {
parquet_read_out(out_dir.path().join("out_03.par").to_str().unwrap());
}

/// Verify naming of the files is with successive numbers starting from 1 to 3 with split files and
/// `--no-empty-file` flag set. This was messed up, with a refactoring once and file names started
/// with `2` instead of `1``.
#[test]
fn file_name_no_empty_file_and_split_files() {
// Setup table for test
let table_name = "FileNameNoEmptyFileAndSplitFiles";
let conn = ENV
.connect_with_connection_string(MSSQL, ConnectionOptions::default())
.unwrap();
setup_empty_table_mssql(&conn, table_name, &["INTEGER"]).unwrap();
let insert = format!("INSERT INTO {table_name} (A) VALUES(1),(2),(3)");
conn.execute(&insert, ()).unwrap();

// A temporary directory, to be removed at the end of the test.
let out_dir = tempdir().unwrap();
// The name of the output parquet file we are going to write. Since it is in a temporary
// directory it will not outlive the end of the test.
let out_path = out_dir.path().join("out.par");
// We need to pass the output path as a string argument.
let out_str = out_path.to_str().expect("Temporary file path must be utf8");

let query = format!("SELECT a FROM {table_name}");

Command::cargo_bin("odbc2parquet")
.unwrap()
.args([
"-vvvv",
"query",
out_str,
"--connection-string",
MSSQL,
"--no-empty-file",
"--batch-size-row",
"1",
"--row-groups-per-file",
"1",
&query,
])
.assert()
.success();

// Expect one file per row in table (3)

parquet_read_out(out_dir.path().join("out_01.par").to_str().unwrap());
parquet_read_out(out_dir.path().join("out_02.par").to_str().unwrap());
parquet_read_out(out_dir.path().join("out_03.par").to_str().unwrap());
}

#[test]
fn split_files_on_size_limit() {
// Setup table for test
Expand Down

0 comments on commit cb28bc3

Please sign in to comment.