Skip to content

Commit

Permalink
catalog.has_header true by default
Browse files Browse the repository at this point in the history
  • Loading branch information
korowa committed Aug 15, 2024
1 parent e24a5dd commit 57fbd5e
Show file tree
Hide file tree
Showing 13 changed files with 62 additions and 26 deletions.
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ config_namespace! {

/// Default value for `format.has_header` for `CREATE EXTERNAL TABLE`
/// if not specified explicitly in the statement.
pub has_header: bool, default = false
pub has_header: bool, default = true

/// Specifies whether newlines in (quoted) CSV values are supported.
///
Expand Down
2 changes: 1 addition & 1 deletion datafusion/common/src/file_options/csv_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {

fn try_from(value: &CsvOptions) -> Result<Self> {
let mut builder = WriterBuilder::default()
.with_header(value.has_header.unwrap_or(false))
.with_header(value.has_header.unwrap_or(true))
.with_quote(value.quote)
.with_delimiter(value.delimiter);

Expand Down
23 changes: 21 additions & 2 deletions datafusion/core/src/datasource/file_format/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,15 +369,34 @@ impl FileFormat for CsvFormat {
async fn create_writer_physical_plan(
&self,
input: Arc<dyn ExecutionPlan>,
_state: &SessionState,
state: &SessionState,
conf: FileSinkConfig,
order_requirements: Option<Vec<PhysicalSortRequirement>>,
) -> Result<Arc<dyn ExecutionPlan>> {
if conf.overwrite {
return not_impl_err!("Overwrites are not implemented yet for CSV");
}

let writer_options = CsvWriterOptions::try_from(&self.options)?;
// The `has_header` and `newlines_in_values` fields of `CsvOptions` may
// inherit their values from the session configuration settings. To support
// this, the writer options are built from a copy of `self.options` in which
// these two fields have been resolved to concrete values.
let has_header = self
.options()
.has_header
.unwrap_or(state.config_options().catalog.has_header);
let newlines_in_values = self
.options()
.newlines_in_values
.unwrap_or(state.config_options().catalog.newlines_in_values);

let options = self
.options()
.clone()
.with_has_header(has_header)
.with_newlines_in_values(newlines_in_values);

let writer_options = CsvWriterOptions::try_from(&options)?;

let sink_schema = conf.output_schema().clone();
let sink = Arc::new(CsvSink::new(conf, writer_options));
Expand Down
13 changes: 10 additions & 3 deletions datafusion/core/tests/user_defined/user_defined_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,11 @@ async fn exec_sql(ctx: &SessionContext, sql: &str) -> Result<String> {

/// Create a test table.
async fn setup_table(ctx: SessionContext) -> Result<SessionContext> {
let sql = "CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT) STORED AS CSV location 'tests/data/customer.csv'";
let sql = "
CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT)
STORED AS CSV location 'tests/data/customer.csv'
OPTIONS('format.has_header' 'false')
";

let expected = vec!["++", "++"];

Expand All @@ -125,8 +129,11 @@ async fn setup_table(ctx: SessionContext) -> Result<SessionContext> {
}

async fn setup_table_without_schemas(ctx: SessionContext) -> Result<SessionContext> {
let sql =
"CREATE EXTERNAL TABLE sales STORED AS CSV location 'tests/data/customer.csv'";
let sql = "
CREATE EXTERNAL TABLE sales
STORED AS CSV location 'tests/data/customer.csv'
OPTIONS('format.has_header' 'false')
";

let expected = vec!["++", "++"];

Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS

# validate folder of csv files
statement ok
CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.compression' 'gzip');
CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.has_header' false, 'format.compression' gzip);

query IT
select * from validate_csv;
Expand All @@ -427,7 +427,7 @@ select * from validate_csv;

# Copy from table to single csv
query I
COPY source_table to 'test_files/scratch/copy/table.csv';
COPY source_table to 'test_files/scratch/copy/table.csv' OPTIONS ('format.has_header' false);
----
2

Expand Down Expand Up @@ -478,7 +478,7 @@ query I
COPY source_table
to 'test_files/scratch/copy/table_csv_with_options'
STORED AS CSV OPTIONS (
'format.has_header' false,
'format.has_header' true,
'format.compression' uncompressed,
'format.datetime_format' '%FT%H:%M:%S.%9f',
'format.delimiter' ';',
Expand Down
11 changes: 6 additions & 5 deletions datafusion/sqllogictest/test_files/csv_files.slt
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,14 @@ CREATE TABLE src_table_2 (

query I
COPY src_table_1 TO 'test_files/scratch/csv_files/csv_partitions/1.csv'
STORED AS CSV;
STORED AS CSV OPTIONS ('format.has_header' 'false');
----
4


query I
COPY src_table_2 TO 'test_files/scratch/csv_files/csv_partitions/2.csv'
STORED AS CSV;
STORED AS CSV OPTIONS ('format.has_header' 'false');
----
4

Expand Down Expand Up @@ -210,7 +210,7 @@ COPY (VALUES
('#second line is a comment'),
('2,3'))
TO 'test_files/scratch/csv_files/file_with_comments.csv'
OPTIONS ('format.delimiter' '|');
OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE stored_table_with_comments (
Expand All @@ -219,7 +219,8 @@ CREATE EXTERNAL TABLE stored_table_with_comments (
) STORED AS CSV
LOCATION 'test_files/scratch/csv_files/file_with_comments.csv'
OPTIONS ('format.comment' '#',
'format.delimiter' ',');
'format.delimiter' ',',
'format.has_header' 'false');

query TT
SELECT * from stored_table_with_comments;
Expand Down Expand Up @@ -315,7 +316,7 @@ col1 TEXT,
col2 TEXT
) STORED AS CSV
LOCATION '../core/tests/data/newlines_in_values.csv'
OPTIONS ('format.newlines_in_values' 'true');
OPTIONS ('format.newlines_in_values' 'true', 'format.has_header' 'false');

query TT
select * from stored_table_with_newlines_in_values_safe;
Expand Down
7 changes: 5 additions & 2 deletions datafusion/sqllogictest/test_files/ddl.slt
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,9 @@ statement ok
CREATE EXTERNAL TABLE csv_with_timestamps (
name VARCHAR,
ts TIMESTAMP
) STORED AS CSV LOCATION '../core/tests/data/timestamps.csv';
) STORED AS CSV
LOCATION '../core/tests/data/timestamps.csv'
OPTIONS('format.has_header' 'false');

query TP
SELECT * from csv_with_timestamps
Expand All @@ -496,7 +498,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps (
)
STORED AS CSV
PARTITIONED BY (c_date)
LOCATION '../core/tests/data/partitioned_table';
LOCATION '../core/tests/data/partitioned_table'
OPTIONS('format.has_header' 'false');

query TPD
SELECT * from csv_with_timestamps where c_date='2018-11-13'
Expand Down
3 changes: 2 additions & 1 deletion datafusion/sqllogictest/test_files/group_by.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4264,7 +4264,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps (
)
STORED AS CSV
WITH ORDER (ts DESC)
LOCATION '../core/tests/data/timestamps.csv';
LOCATION '../core/tests/data/timestamps.csv'
OPTIONS('format.has_header' 'false');

# below query should run since it operates on a bounded source and have a sort
# at the top of its plan.
Expand Down
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ datafusion.catalog.create_default_catalog_and_schema true
datafusion.catalog.default_catalog datafusion
datafusion.catalog.default_schema public
datafusion.catalog.format NULL
datafusion.catalog.has_header false
datafusion.catalog.has_header true
datafusion.catalog.information_schema true
datafusion.catalog.location NULL
datafusion.catalog.newlines_in_values false
Expand Down Expand Up @@ -255,7 +255,7 @@ datafusion.catalog.create_default_catalog_and_schema true Whether the default ca
datafusion.catalog.default_catalog datafusion The default catalog name - this impacts what SQL queries use if not specified
datafusion.catalog.default_schema public The default schema name - this impacts what SQL queries use if not specified
datafusion.catalog.format NULL Type of `TableProvider` to use when loading `default` schema
datafusion.catalog.has_header false Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.
datafusion.catalog.has_header true Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.
datafusion.catalog.information_schema true Should DataFusion provide access to `information_schema` virtual tables for displaying schema information
datafusion.catalog.location NULL Location scanned to load tables for `default` schema
datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/limit.slt
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ drop table aggregate_test_100;
query I
COPY (select * from (values
(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')
)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV;
)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV OPTIONS ('format.has_header' 'false');
----
5

Expand Down
6 changes: 4 additions & 2 deletions datafusion/sqllogictest/test_files/order.slt
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ NULL three

statement ok
CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
OPTIONS('format.has_header' 'false');

# Demonstrate types
query TTT
Expand Down Expand Up @@ -463,7 +464,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps (
)
STORED AS CSV
WITH ORDER (ts ASC NULLS LAST)
LOCATION '../core/tests/data/timestamps.csv';
LOCATION '../core/tests/data/timestamps.csv'
OPTIONS('format.has_header' 'false');

query TT
EXPLAIN SELECT DATE_BIN(INTERVAL '15 minutes', ts, TIMESTAMP '2022-08-03 14:40:00Z') as db15
Expand Down
6 changes: 4 additions & 2 deletions datafusion/sqllogictest/test_files/projection.slt
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@ CREATE TABLE cpu_load_short(host STRING NOT NULL) AS VALUES

statement ok
CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
OPTIONS('format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE test_simple (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv'
OPTIONS('format.has_header' 'false');

# projection same fields
query I rowsort
Expand Down
3 changes: 2 additions & 1 deletion datafusion/sqllogictest/test_files/window.slt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ OPTIONS ('format.has_header' 'true');
### execute_with_partition with 4 partitions
statement ok
CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
OPTIONS('format.has_header' 'false');


# for window functions without order by the first, last, and nth function call does not make sense
Expand Down

0 comments on commit 57fbd5e

Please sign in to comment.