From 6b9380838c3d0da269e9d7bed98bf406bf68d1df Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Sat, 4 Nov 2023 16:09:42 -0600
Subject: [PATCH 1/2] Force exception on None in non-nullable column, clean
 up test warnings

---
 .github/workflows/on_push.yml               |  2 +
 CHANGELOG.md                                |  6 ++
 clickhouse_connect/__version__.py           |  2 +-
 clickhouse_connect/cc_sqlalchemy/dialect.py |  1 +
 clickhouse_connect/datatypes/string.py      |  4 +-
 clickhouse_connect/driver/dataconv.py       |  7 +-
 clickhouse_connect/driverc/dataconv.pyx     | 91 ++++++++++---------
 tests/integration_tests/test_pandas.py      | 15 ++-
 .../test_sqlalchemy/test_basics.py          | 20 ++--
 9 files changed, 90 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/on_push.yml b/.github/workflows/on_push.yml
index 601de6f2..13fe7156 100644
--- a/.github/workflows/on_push.yml
+++ b/.github/workflows/on_push.yml
@@ -98,6 +98,7 @@ jobs:
         env:
           CLICKHOUSE_CONNECT_TEST_DOCKER: 'False'
           CLICKHOUSE_CONNECT_TEST_FUZZ: 50
+          SQLALCHEMY_SILENCE_UBER_WARNING: 1
         run: pytest tests

   cloud-tests:
@@ -134,4 +135,5 @@ jobs:
           CLICKHOUSE_CONNECT_TEST_INSERT_QUORUM: 3
           CLICKHOUSE_CONNECT_TEST_HOST: ${{ secrets.INTEGRATIONS_TEAM_TESTS_CLOUD_HOST }}
           CLICKHOUSE_CONNECT_TEST_PASSWORD: ${{ secrets.INTEGRATIONS_TEAM_TESTS_CLOUD_PASSWORD }}
+          SQLALCHEMY_SILENCE_UBER_WARNING: 1
         run: pytest tests/integration_tests
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6990d2c9..fa8eb5c6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,12 @@ In any case, this should not affect the basic usage of Superset with ClickHouse
 your Superset installation, the ClickHouse datasource will be available with either the enhanced connection dialog or
 a standard SqlAlchemy DSN in the form of `clickhousedb://{username}:{password}@{host}:{port}`.

+## 0.6.19, TBD
+### Bug Fix
+- In some circumstances it was possible to insert a `None` value into a non-Nullable String column. As this could mask
+invalid input data, any attempt to insert None into a non-Nullable String or LowCardinality(String) will now throw
+a DataError
+
 ## 0.6.18, 2023-10-25
 ### Bug Fixes
 - Reduce the estimated insert block size from 16-32MB to 1-2MB for large inserts. The large data transfers could cause
diff --git a/clickhouse_connect/__version__.py b/clickhouse_connect/__version__.py
index 15ce74c8..3b0f77b5 100644
--- a/clickhouse_connect/__version__.py
+++ b/clickhouse_connect/__version__.py
@@ -1 +1 @@
-version = '0.6.18'
+version = '0.6.19'
diff --git a/clickhouse_connect/cc_sqlalchemy/dialect.py b/clickhouse_connect/cc_sqlalchemy/dialect.py
index 5eb1ed01..6b04c7e2 100644
--- a/clickhouse_connect/cc_sqlalchemy/dialect.py
+++ b/clickhouse_connect/cc_sqlalchemy/dialect.py
@@ -22,6 +22,7 @@ class ClickHouseDialect(DefaultDialect):
     default_schema_name = 'default'
     supports_native_decimal = True
     supports_native_boolean = True
+    supports_statement_cache = False
    returns_unicode_strings = True
     postfetch_lastrowid = False
     ddl_compiler = ChDDLCompiler
diff --git a/clickhouse_connect/datatypes/string.py b/clickhouse_connect/datatypes/string.py
index 7f6b7a5a..0b5a81cb 100644
--- a/clickhouse_connect/datatypes/string.py
+++ b/clickhouse_connect/datatypes/string.py
@@ -45,9 +45,9 @@ def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence:
     # pylint: disable=duplicate-code,too-many-nested-blocks,too-many-branches
     def _write_column_binary(self, column: Union[Sequence, MutableSequence], dest: bytearray, ctx: InsertContext):
         encoding = None
-        if isinstance(self._first_value(column), str):
+        if not isinstance(self._first_value(column), bytes):
             encoding = ctx.encoding or self.encoding
-        data_conv.write_str_col(column, encoding, dest)
+        data_conv.write_str_col(column, self.nullable, encoding, dest)

     def _active_null(self, ctx):
         if ctx.use_none:
diff --git a/clickhouse_connect/driver/dataconv.py b/clickhouse_connect/driver/dataconv.py
index 29c96a9a..24272725 100644
--- a/clickhouse_connect/driver/dataconv.py
+++ b/clickhouse_connect/driver/dataconv.py
@@ -5,6 +5,7 @@
 from uuid import UUID, SafeUUID

 from clickhouse_connect.driver.common import int_size
+from clickhouse_connect.driver.exceptions import DataError
 from clickhouse_connect.driver.types import ByteSource
 from clickhouse_connect.driver.options import np

@@ -110,14 +111,18 @@ def pivot(data: Sequence[Sequence], start_row: int, end_row: int) -> Sequence[Se
     return tuple(zip(*data[start_row: end_row]))


-def write_str_col(column: Sequence, encoding: Optional[str], dest: bytearray):
+def write_str_col(column: Sequence, nullable: bool, encoding: Optional[str], dest: bytearray):
     app = dest.append
     for x in column:
         if not x:
+            if not nullable and x is None:
+                raise DataError('Invalid None value in non-Nullable column')
             app(0)
         else:
             if encoding:
                 x = x.encode(encoding)
+            else:
+                x = b''
             sz = len(x)
             while True:
                 b = sz & 0x7f
diff --git a/clickhouse_connect/driverc/dataconv.pyx b/clickhouse_connect/driverc/dataconv.pyx
index dab91141..0c9bbc19 100644
--- a/clickhouse_connect/driverc/dataconv.pyx
+++ b/clickhouse_connect/driverc/dataconv.pyx
@@ -19,6 +19,7 @@ from uuid import UUID, SafeUUID
 from libc.string cimport memcpy

 from datetime import tzinfo
+from clickhouse_connect.driver.exceptions import DataError

 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -254,7 +255,7 @@ cdef inline extend_byte_array(target: bytearray, int start, object source, Py_ss

 @cython.boundscheck(False)
 @cython.wraparound(False)
-def write_str_col(column: Sequence, encoding: Optional[str], dest: bytearray):
+def write_str_col(column: Sequence, nullable: bool, encoding: Optional[str], dest: bytearray):
     cdef unsigned long long buff_size = len(column) << 5
     cdef unsigned long long buff_loc = 0, sz = 0, dsz = 0
     cdef unsigned long long array_size = PyByteArray_GET_SIZE(dest)
@@ -263,50 +264,54 @@ def write_str_col(column: Sequence, encoding: Optional[str], dest: bytearray):
     cdef object encoded
     cdef char b
     cdef char * data
-    for x in column:
-        if not x:
-            temp_buff[buff_loc] = 0
-            buff_loc += 1
-            if buff_loc == buff_size:
-                extend_byte_array(dest, array_size, mv, buff_loc)
-                array_size += buff_loc
-                buff_loc = 0
-        else:
-            if not encoding:
-                data = x
-                dsz = len(x)
-            else:
-                encoded = x.encode(encoding)
-                dsz = len(encoded)
-                data = encoded
-            sz = dsz
-            while True:
-                b = sz & 0x7f
-                sz >>= 7
-                if sz != 0:
-                    b |= 0x80
-                temp_buff[buff_loc] = b
+    try:
+        for x in column:
+            if not x:
+                if not nullable and x is None:
+                    raise DataError('Invalid None value in non-Nullable column')
+                temp_buff[buff_loc] = 0
                 buff_loc += 1
                 if buff_loc == buff_size:
                     extend_byte_array(dest, array_size, mv, buff_loc)
                     array_size += buff_loc
                     buff_loc = 0
-            if sz == 0:
-                break
-        if dsz + buff_loc >= buff_size:
-            if buff_loc > 0:  # Write what we have so far
-                extend_byte_array(dest, array_size, mv, buff_loc)
-                array_size += buff_loc
-                buff_loc = 0
-            if (dsz << 4) > buff_size:  # resize our buffer for very large strings
-                PyMem_Free(<void *> temp_buff)
-                mv.release()
-                buff_size = dsz << 6
-                temp_buff = <char *> PyMem_Malloc(<size_t> buff_size)
-                mv = PyMemoryView_FromMemory(temp_buff, buff_size, PyBUF_READ)
-            memcpy(temp_buff + buff_loc, data, dsz)
-            buff_loc += dsz
-    if buff_loc > 0:
-        extend_byte_array(dest, array_size, mv, buff_loc)
-    mv.release()
-    PyMem_Free(temp_buff)
+            else:
+                if not encoding:
+                    data = x
+                    dsz = len(x)
+                else:
+                    encoded = x.encode(encoding)
+                    dsz = len(encoded)
+                    data = encoded
+                sz = dsz
+                while True:
+                    b = sz & 0x7f
+                    sz >>= 7
+                    if sz != 0:
+                        b |= 0x80
+                    temp_buff[buff_loc] = b
+                    buff_loc += 1
+                    if buff_loc == buff_size:
+                        extend_byte_array(dest, array_size, mv, buff_loc)
+                        array_size += buff_loc
+                        buff_loc = 0
+                    if sz == 0:
+                        break
+                if dsz + buff_loc >= buff_size:
+                    if buff_loc > 0:  # Write what we have so far
+                        extend_byte_array(dest, array_size, mv, buff_loc)
+                        array_size += buff_loc
+                        buff_loc = 0
+                    if (dsz << 4) > buff_size:  # resize our buffer for very large strings
+                        PyMem_Free(<void *> temp_buff)
+                        mv.release()
+                        buff_size = dsz << 6
+                        temp_buff = <char *> PyMem_Malloc(<size_t> buff_size)
+                        mv = PyMemoryView_FromMemory(temp_buff, buff_size, PyBUF_READ)
+                    memcpy(temp_buff + buff_loc, data, dsz)
+                    buff_loc += dsz
+        if buff_loc > 0:
+            extend_byte_array(dest, array_size, mv, buff_loc)
+    finally:
+        mv.release()
+        PyMem_Free(temp_buff)
diff --git a/tests/integration_tests/test_pandas.py b/tests/integration_tests/test_pandas.py
index 325a4245..4d30fa17 100644
--- a/tests/integration_tests/test_pandas.py
+++ b/tests/integration_tests/test_pandas.py
@@ -73,7 +73,7 @@ def test_pandas_csv(test_client: Client, table_context: Callable):
 key2,6666,,string2,,
 """
     csv_file = StringIO(csv)
-    df = pd.read_csv(csv_file, parse_dates=['dt', 'd'], date_parser=pd.Timestamp)
+    df = pd.read_csv(csv_file, parse_dates=['dt', 'd'])
     df = df[['num', 'flt']].astype('Float32')
     source_df = df.copy()
     with table_context('test_pandas_csv', null_ds_columns, null_ds_types):
@@ -258,3 +258,16 @@ def test_pandas_row_df(test_client: Client, table_context:Callable):
     assert result_df.iloc[0]['dt'] == pd.Timestamp(2023, 10, 15, 14, 50, 2, 4038)
     assert len(result_df) == 1
     assert source_df.equals(df)
+
+
+def test_pandas_null_strings(test_client: Client, table_context:Callable):
+    with table_context('test_pandas_null_strings', ['id String', 'test_col LowCardinality(String)']):
+        row = {'id': 'id', 'test_col': None}
+        df = pd.DataFrame([row])
+        assert df['test_col'].isnull().values.all()
+        with pytest.raises(DataError):
+            test_client.insert_df('test_pandas_null_strings', df)
+        row2 = {'id': 'id2', 'test_col': 'val'}
+        df = pd.DataFrame([row, row2])
+        with pytest.raises(DataError):
+            test_client.insert_df('test_pandas_null_strings', df)
diff --git a/tests/integration_tests/test_sqlalchemy/test_basics.py b/tests/integration_tests/test_sqlalchemy/test_basics.py
index 09d9369b..5a6dec79 100644
--- a/tests/integration_tests/test_sqlalchemy/test_basics.py
+++ b/tests/integration_tests/test_sqlalchemy/test_basics.py
@@ -44,16 +44,16 @@ def test_cursor(test_engine: Engine):

 def test_execute(test_engine: Engine):
     common.set_setting('invalid_setting_action', 'drop')
-    connection = test_engine.connect()
-    sql = test_query
-    if not connection.connection.connection.client.min_version('21'):
-        sql = test_query_ver19
-    rows = list(row for row in connection.execute(sql))
-    assert len(rows) == 2
+    with test_engine.begin() as conn:
+        sql = test_query
+        if not conn.connection.connection.client.min_version('21'):
+            sql = test_query_ver19
+        rows = list(row for row in conn.execute(sql))
+        assert len(rows) == 2

-    rows = list(row for row in connection.execute('DROP TABLE IF EXISTS dummy_table'))
-    assert rows[0][0] == 0
+        rows = list(row for row in conn.execute('DROP TABLE IF EXISTS dummy_table'))
+        assert rows[0][0] == 0

-    rows = list(row for row in connection.execute('describe TABLE system.columns'))
-    assert len(rows) > 5
+        rows = list(row for row in conn.execute('describe TABLE system.columns'))
+        assert len(rows) > 5

From cea89f1cd18f49f32ca69a750578e1255f08121e Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Tue, 7 Nov 2023 15:02:50 -0700
Subject: [PATCH 2/2] Handle spaces in Tuple element names.

---
 CHANGELOG.md                              |  8 ++++++--
 clickhouse_connect/datatypes/container.py |  4 ++--
 clickhouse_connect/driver/parser.py       | 12 ++++++------
 clickhouse_connect/driver/query.py        |  4 ++--
 clickhouse_connect/tools/testing.py       |  6 +++---
 examples/clear_test_databases.py          | 22 ++++++++++++++++++++++
 tests/integration_tests/test_inserts.py   | 11 +++++++++++
 tests/integration_tests/test_native.py    | 17 ++++++++++-------
 tests/unit_tests/test_chtypes.py          |  4 ++--
 9 files changed, 64 insertions(+), 24 deletions(-)
 create mode 100644 examples/clear_test_databases.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fa8eb5c6..e419c87e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,11 +14,15 @@ In any case, this should not affect the basic usage of Superset with ClickHouse
 your Superset installation, the ClickHouse datasource will be available with either the enhanced connection dialog or
 a standard SqlAlchemy DSN in the form of `clickhousedb://{username}:{password}@{host}:{port}`.

-## 0.6.19, TBD
-### Bug Fix
+## 0.6.19, 2023-11-07
+### Bug Fixes
 - In some circumstances it was possible to insert a `None` value into a non-Nullable String column. As this could mask
 invalid input data, any attempt to insert None into a non-Nullable String or LowCardinality(String) will now throw
 a DataError
+- Reading a named Tuple column where the Tuple element names contained spaces would fail. In particular this would
+cause unexpected failures reading the experimental JSON column type with spaces in the keys. This has been fixed. Closes
+https://github.com/ClickHouse/clickhouse-connect/issues/265. Note that handling spaces in column types is tricky and
+fragile in several respects, so the best approach remains to use simple column names without spaces.

 ## 0.6.18, 2023-10-25
 ### Bug Fixes
diff --git a/clickhouse_connect/datatypes/container.py b/clickhouse_connect/datatypes/container.py
index 0281867b..36e4c237 100644
--- a/clickhouse_connect/datatypes/container.py
+++ b/clickhouse_connect/datatypes/container.py
@@ -3,7 +3,7 @@
 from typing import Sequence, Collection

 from clickhouse_connect.driver.insert import InsertContext
-from clickhouse_connect.driver.query import QueryContext
+from clickhouse_connect.driver.query import QueryContext, quote_identifier
 from clickhouse_connect.driver.types import ByteSource
 from clickhouse_connect.json_impl import any_to_json
 from clickhouse_connect.datatypes.base import ClickHouseType, TypeDef
@@ -94,7 +94,7 @@ def __init__(self, type_def: TypeDef):
         self.element_names = type_def.keys
         self.element_types = [get_from_name(name) for name in type_def.values]
         if self.element_names:
-            self._name_suffix = f"({', '.join(k + ' ' + str(v) for k, v in zip(type_def.keys, type_def.values))})"
+            self._name_suffix = f"({', '.join(quote_identifier(k) + ' ' + str(v) for k, v in zip(type_def.keys, type_def.values))})"
         else:
             self._name_suffix = type_def.arg_str
diff --git a/clickhouse_connect/driver/parser.py b/clickhouse_connect/driver/parser.py
index a158e7f9..acdf9510 100644
--- a/clickhouse_connect/driver/parser.py
+++ b/clickhouse_connect/driver/parser.py
@@ -130,13 +130,13 @@ def parse_columns(expr: str):
     named = False
     level = 0
     label = ''
-    in_str = False
+    quote = None
     while True:
         char = expr[pos]
         pos += 1
-        if in_str:
-            if "'" == char:
-                in_str = False
+        if quote:
+            if char == quote:
+                quote = None
             elif char == '\\' and expr[pos] == "'" and expr[pos:pos + 4] != "' = " and expr[pos:pos + 2] != "')":
                 label += expr[pos]
                 pos += 1
@@ -156,8 +156,8 @@ def parse_columns(expr: str):
             elif char == ')':
                 columns.append(label)
                 break
-        if char == "'" and (not label or 'Enum' in label):
-            in_str = True
+        if char in ("'", '`') and (not label or 'Enum' in label):
+            quote = char
         elif char == '(':
             level += 1
         elif char == ')':
diff --git a/clickhouse_connect/driver/query.py b/clickhouse_connect/driver/query.py
index 0b5086ae..c4a48aaf 100644
--- a/clickhouse_connect/driver/query.py
+++ b/clickhouse_connect/driver/query.py
@@ -341,7 +341,7 @@ def close(self):


 BS = '\\'
-must_escape = (BS, '\'')
+must_escape = (BS, '\'', '`')


 def quote_identifier(identifier: str):
@@ -349,7 +349,7 @@ def quote_identifier(identifier: str):
     if first_char in ('`', '"') and identifier[-1] == first_char:
         # Identifier is already quoted, assume that it's valid
         return identifier
-    return f'`{identifier}`'
+    return f'`{escape_str(identifier)}`'


 def finalize_query(query: str, parameters: Optional[Union[Sequence, Dict[str, Any]]],
diff --git a/clickhouse_connect/tools/testing.py b/clickhouse_connect/tools/testing.py
index f30b3f75..43517096 100644
--- a/clickhouse_connect/tools/testing.py
+++ b/clickhouse_connect/tools/testing.py
@@ -1,7 +1,7 @@
 from typing import Sequence, Optional, Union, Dict, Any

 from clickhouse_connect.driver import Client
-from clickhouse_connect.driver.query import format_query_value
+from clickhouse_connect.driver.query import format_query_value, quote_identifier


 class TableContext:
@@ -29,14 +29,14 @@ def __init__(self, client: Client,
         self.column_names = columns
         self.column_types = column_types
         self.engine = engine
-        self.order_by = self.column_names[0] if order_by is None else order_by
+        self.order_by = quote_identifier(self.column_names[0]) if order_by is None else order_by

     def __enter__(self):
         if self.client.min_version('19'):
             self.client.command(f'DROP TABLE IF EXISTS {self.table}')
         else:
             self.client.command(f'DROP TABLE IF EXISTS {self.table} SYNC')
-        col_defs = ','.join(f'{name} {col_type}' for name, col_type in zip(self.column_names, self.column_types))
+        col_defs = ','.join(f'{quote_identifier(name)} {col_type}' for name, col_type in zip(self.column_names, self.column_types))
         create_cmd = f'CREATE TABLE {self.table} ({col_defs}) ENGINE {self.engine} ORDER BY {self.order_by}'
         if self.settings:
             create_cmd += ' SETTINGS '
diff --git a/examples/clear_test_databases.py b/examples/clear_test_databases.py
new file mode 100644
index 00000000..ddbb9410
--- /dev/null
+++ b/examples/clear_test_databases.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3 -u
+
+import os
+
+import clickhouse_connect
+
+
+def main():
+    host = os.getenv('CLICKHOUSE_CONNECT_TEST_HOST', 'localhost')
+    port = int(os.getenv('CLICKHOUSE_CONNECT_TEST_PORT', '8123'))
+    password = os.getenv('CLICKHOUSE_CONNECT_TEST_PASSWORD', '')
+    client = clickhouse_connect.get_client(host=host, port=port, password=password)
+    database_result = client.query("SELECT name FROM system.databases WHERE name ilike '%test%'").result_rows
+    for database_row in database_result:
+        database:str = database_row[0]
+        if database.startswith('dbt_clickhouse') or database.startswith('clickhouse_connect'):
+            print(f'DROPPING DATABASE `{database}`')
+            client.command(f'DROP DATABASE IF EXISTS {database}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/integration_tests/test_inserts.py b/tests/integration_tests/test_inserts.py
index 6a59bd32..d46ef91a 100644
--- a/tests/integration_tests/test_inserts.py
+++ b/tests/integration_tests/test_inserts.py
@@ -52,3 +52,14 @@ def test_low_card_dictionary_size(test_client: Client, table_context: Callable):
     data = [[x, str(x)] for x in range(30000)]
     test_client.insert('test_low_card_dict', data)
     assert 30000 == test_client.command('SELECT count() FROM test_low_card_dict')
+
+
+def test_column_names_spaces(test_client: Client, table_context: Callable):
+    with table_context('test_column_spaces',
+                       columns=['key 1', 'value 1'],
+                       column_types=['Int32', 'String']):
+        data = [[1, 'str 1'], [2, 'str 2']]
+        test_client.insert('test_column_spaces', data)
+        result = test_client.query('SELECT * FROM test_column_spaces').result_rows
+        assert result[0][0] == 1
+        assert result[1][1] == 'str 2'
diff --git a/tests/integration_tests/test_native.py b/tests/integration_tests/test_native.py
index 81a0f73d..af506619 100644
--- a/tests/integration_tests/test_native.py
+++ b/tests/integration_tests/test_native.py
@@ -57,7 +57,7 @@ def test_json(test_client: Client, table_context: Callable):
     jv1 = {'key1': 337, 'value.2': 'vvvv', 'HKD@spéçiäl': 'Special K', 'blank': 'not_really_blank'}
     jv3 = {'key3': 752, 'value.2': 'v2_rules', 'blank': None}
     njv2 = {'nk1': -302, 'nk2': {'sub1': 372, 'sub2': 'a string'}}
-    njv3 = {'nk1': 5832.44, 'nk2': {'sub1': 47788382, 'sub2':'sub2val', 'sub3': 'sub3str'}}
+    njv3 = {'nk1': 5832.44, 'nk2': {'sub1': 47788382, 'sub2':'sub2val', 'sub3': 'sub3str', 'space key': 'spacey'}}
     test_client.insert('native_json_test', [
         [5, jv1, -44, None],
         [20, None, 5200, njv2],
@@ -67,15 +67,18 @@ def test_json(test_client: Client, table_context: Callable):
     json1 = result.result_set[0][1]
     assert json1['HKD@spéçiäl'] == 'Special K'
     assert json1['key3'] == 0
+    json2 = result.result_set[1][3]
+    assert json2['nk1'] == -302.0
+    assert json2['nk2']['sub2'] == 'a string'
+    assert json2['nk2']['sub3'] is None
     json3 = result.result_set[2][1]
     assert json3['value.2'] == 'v2_rules'
     assert json3['blank'] == ''
     assert json3['key1'] == 0
     assert json3['key3'] == 752
-    json2 = result.result_set[1][3]
-    assert json2['nk1'] == -302.0
-    assert json2['nk2']['sub2'] == 'a string'
-    assert json2['nk2']['sub3'] is None
+    null_json3 = result.result_set[2][3]
+    assert null_json3['nk2']['space key'] == 'spacey'
+
     set_write_format('JSON', 'string')
     test_client.insert('native_json_test', [[999, '{"key4": 283, "value.2": "str_value"}', 77, '{"nk1":53}']])
     result = test_client.query('SELECT value.key4, null_value.nk1 FROM native_json_test ORDER BY key')
@@ -148,13 +151,13 @@ def test_read_formats(test_client: Client, test_table_engine: str):


 def test_tuple_inserts(test_client: Client, table_context: Callable):
-    with table_context('insert_tuple_test', ['key Int32', 'named Tuple(fl Float64, ns Nullable(String))',
+    with table_context('insert_tuple_test', ['key Int32', 'named Tuple(fl Float64, `ns space` Nullable(String))',
                                              'unnamed Tuple(Float64, Nullable(String))']):
         data = [[1, (3.55, 'str1'), (555, None)], [2, (-43.2, None), (0, 'str2')]]
         result = test_client.insert('insert_tuple_test', data)
         assert 2 == result.written_rows

-        data = [[1, {'fl': 3.55, 'ns': 'str1'}, (555, None)], [2, {'fl': -43.2}, (0, 'str2')]]
+        data = [[1, {'fl': 3.55, 'ns space': 'str1'}, (555, None)], [2, {'fl': -43.2}, (0, 'str2')]]
         result = test_client.insert('insert_tuple_test', data)
         assert 2 == result.written_rows
diff --git a/tests/unit_tests/test_chtypes.py b/tests/unit_tests/test_chtypes.py
index 140cdb1d..d577b090 100644
--- a/tests/unit_tests/test_chtypes.py
+++ b/tests/unit_tests/test_chtypes.py
@@ -41,5 +41,5 @@ def test_nested_parse():
 def test_named_tuple():
     tuple_type = gfn('Tuple(Int64, String)')
     assert tuple_type.name == 'Tuple(Int64, String)'
-    tuple_type = gfn('Tuple(key Int64, value String)')
-    assert tuple_type.name == 'Tuple(key Int64, value String)'
+    tuple_type = gfn('Tuple(`key` Int64, `value` String)')
+    assert tuple_type.name == 'Tuple(`key` Int64, `value` String)'
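
A short, hedged illustration of the first patch's client-visible behavior (not part of the committed diffs): as of 0.6.19, inserting a Python None into a non-Nullable String or LowCardinality(String) column raises DataError instead of silently writing an empty value. The table name and local server details below are assumptions made for the sketch.

    import clickhouse_connect
    from clickhouse_connect.driver.exceptions import DataError

    # Assumes a ClickHouse server reachable on localhost:8123; 'none_demo' is a
    # hypothetical table whose 'val' column is a non-Nullable String
    client = clickhouse_connect.get_client(host='localhost')
    client.command('CREATE TABLE IF NOT EXISTS none_demo (id String, val String) '
                   'ENGINE MergeTree ORDER BY id')
    try:
        # The None in the second row targets the non-Nullable 'val' column
        client.insert('none_demo', [['1', 'ok'], ['2', None]])
    except DataError as ex:
        print(f'insert rejected: {ex}')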
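A similar sketch for the second patch's identifier quoting: per the query.py diff above, quote_identifier wraps an unquoted name in backticks (escaping any embedded backticks via the new must_escape entry), while an already-quoted name passes through unchanged. This is what allows Tuple element names and test table columns to contain spaces.

    from clickhouse_connect.driver.query import quote_identifier

    print(quote_identifier('space key'))   # -> `space key`
    print(quote_identifier('`quoted`'))    # already quoted, returned unchanged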