Skip to content

Commit

Permalink
[Importer] Update Iceberg table syntax for Impala (#3887)
Browse files: browse the repository at this point in the history
  • Loading branch information
agl29 authored Nov 26, 2024
1 parent 7489ce0 commit a8a8585
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 4 deletions.
1 change: 1 addition & 0 deletions desktop/libs/indexer/src/indexer/indexers/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ def create_table_from_a_file(self, source, destination, start_time=-1, file_enco
extra_create_properties = "STORED BY ICEBERG\n"
elif source_type == 'impala':
file_format = 'ICEBERG'
external = False
extra_create_properties += 'STORED AS %(file_format)s' % {'file_format': file_format}
if is_transactional:
extra_create_properties += "\nTBLPROPERTIES('transactional'='true', 'transactional_properties'='%s')" % \
Expand Down
102 changes: 98 additions & 4 deletions desktop/libs/indexer/src/indexer/indexers/sql_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1279,7 +1279,7 @@ def test_generate_create_avro_table():


@pytest.mark.django_db
def test_generate_create_iceberg_table():
def test_generate_create_hive_iceberg_table():
source = json.loads(
'''{"sourceType": "hive", "name":"","sample":[["Bank Of America","3000000.0","US","Miami","37.6801986694",'''
'''"-121.92150116"],["Citi Bank","2800000.0","US","Richmond","37.5242004395","-77.4932022095"],["Deutsche Bank","2600000.0","US",'''
Expand Down Expand Up @@ -1336,7 +1336,7 @@ def test_generate_create_iceberg_table():
'''"text","name":"Text"},{"value":"parquet","name":"Parquet"},{"value":"kudu","name":"Kudu"},{"value":"csv","name":"Csv"},'''
'''{"value":"avro","name":"Avro"},{"value":"json","name":"Json"},{"value":"regexp","name":"Regexp"},{"value":"orc",'''
'''"name":"ORC"}],"partitionColumns":[],"kuduPartitionColumns":[],"primaryKeys":[],"primaryKeyObjects":[],"importData":true,'''
'''"isIceberg":true,"useCopy":false,"useDefaultLocation":true,"nonDefaultLocation":"/user/hue/data/query-hive-360.csv",'''
'''"isIceberg":true,"useCopy":false,"useDefaultLocation":false,"nonDefaultLocation":"/user/hue/data/query-hive-360.csv",'''
'''"hasHeader":true,"useCustomDelimiters":false,"customFieldDelimiter":",","customCollectionDelimiter":"\\\\002",'''
'''"customMapDelimiter":"\\\\003","customRegexp":""}'''
)
Expand All @@ -1346,7 +1346,6 @@ def test_generate_create_iceberg_table():

sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination).get_str()

print(sql)
assert '''USE default;''' in sql, sql

statement = '''CREATE EXTERNAL TABLE IF NOT EXISTS `default`.`hue__tmp_parquet_table`
Expand All @@ -1366,7 +1365,7 @@ def test_generate_create_iceberg_table():
assert statement in sql, sql

assert (
'''CREATE TABLE `default`.`parquet_table`
'''CREATE EXTERNAL TABLE `default`.`parquet_table`
STORED BY ICEBERG
STORED AS parquet
AS SELECT *
Expand All @@ -1378,6 +1377,101 @@ def test_generate_create_iceberg_table():
assert '''DROP TABLE IF EXISTS `default`.`hue__tmp_parquet_table`;''' in sql, sql


# Checks the SQL that SQLIndexer.create_table_from_a_file() generates when a CSV file is
# imported into an Iceberg table with "sourceType": "impala".
# The generated script is expected to contain, in any position:
#   1. `USE default;`
#   2. a temporary external text table (`hue__tmp_parquet_table`) declared over the CSV directory,
#   3. a CREATE TABLE ... AS SELECT into the final table using `STORED AS ICEBERG`,
#   4. a DROP of the temporary table.
@pytest.mark.django_db
def test_generate_create_impala_iceberg_table():
# Source descriptor: a CSV file ("inputFormat": "file", "hasHeader": true) at
# /user/hue/data/query-impala-360.csv, with six sampled columns (three string, three double).
source = json.loads(
'''{"sourceType": "impala", "name":"","sample":[["Bank Of America","3000000.0","US","Miami","37.6801986694",'''
'''"-121.92150116"],["Citi Bank","2800000.0","US","Richmond","37.5242004395","-77.4932022095"],["Deutsche Bank","2600000.0","US",'''
'''"Corpus Christi","40.7807998657","-73.9772033691"],["Thomson Reuters","2400000.0","US","Albany","35.7976989746",'''
'''"-78.6252975464"],'''
'''["OpenX","2200000.0","US","Des Moines","40.5411987305","-119.586898804"]],"sampleCols":[{"operations":[],"comment":"",'''
'''"nested":[],'''
'''"name":"acct_client","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,'''
'''"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],'''
'''"comment":"","nested":[],"name":"tran_amount","level":0,"keyType":"string","required":false,"precision":10,"keep":true,'''
'''"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double",'''
'''"showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_country_cd","level":0,"keyType":'''
'''"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,'''
'''"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city",'''
'''"level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"",'''
'''"multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],'''
'''"name":"vrfcn_city_lat","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,'''
'''"length":100,'''
'''"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,"scale":0},{"operations":[],'''
'''"comment":"","nested":[],"name":"vrfcn_city_lon","level":0,"keyType":"string","required":false,"precision":10,"keep":true,'''
'''"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":false,'''
'''"scale":0}],"inputFormat":"file","inputFormatsAll":[{"value":"file","name":"File"},{"value":"manual","name":"Manually"},'''
'''{"value":"query","name":"SQL Query"},{"value":"table","name":"Table"}],"inputFormatsManual":[{"value":"manual","name":'''
'''"Manually"}],"inputFormats":[{"value":"file","name":"File"},{"value":"manual","name":"Manually"},{"value":"query","name":'''
'''"SQL Query"},{"value":"table","name":"Table"}],"path":"/user/hue/data/query-impala-360.csv","isObjectStore":false,"table":"",'''
'''"tableName":"","databaseName":"default","apiHelperType":"impala","query":"","draggedQuery":"","format":{"type":"csv",'''
'''"fieldSeparator":",","recordSeparator":"\\n","quoteChar":"\\"","hasHeader":true,"status":0},"show":true,"defaultName":'''
'''"default.query-impala-360"}'''
)
# Destination descriptor: table `default`.`parquet_table`, non-transactional, with
# "isIceberg": true and "tableFormat": "parquet". "useDefaultLocation" is false and
# "nonDefaultLocation" points at the same CSV path as the source.
destination = json.loads(
'''{"isTransactional": false, "isInsertOnly": false, "sourceType": "impala", "name":"default.parquet_table"'''
''',"apiHelperType":"impala","description":"","outputFormat":"table","outputFormatsList":[{"name":"Table","value":"table"},'''
'''{"name":"Solr index","value":"index"},{"name":"File","value":"file"},{"name":"Database","value":"database"}],'''
'''"outputFormats":[{"name":"Table","value":"table"},{"name":"Solr index","value":"index"}],"columns":[{"operations":[],'''
'''"comment":"","nested":[],"name":"acct_client","level":0,"keyType":"string","required":false,"precision":10,"keep":true,'''
'''"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":'''
'''false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_amount","level":0,"keyType":"string","required":false,'''
'''"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":'''
'''"double","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"tran_country_cd","level":0,'''
'''"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"",'''
'''"multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],"comment":"","nested":'''
'''[],"name":"vrfcn_city","level":0,"keyType":"string","required":false,"precision":10,"keep":true,"isPartition":false,"length":'''
'''100,"partitionValue":"","multiValued":false,"unique":false,"type":"string","showProperties":false,"scale":0},{"operations":[],'''
'''"comment":"","nested":[],"name":"vrfcn_city_lat","level":0,"keyType":"string","required":false,"precision":10,"keep":true,'''
'''"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,"type":"double","showProperties":'''
'''false,"scale":0},{"operations":[],"comment":"","nested":[],"name":"vrfcn_city_lon","level":0,"keyType":"string","required":'''
'''false,"precision":10,"keep":true,"isPartition":false,"length":100,"partitionValue":"","multiValued":false,"unique":false,'''
'''"type":"double","showProperties":false,"scale":0}],"bulkColumnNames":"acct_client,tran_amount,tran_country_cd,vrfcn_city,'''
'''vrfcn_city_lat,vrfcn_city_lon","showProperties":false,"isTargetExisting":false,"isTargetChecking":false,"existingTargetUrl":'''
'''"","tableName":"parquet_table","databaseName":"default","tableFormat":"parquet","KUDU_DEFAULT_RANGE_PARTITION_COLUMN":'''
'''{"values":[{"value":""}],"name":"VALUES","lower_val":0,"include_lower_val":"<=","upper_val":1,"include_upper_val":"<="},'''
'''"KUDU_DEFAULT_PARTITION_COLUMN":{"columns":[],"range_partitions":[{"values":[{"value":""}],"name":"VALUES","lower_val":0,'''
'''"include_lower_val":"<=","upper_val":1,"include_upper_val":"<="}],"name":"HASH","int_val":16},"tableFormats":[{"value":'''
'''"text","name":"Text"},{"value":"parquet","name":"Parquet"},{"value":"kudu","name":"Kudu"},{"value":"csv","name":"Csv"},'''
'''{"value":"avro","name":"Avro"},{"value":"json","name":"Json"},{"value":"regexp","name":"Regexp"},{"value":"orc",'''
'''"name":"ORC"}],"partitionColumns":[],"kuduPartitionColumns":[],"primaryKeys":[],"primaryKeyObjects":[],"importData":true,'''
'''"isIceberg":true,"useCopy":false,"useDefaultLocation":false,"nonDefaultLocation":"/user/hue/data/query-impala-360.csv",'''
'''"hasHeader":true,"useCustomDelimiters":false,"customFieldDelimiter":",","customCollectionDelimiter":"\\\\002",'''
'''"customMapDelimiter":"\\\\003","customRegexp":""}'''
)

# Canned filesystem answers handed to MockFs. NOTE(review): MockFs is defined outside this
# view; presumably these keys stand in for isdir()/split()/listdir() results -- confirm.
path = {'isDir': False, 'split': ('/user/hue/data', 'query-impala-360.csv'), 'listdir': ['/user/hue/data']}
request = MockRequest(fs=MockFs(path=path))

# Generate the full import script as one string; every assertion below is a substring check
# and passes `sql` as the assertion message so a failure prints the generated script.
sql = SQLIndexer(user=request.user, fs=request.fs).create_table_from_a_file(source, destination).get_str()

assert '''USE default;''' in sql, sql

# Staging step: an external text table over the CSV's parent directory ('/user/hue/data'),
# skipping one header line.
statement = '''CREATE EXTERNAL TABLE IF NOT EXISTS `default`.`hue__tmp_parquet_table`
(
`acct_client` string ,
`tran_amount` double ,
`tran_country_cd` string ,
`vrfcn_city` string ,
`vrfcn_city_lat` double ,
`vrfcn_city_lon` double ) ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TextFile LOCATION '/user/hue/data'
TBLPROPERTIES('skip.header.line.count'='1', 'transactional'='false')
;'''
assert statement in sql, sql

# Final step for Impala: a plain `CREATE TABLE` (no EXTERNAL keyword) with `STORED AS ICEBERG`.
# The destination's "tableFormat" ("parquet") does not appear in the expected statement.
assert (
'''CREATE TABLE `default`.`parquet_table`
STORED AS ICEBERG
AS SELECT *
FROM `default`.`hue__tmp_parquet_table`;'''
in sql
), sql

# Cleanup step: the staging table must be dropped at the end of the script.
assert '''DROP TABLE IF EXISTS `default`.`hue__tmp_parquet_table`;''' in sql, sql


@pytest.mark.django_db
def test_generate_create_orc_table_transactional():
source = json.loads(
Expand Down

0 comments on commit a8a8585

Please sign in to comment.