Update scalardb-analytics-spark-sample to support 3.14 #75

Merged · 3 commits · Nov 21, 2024
1 change: 1 addition & 0 deletions scalardb-analytics-spark-sample/.gitignore
@@ -0,0 +1 @@
+.scala_history
1 change: 0 additions & 1 deletion scalardb-analytics-spark-sample/cert.pem

This file was deleted.

118 changes: 60 additions & 58 deletions scalardb-analytics-spark-sample/docker-compose.yml
@@ -1,50 +1,50 @@
 services:
-  spark-shell:
+  spark-sql:
     build:
       context: ./docker
       dockerfile: Dockerfile.spark
     volumes:
       - ./scalardb.properties:/etc/scalardb.properties
-      - ./cert.pem:/etc/cert.pem
-      - .scala_history_jline3:/root/.scala_history_jline3
+      - ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
+      - .scala_history:/root/.scala_history
     networks:
       - scalar-network
     profiles:
       - dev
     depends_on:
-      - backend-postgres
-      - backend-cassandra
-      - backend-dynamodb
+      - scalardb-cassandra
+      - scalardb-mysql
+      - postgres
    command:
-      - "/opt/spark/bin/spark-shell"
+      - "/opt/spark/bin/spark-sql"
       - "--packages"
-      - "com.scalar-labs:scalardb-analytics-spark-3.5_2.12:3.12.0"
+      - "com.scalar-labs:scalardb-analytics-spark-all-3.5_2.12:3.14.0"
 
-  backend-postgres:
-    image: postgres:15.1
-    ports:
-      - "5432"
+  sample-data-loader:
+    build:
+      context: sample-data-loader
+      dockerfile: Dockerfile
     volumes:
-      - backend-postgres-data:/var/lib/postgresql/data
-    environment:
-      - POSTGRES_USER=postgres
-      - POSTGRES_PASSWORD=postgres
-      - POSTGRES_DB=test
+      - ./scalardb.properties:/etc/scalardb.properties
+      - ./schema.json:/etc/schema.json
+      - ./data:/data
+    working_dir: /sample-data-loader
     networks:
       - scalar-network
-    healthcheck:
-      test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
-      interval: 1s
-      timeout: 1s
-      retries: 10
-      start_period: 1s
+    profiles:
+      - dev
+    depends_on:
+      - scalardb-cassandra
+      - scalardb-mysql
+      - postgres
+    command: ["java", "-jar", "/app.jar"]
 
-  backend-cassandra:
+  scalardb-cassandra:
     image: cassandra:3.11
     ports:
-      - "9042"
+      - 9042
     volumes:
-      - backend-cassandra-data:/var/lib/cassandra
+      - scalardb-cassandra-data:/var/lib/cassandra
     environment:
       - CASSANDRA_DC=dc1
       - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
@@ -55,50 +55,52 @@ services:
       interval: 1s
       timeout: 1s
       retries: 10
-      start_period: 5s
+      start_period: 10s
 
-  backend-dynamodb:
-    image: amazon/dynamodb-local:1.21.0
+  scalardb-mysql:
+    image: mysql:8.0
     ports:
-      - "8000"
-    command:
-      [
-        "-jar",
-        "DynamoDBLocal.jar",
-        "-sharedDb",
-        "-dbPath",
-        "/home/dynamodblocal",
-        "-optimizeDbBeforeStartup",
-      ]
+      - 3306
     volumes:
-      - backend-dynamodb-data:/home/dynamodblocal
+      - scalardb-mysql-data:/var/lib/mysql
+    environment:
+      - MYSQL_ROOT_PASSWORD=mysql
+      - MYSQL_DATABASE=sampledb
     networks:
       - scalar-network
+    healthcheck:
+      test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root"]
+      interval: 1s
+      timeout: 1s
+      retries: 10
+      start_period: 5s
 
-  sample-data-loader:
-    build:
-      context: sample-data-loader
-      dockerfile: Dockerfile
+  postgres:
+    image: postgres:15.1
+    ports:
+      - 5432
     volumes:
-      - ./scalardb.properties:/etc/scalardb.properties
-      - ./schema.json:/etc/schema.json
-      - ./data:/data
-    working_dir: /sample-data-loader
+      - postgres-data:/var/lib/postgresql/data
+      - ./data/customer.csv:/opt/customer.csv
+      - ./sql/postgres_copy.sql:/docker-entrypoint-initdb.d/postgres_copy.sql
+    environment:
+      - POSTGRES_USER=postgres
+      - POSTGRES_PASSWORD=postgres
+      - POSTGRES_DB=sampledb
     networks:
       - scalar-network
-    profiles:
-      - dev
-    depends_on:
-      - backend-postgres
-      - backend-cassandra
-      - backend-dynamodb
-    command: ["java", "-jar", "/app.jar"]
+    healthcheck:
+      test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
+      interval: 1s
+      timeout: 1s
+      retries: 10
+      start_period: 5s
 
 volumes:
   analytics-data: {}
-  backend-postgres-data: {}
-  backend-cassandra-data: {}
-  backend-dynamodb-data: {}
+  scalardb-cassandra-data: {}
+  scalardb-mysql-data: {}
+  postgres-data: {}
 
 networks:
   scalar-network: {}
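
Note on usage: with this change the sample's interactive entry point is the spark-sql service (replacing spark-shell), which fetches the analytics connector at startup via --packages. Assuming the compose file above, the shell would typically be started with a command along the lines of: docker compose --profile dev run --rm spark-sql. The exact invocation is an assumption, since the sample's README is not part of this diff.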
2 changes: 1 addition & 1 deletion scalardb-analytics-spark-sample/docker/Dockerfile.spark
@@ -3,7 +3,7 @@ FROM eclipse-temurin:17-jre-jammy
 
 WORKDIR /work
 
-ENV SPARK_VERSION 3.5.1
+ENV SPARK_VERSION 3.5.3
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
scalardb-analytics-spark-sample/sample-data-loader/build.gradle.kts
@@ -1,6 +1,6 @@
 plugins {
     application
-    id("com.github.johnrengelman.shadow") version "7.1.2"
+    id("com.gradleup.shadow") version "8.3.5"
     id("com.diffplug.spotless") version "6.24.0"
 }
 
@@ -9,8 +9,8 @@ repositories {
 }
 
 dependencies {
-    implementation("com.scalar-labs:scalardb:3.12.1")
-    implementation("com.scalar-labs:scalardb-schema-loader:3.12.1")
+    implementation("com.scalar-labs:scalardb:3.14.0")
+    implementation("com.scalar-labs:scalardb-schema-loader:3.14.0")
     implementation("org.apache.commons:commons-csv:1.10.0")
 
     implementation("io.netty:netty-transport-native-epoll:4.1.99.Final:linux-x86_64")
scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
scalardb-analytics-spark-sample/sample-data-loader/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,6 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
 networkTimeout=10000
 validateDistributionUrl=true
 zipStoreBase=GRADLE_USER_HOME
7 changes: 5 additions & 2 deletions scalardb-analytics-spark-sample/sample-data-loader/gradlew
@@ -15,6 +15,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# SPDX-License-Identifier: Apache-2.0
+#
 
 ##############################################################################
 #
@@ -55,7 +57,7 @@
 # Darwin, MinGW, and NonStop.
 #
 # (3) This script is generated from the Groovy template
-#       https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+#       https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
 #       within the Gradle project.
 #
 # You can find Gradle at https://github.com/gradle/gradle/.
@@ -84,7 +86,8 @@ done
 # shellcheck disable=SC2034
 APP_BASE_NAME=${0##*/}
 # Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
-APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
+APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s
+' "$PWD" ) || exit
 
 # Use the maximum available, or set MAX_FD != -1 to use that value.
 MAX_FD=maximum
22 changes: 12 additions & 10 deletions scalardb-analytics-spark-sample/sample-data-loader/gradlew.bat
@@ -13,6 +13,8 @@
 @rem See the License for the specific language governing permissions and
 @rem limitations under the License.
 @rem
+@rem SPDX-License-Identifier: Apache-2.0
+@rem
 
 @if "%DEBUG%"=="" @echo off
 @rem ##########################################################################
@@ -43,11 +45,11 @@ set JAVA_EXE=java.exe
 %JAVA_EXE% -version >NUL 2>&1
 if %ERRORLEVEL% equ 0 goto execute
 
-echo.
-echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
 
 goto fail
 
@@ -57,11 +59,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
 
 if exist "%JAVA_EXE%" goto execute
 
-echo.
-echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
-echo.
-echo Please set the JAVA_HOME variable in your environment to match the
-echo location of your Java installation.
+echo. 1>&2
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
+echo. 1>&2
+echo Please set the JAVA_HOME variable in your environment to match the 1>&2
+echo location of your Java installation. 1>&2
 
 goto fail
 
scalardb-analytics-spark-sample/sample-data-loader/src/main/java/sample/Loader.java
@@ -2,6 +2,7 @@
 
 import com.scalar.db.api.DistributedTransaction;
 import com.scalar.db.api.DistributedTransactionManager;
+import com.scalar.db.api.Mutation;
 import com.scalar.db.api.Put;
 import com.scalar.db.exception.transaction.TransactionException;
 import com.scalar.db.io.Key;
@@ -14,29 +15,18 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVRecord;
 
 public class Loader implements AutoCloseable {
-  private static final String CUSTOMER_DATA = "/data/customer.csv";
   private static final String ORDERS_DATA = "/data/orders.csv";
   private static final String LINEITEM_DATA = "/data/lineitem.csv";
   private static final String CONFIG_FILE_PATH = "/etc/scalardb.properties";
   private static final String SCHEMA_FILE_PATH = "/etc/schema.json";
 
-  private static final String[] CUSTOMER_COLUMNS = {
-    "c_custkey",
-    "c_name",
-    "c_address",
-    "c_nationkey",
-    "c_phone",
-    "c_acctbal",
-    "c_mktsegment",
-    "c_comment"
-  };
-
   private static final String[] ORDERS_COLUMNS = {
     "o_orderkey",
     "o_custkey",
@@ -82,8 +72,6 @@ public void close() {
   public void load() throws TransactionException, IOException, SchemaLoaderException {
     loadSchema();
 
-    loadData(this.manager, CUSTOMER_DATA, CUSTOMER_COLUMNS, this::buildPutCustomer);
-
     loadData(this.manager, ORDERS_DATA, ORDERS_COLUMNS, this::buildPutOrders);
 
     loadData(this.manager, LINEITEM_DATA, LINEITEM_COLUMNS, this::buildPutLineitem);
@@ -101,25 +89,9 @@ private void loadSchema() throws SchemaLoaderException {
     SchemaLoader.load(configFilePath, schemaFilePath, options, createCoordinatorTables);
   }
 
-  private Put buildPutCustomer(CSVRecord record) {
-    return Put.newBuilder()
-        .namespace("dynamons")
-        .table("customer")
-        .partitionKey(Key.ofInt("c_custkey", intCol(record, "c_custkey")))
-        .textValue("c_name", stringCol(record, "c_name"))
-        .textValue("c_address", stringCol(record, "c_address"))
-        .intValue("c_nationkey", intCol(record, "c_nationkey"))
-        .textValue("c_phone", stringCol(record, "c_phone"))
-        .doubleValue("c_acctbal", doubleCol(record, "c_acctbal"))
-        .textValue("c_mktsegment", stringCol(record, "c_mktsegment"))
-        .textValue("c_comment", stringCol(record, "c_comment"))
-        .enableImplicitPreRead()
-        .build();
-  }
-
   private Put buildPutOrders(CSVRecord record) {
     return Put.newBuilder()
-        .namespace("postgresns")
+        .namespace("mysqlns")
         .table("orders")
         .partitionKey(Key.ofInt("o_orderkey", intCol(record, "o_orderkey")))
         .intValue("o_custkey", intCol(record, "o_custkey"))
@@ -175,7 +147,8 @@ private void loadData(
       transaction = manager.start();
       for (CSVRecord record : records) {
         Put put = putFunction.apply(record);
-        transaction.put(put);
+        List<Mutation> mutations = List.of(put);
+        transaction.mutate(mutations);
       }
       transaction.commit();
     } catch (TransactionException e) {
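
For reference, the loop above swaps the single-record transaction.put(put) call for the list-based mutate API. A minimal, self-contained sketch of the new write path (the writeAll helper is hypothetical; the real loader builds its Puts from CSV records):

    import com.scalar.db.api.DistributedTransaction;
    import com.scalar.db.api.DistributedTransactionManager;
    import com.scalar.db.api.Mutation;
    import com.scalar.db.api.Put;
    import com.scalar.db.exception.transaction.TransactionException;
    import java.util.List;

    public class MutateSketch {
      // Hypothetical helper mirroring Loader.loadData: one transaction, one
      // mutate() call per record, commit at the end, abort on failure.
      static void writeAll(DistributedTransactionManager manager, List<Put> puts)
          throws TransactionException {
        DistributedTransaction transaction = manager.start();
        try {
          for (Put put : puts) {
            List<Mutation> mutations = List.of(put); // Put is a Mutation
            transaction.mutate(mutations);
          }
          transaction.commit();
        } catch (TransactionException e) {
          transaction.abort(); // give up the partially applied transaction
          throw e;
        }
      }
    }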
25 changes: 7 additions & 18 deletions scalardb-analytics-spark-sample/scalardb.properties
@@ -1,29 +1,18 @@
 scalar.db.storage=multi-storage
-scalar.db.multi_storage.storages=cassandra,postgres,dynamodb
+scalar.db.multi_storage.storages=cassandra,mysql
 
 scalar.db.multi_storage.storages.cassandra.storage=cassandra
-scalar.db.multi_storage.storages.cassandra.contact_points=backend-cassandra
+scalar.db.multi_storage.storages.cassandra.contact_points=scalardb-cassandra
 scalar.db.multi_storage.storages.cassandra.contact_port=9042
 scalar.db.multi_storage.storages.cassandra.username=cassandra
 scalar.db.multi_storage.storages.cassandra.password=cassandra
 
-scalar.db.multi_storage.storages.postgres.storage=jdbc
-scalar.db.multi_storage.storages.postgres.contact_points=jdbc:postgresql://backend-postgres:5432/test
-scalar.db.multi_storage.storages.postgres.username=postgres
-scalar.db.multi_storage.storages.postgres.password=postgres
-scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.min_idle=5
-scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.max_idle=10
-scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.max_total=25
+scalar.db.multi_storage.storages.mysql.storage=jdbc
+scalar.db.multi_storage.storages.mysql.contact_points=jdbc:mysql://scalardb-mysql:3306/sampledb
+scalar.db.multi_storage.storages.mysql.username=root
+scalar.db.multi_storage.storages.mysql.password=mysql
 
-scalar.db.multi_storage.storages.dynamodb.contact_points=ap-northeast-1
-scalar.db.multi_storage.storages.dynamodb.username=access_key_id
-scalar.db.multi_storage.storages.dynamodb.password=secret_access_key
-scalar.db.multi_storage.storages.dynamodb.storage=dynamo
-scalar.db.multi_storage.storages.dynamodb.dynamo.endpoint_override=http://backend-dynamodb:8000
-scalar.db.multi_storage.storages.dynamodb.dynamo.table_metadata.namespace=table_metadata
-scalar.db.multi_storage.storages.dynamodb.dynamo.namespace.prefix=scalar_
-
-scalar.db.multi_storage.namespace_mapping=cassandrans:cassandra,postgresns:postgres,dynamons:dynamodb
+scalar.db.multi_storage.namespace_mapping=cassandrans:cassandra,mysqlns:mysql
 
 scalar.db.multi_storage.default_storage=cassandra
 
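The namespace mapping is what lets application code follow the storage swap unchanged: tables in mysqlns now live in MySQL, while cassandrans stays on Cassandra. A sketch of how a client would pick this config up, assuming ScalarDB's standard TransactionFactory bootstrap (the table and column names echo the sample's orders schema; this is an illustration, not code from the PR):

    import com.scalar.db.api.DistributedTransaction;
    import com.scalar.db.api.DistributedTransactionManager;
    import com.scalar.db.api.Mutation;
    import com.scalar.db.api.Put;
    import com.scalar.db.io.Key;
    import com.scalar.db.service.TransactionFactory;
    import java.util.List;

    public class NamespaceRoutingSketch {
      public static void main(String[] args) throws Exception {
        // Reads the multi-storage config above; a write to "mysqlns" is routed
        // to the scalardb-mysql service per scalar.db.multi_storage.namespace_mapping.
        TransactionFactory factory = TransactionFactory.create("/etc/scalardb.properties");
        DistributedTransactionManager manager = factory.getTransactionManager();

        DistributedTransaction tx = manager.start();
        Put put =
            Put.newBuilder()
                .namespace("mysqlns")
                .table("orders")
                .partitionKey(Key.ofInt("o_orderkey", 1))
                .intValue("o_custkey", 1)
                .build();
        List<Mutation> mutations = List.of(put);
        tx.mutate(mutations);
        tx.commit();
      }
    }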