From 5bcd931231e66ef77db25c7ba57d602281b7ce02 Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 20 Sep 2023 00:37:31 +0300 Subject: [PATCH 01/13] Create benchmark.sh --- spark/benchmark.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 spark/benchmark.sh diff --git a/spark/benchmark.sh b/spark/benchmark.sh new file mode 100644 index 000000000..897263b71 --- /dev/null +++ b/spark/benchmark.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +sudo apt-get update +sudo apt-get -y install openjdk-8-jdk-headless + +# For Spark3.0.1 installation: +# wget --continue https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz +# tar -xzf spark-3.0.1-bin-hadoop2.7.tgz +# mv spark-3.0.1-bin-hadoop2.7 spark + +wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' +#gzip -d hits.tsv.gz +chmod 777 ~ hits.tsv +$HADOOP_HOME/bin/hdfs dfs -put hits.tsv / + +$SPARK_HOME/bin/spark-shell --master local -i ClickBenchRunner.scala > log.txt From 358473c1cd56c0d52c590173badb0b8c9610ebf2 Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 20 Sep 2023 00:38:59 +0300 Subject: [PATCH 02/13] Create .gitkeep --- spark/.gitkeep | 1 + 1 file changed, 1 insertion(+) create mode 100644 spark/.gitkeep diff --git a/spark/.gitkeep b/spark/.gitkeep new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/spark/.gitkeep @@ -0,0 +1 @@ + From e70c62075291d6e50d7c4191cc561a712afcf184 Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 20 Sep 2023 00:42:32 +0300 Subject: [PATCH 03/13] Create ClickBenchRunner.scala --- spark/ClickBenchRunner.scala | 146 +++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 spark/ClickBenchRunner.scala diff --git a/spark/ClickBenchRunner.scala b/spark/ClickBenchRunner.scala new file mode 100644 index 000000000..8040a0436 --- /dev/null +++ b/spark/ClickBenchRunner.scala @@ -0,0 +1,146 @@ +import org.apache.spark.sql.{SparkSession, SQLContext} +import java.sql.Statement 
+import java.io.{FileOutputStream, File} +import scala.io.Source +import scala.io.Codec +import org.apache.spark.sql.types._ + +val spark = SparkSession.builder.enableHiveSupport().getOrCreate() +spark.sql("DROP TABLE IF EXISTS hits") +val logFile = "log.txt" +val fileOutputStream = new FileOutputStream(logFile) +val fileStream = new java.io.PrintStream(fileOutputStream, true, Codec.UTF8.toString) +System.setOut(fileStream) + +val schema = StructType( + List( + StructField("WatchID", LongType, nullable = false), + StructField("JavaEnable", ShortType, nullable = false), + StructField("Title", StringType, nullable = false), + StructField("GoodEvent", ShortType, nullable = false), + StructField("EventTime", TimestampType, nullable = false), + StructField("EventDate", DateType, nullable = false), + StructField("CounterID", IntegerType, nullable = false), + StructField("ClientIP", IntegerType, nullable = false), + StructField("RegionID", IntegerType, nullable = false), + StructField("UserID", LongType, nullable = false), + StructField("CounterClass", ShortType, nullable = false), + StructField("OS", ShortType, nullable = false), + StructField("UserAgent", ShortType, nullable = false), + StructField("URL", StringType, nullable = false), + StructField("Referer", StringType, nullable = false), + StructField("IsRefresh", ShortType, nullable = false), + StructField("RefererCategoryID", ShortType, nullable = false), + StructField("RefererRegionID", IntegerType, nullable = false), + StructField("URLCategoryID", ShortType, nullable = false), + StructField("URLRegionID", IntegerType, nullable = false), + StructField("ResolutionWidth", ShortType, nullable = false), + StructField("ResolutionHeight", ShortType, nullable = false), + StructField("ResolutionDepth", ShortType, nullable = false), + StructField("FlashMajor", ShortType, nullable = false), + StructField("FlashMinor", ShortType, nullable = false), + StructField("FlashMinor2", StringType, nullable = false), + 
StructField("NetMajor", ShortType, nullable = false), + StructField("NetMinor", ShortType, nullable = false), + StructField("UserAgentMajor", ShortType, nullable = false), + StructField("UserAgentMinor", StringType, nullable = false), + StructField("CookieEnable", ShortType, nullable = false), + StructField("JavascriptEnable", ShortType, nullable = false), + StructField("IsMobile", ShortType, nullable = false), + StructField("MobilePhone", ShortType, nullable = false), + StructField("MobilePhoneModel", StringType, nullable = false), + StructField("Params", StringType, nullable = false), + StructField("IPNetworkID", IntegerType, nullable = false), + StructField("TraficSourceID", ShortType, nullable = false), + StructField("SearchEngineID", ShortType, nullable = false), + StructField("SearchPhrase", StringType, nullable = false), + StructField("AdvEngineID", ShortType, nullable = false), + StructField("IsArtifical", ShortType, nullable = false), + StructField("WindowClientWidth", ShortType, nullable = false), + StructField("WindowClientHeight", ShortType, nullable = false), + StructField("ClientTimeZone", ShortType, nullable = false), + StructField("ClientEventTime", TimestampType, nullable = false), + StructField("SilverlightVersion1", ShortType, nullable = false), + StructField("SilverlightVersion2", ShortType, nullable = false), + StructField("SilverlightVersion3", IntegerType, nullable = false), + StructField("SilverlightVersion4", ShortType, nullable = false), + StructField("PageCharset", StringType, nullable = false), + StructField("CodeVersion", IntegerType, nullable = false), + StructField("IsLink", ShortType, nullable = false), + StructField("IsDownload", ShortType, nullable = false), + StructField("IsNotBounce", ShortType, nullable = false), + StructField("FUniqID", LongType, nullable = false), + StructField("OriginalURL", StringType, nullable = false), + StructField("HID", IntegerType, nullable = false), + StructField("IsOldCounter", ShortType, nullable = 
false), + StructField("IsEvent", ShortType, nullable = false), + StructField("IsParameter", ShortType, nullable = false), + StructField("DontCountHits", ShortType, nullable = false), + StructField("WithHash", ShortType, nullable = false), + StructField("HitColor", StringType, nullable = false), + StructField("LocalEventTime", TimestampType, nullable = false), + StructField("Age", ShortType, nullable = false), + StructField("Sex", ShortType, nullable = false), + StructField("Income", ShortType, nullable = false), + StructField("Interests", ShortType, nullable = false), + StructField("Robotness", ShortType, nullable = false), + StructField("RemoteIP", IntegerType, nullable = false), + StructField("WindowName", IntegerType, nullable = false), + StructField("OpenerName", IntegerType, nullable = false), + StructField("HistoryLength", ShortType, nullable = false), + StructField("BrowserLanguage", StringType, nullable = false), + StructField("BrowserCountry", StringType, nullable = false), + StructField("SocialNetwork", StringType, nullable = false), + StructField("SocialAction", StringType, nullable = false), + StructField("HTTPError", ShortType, nullable = false), + StructField("SendTiming", IntegerType, nullable = false), + StructField("DNSTiming", IntegerType, nullable = false), + StructField("ConnectTiming", IntegerType, nullable = false), + StructField("ResponseStartTiming", IntegerType, nullable = false), + StructField("ResponseEndTiming", IntegerType, nullable = false), + StructField("FetchTiming", IntegerType, nullable = false), + StructField("SocialSourceNetworkID", ShortType, nullable = false), + StructField("SocialSourcePage", StringType, nullable = false), + StructField("ParamPrice", LongType, nullable = false), + StructField("ParamOrderID", StringType, nullable = false), + StructField("ParamCurrency", StringType, nullable = false), + StructField("ParamCurrencyID", ShortType, nullable = false), + StructField("OpenstatServiceName", StringType, nullable = 
false), + StructField("OpenstatCampaignID", StringType, nullable = false), + StructField("OpenstatAdID", StringType, nullable = false), + StructField("OpenstatSourceID", StringType, nullable = false), + StructField("UTMSource", StringType, nullable = false), + StructField("UTMMedium", StringType, nullable = false), + StructField("UTMCampaign", StringType, nullable = false), + StructField("UTMContent", StringType, nullable = false), + StructField("UTMTerm", StringType, nullable = false), + StructField("FromTag", StringType, nullable = false), + StructField("HasGCLID", ShortType, nullable = false), + StructField("RefererHash", LongType, nullable = false), + StructField("URLHash", LongType, nullable = false), + StructField("CLID", IntegerType, nullable = false)) +) + +val startTable = System.nanoTime() +val df = spark.read.schema(schema).option("header", "false").csv("/hits_neww.csv") +df.createOrReplaceTempView("hits") +val endTable = System.nanoTime() +val timeElapsedTable = (endTable - startTable) / 1000000 +println(s"Creating table time: $timeElapsedTable ms") + +val queries = Source.fromFile("queries.sql").getLines().toList +var itr: Int = 0 + queries.foreach(query => { + val start = System.nanoTime() + val result = spark.sql(query) + val end = System.nanoTime() + val timeElapsed = (end - start) / 1000000 + println(s"Query $itr | Time: $timeElapsed ms") + itr += 1 + }) + + +fileStream.close() +fileOutputStream.close() +spark.stop() +System.exit(0) From fbf405870be9456051ef8a3a13e71cd4391b5f65 Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 20 Sep 2023 00:43:07 +0300 Subject: [PATCH 04/13] Create queries.sql --- spark/queries.sql | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 spark/queries.sql diff --git a/spark/queries.sql b/spark/queries.sql new file mode 100644 index 000000000..e00090f26 --- /dev/null +++ b/spark/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE 
AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY 
SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 
37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, 
URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE 
CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10; From 70f531457fba4eb878a3e19dc98551f3c9516ce5 Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 20 Sep 2023 00:44:18 +0300 Subject: [PATCH 05/13] Update ClickBenchRunner.scala --- spark/ClickBenchRunner.scala | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/spark/ClickBenchRunner.scala b/spark/ClickBenchRunner.scala index 8040a0436..0d39a1dd0 100644 --- a/spark/ClickBenchRunner.scala +++ b/spark/ClickBenchRunner.scala @@ -1,16 +1,9 @@ import org.apache.spark.sql.{SparkSession, SQLContext} import java.sql.Statement -import java.io.{FileOutputStream, File} -import scala.io.Source -import scala.io.Codec import org.apache.spark.sql.types._ val spark = SparkSession.builder.enableHiveSupport().getOrCreate() spark.sql("DROP TABLE IF EXISTS hits") -val logFile = "log.txt" -val fileOutputStream = new FileOutputStream(logFile) -val fileStream = new java.io.PrintStream(fileOutputStream, true, Codec.UTF8.toString) -System.setOut(fileStream) val schema = StructType( List( @@ -122,7 +115,7 @@ val schema = StructType( ) val startTable = System.nanoTime() -val df = spark.read.schema(schema).option("header", "false").csv("/hits_neww.csv") +val df = spark.read.schema(schema).option("header", "false").option("sep", "\t").csv("/hits.tsv") df.createOrReplaceTempView("hits") val endTable = System.nanoTime() val timeElapsedTable = (endTable - startTable) / 1000000 @@ -139,8 +132,5 @@ var itr: Int = 0 itr += 1 }) - -fileStream.close() -fileOutputStream.close() spark.stop() System.exit(0) From 5486b1c690f026c3b2aea19d08398a2f487972cb Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Sun, 24 Sep 2023 16:26:48 +0300 Subject: [PATCH 06/13] Delete spark/.gitkeep --- spark/.gitkeep | 1 - 1 file changed, 1 deletion(-) delete mode 100644 spark/.gitkeep diff 
--git a/spark/.gitkeep b/spark/.gitkeep deleted file mode 100644 index 8b1378917..000000000 --- a/spark/.gitkeep +++ /dev/null @@ -1 +0,0 @@ - From 465593f0763cfef231b1956429a173da42700abc Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Mon, 5 Feb 2024 14:50:14 +0300 Subject: [PATCH 07/13] Added Spark and HDFS deployment process --- spark/benchmark.sh | 54 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/spark/benchmark.sh b/spark/benchmark.sh index 897263b71..c4edd1613 100644 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -3,13 +3,57 @@ sudo apt-get update sudo apt-get -y install openjdk-8-jdk-headless -# For Spark3.0.1 installation: -# wget --continue https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz -# tar -xzf spark-3.0.1-bin-hadoop2.7.tgz -# mv spark-3.0.1-bin-hadoop2.7 spark +### If there is no HDFS and Spark on your system: + +# wget --continue https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz +# wget --continue https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz +# tar -xzf hadoop-3* +# tar -xzf spark-3* +# mv hadoop-3* +# mv spark-3* + +# echo "export HADOOP_HOME=/usr/local/hadoop" >> ~/.bashrc +# echo "export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin" >> ~/.bashrc +# echo "export SPARK_HOME=/usr/local/spark>" >> ~/.bashrc +# echo "export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin" >> ~/.bashrc + +### To configure HDFS: + +#cd /usr/local/hadoop/etc/hadoop +#cp core-site.xml core-site.xml.bak +#cp hdfs-site.xml hdfs-site.xml.bak +# echo " +# +# fs.defaultFS +# hdfs://localhost:9000 +# +# " > core-site.xml +# echo " +# +# dfs.replication +# 1 +# +# " > hdfs-site.xml + +### To configure Spark: + +# cd /usr/local/spark/conf +# cp spark-env.sh.template spark-env.sh +# echo "export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop" >> spark-env.sh +# echo "export SPARK_MASTER_HOST=localhost" >> spark-env.sh + +### To run 
Spark and HDFS: + +# start-master.sh +# start-slave.sh spark://localhost:7077 +# hdfs namenode -format +# start-dfs.sh + + +source ~/.bashrc wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' -#gzip -d hits.tsv.gz +gzip -d hits.tsv.gz chmod 777 ~ hits.tsv $HADOOP_HOME/bin/hdfs dfs -put hits.tsv / From e5434c665863b923b2ef4702bd5881513823ce2e Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Mon, 5 Feb 2024 16:27:35 +0300 Subject: [PATCH 08/13] Minor fixes (paths, no log redirection...) --- spark/benchmark.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spark/benchmark.sh b/spark/benchmark.sh index c4edd1613..e2dba9500 100644 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -9,8 +9,8 @@ sudo apt-get -y install openjdk-8-jdk-headless # wget --continue https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz # tar -xzf hadoop-3* # tar -xzf spark-3* -# mv hadoop-3* -# mv spark-3* +# mv hadoop-3* /usr/local/hadoop +# mv spark-3* /usr/local/spark # echo "export HADOOP_HOME=/usr/local/hadoop" >> ~/.bashrc # echo "export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin" >> ~/.bashrc @@ -55,6 +55,6 @@ source ~/.bashrc wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' gzip -d hits.tsv.gz chmod 777 ~ hits.tsv -$HADOOP_HOME/bin/hdfs dfs -put hits.tsv / +hdfs dfs -put hits.tsv / -$SPARK_HOME/bin/spark-shell --master local -i ClickBenchRunner.scala > log.txt +$SPARK_HOME/bin/spark-shell --master local -i ClickBenchRunner.scala From eeff7a6cc9000a51c8a363e084a24bd740d18fda Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Mon, 5 Feb 2024 16:28:11 +0300 Subject: [PATCH 09/13] Create log.txt --- spark/log.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 spark/log.txt diff --git a/spark/log.txt b/spark/log.txt new file mode 100644 index 000000000..b334aa948 --- /dev/null +++ b/spark/log.txt @@ -0,0 +1,44 @@ +Creating table 
time: 236 ms +Query 0 | Time: 18325 ms +Query 1 | Time: 414 ms +Query 2 | Time: 378 ms +Query 3 | Time: 240 ms +Query 4 | Time: 2384 ms +Query 5 | Time: 1577 ms +Query 6 | Time: 246 ms +Query 7 | Time: 988 ms +Query 8 | Time: 2020 ms +Query 9 | Time: 2689 ms +Query 10 | Time: 3097 ms +Query 11 | Time: 2011 ms +Query 12 | Time: 950 ms +Query 13 | Time: 1424 ms +Query 14 | Time: 715 ms +Query 15 | Time: 832 ms +Query 16 | Time: 849 ms +Query 17 | Time: 773 ms +Query 18 | Time: 1053 ms +Query 19 | Time: 143 ms +Query 20 | Time: 149 ms +Query 21 | Time: 854 ms +Query 22 | Time: 1877 ms +Query 23 | Time: 554 ms +Query 24 | Time: 216 ms +Query 25 | Time: 154 ms +Query 26 | Time: 161 ms +Query 27 | Time: 1060 ms +Query 28 | Time: 992 ms +Query 29 | Time: 642 ms +Query 30 | Time: 1096 ms +Query 31 | Time: 981 ms +Query 32 | Time: 1099 ms +Query 33 | Time: 1055 ms +Query 34 | Time: 822 ms +Query 35 | Time: 692 ms +Query 36 | Time: 929 ms +Query 37 | Time: 855 ms +Query 38 | Time: 790 ms +Query 39 | Time: 800 ms +Query 40 | Time: 704 ms +Query 41 | Time: 814 ms +Query 42 | Time: 923 ms From 67e5025f262ba29492343e8bfae68dc769af6030 Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Mon, 5 Feb 2024 16:34:29 +0300 Subject: [PATCH 10/13] Added log-file support --- spark/ClickBenchRunner.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/spark/ClickBenchRunner.scala b/spark/ClickBenchRunner.scala index 0d39a1dd0..2ff15a983 100644 --- a/spark/ClickBenchRunner.scala +++ b/spark/ClickBenchRunner.scala @@ -1,9 +1,17 @@ import org.apache.spark.sql.{SparkSession, SQLContext} import java.sql.Statement +import java.io.{File, FileWriter} +import scala.io.Source +import scala.io.Codec import org.apache.spark.sql.types._ val spark = SparkSession.builder.enableHiveSupport().getOrCreate() spark.sql("DROP TABLE IF EXISTS hits") +val logFile = "log.txt" + +val file = new File(logFile) +val writer = new FileWriter(file) + val schema = StructType( List( @@ -119,7 
+127,7 @@ val df = spark.read.schema(schema).option("header", "false").option("sep", "\t") df.createOrReplaceTempView("hits") val endTable = System.nanoTime() val timeElapsedTable = (endTable - startTable) / 1000000 -println(s"Creating table time: $timeElapsedTable ms") +writer.write(s"Creating table time: $timeElapsedTable ms") val queries = Source.fromFile("queries.sql").getLines().toList var itr: Int = 0 @@ -128,7 +136,7 @@ var itr: Int = 0 val result = spark.sql(query) val end = System.nanoTime() val timeElapsed = (end - start) / 1000000 - println(s"Query $itr | Time: $timeElapsed ms") + writer.write(s"Query $itr | Time: $timeElapsed ms") itr += 1 }) From faff18be4fa7b87a85e2eadfcec012134b0b4d2e Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 7 Feb 2024 10:43:14 +0300 Subject: [PATCH 11/13] Spark & HDFS folders updated --- spark/benchmark.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spark/benchmark.sh b/spark/benchmark.sh index e2dba9500..6975bdab0 100644 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -9,8 +9,10 @@ sudo apt-get -y install openjdk-8-jdk-headless # wget --continue https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz # tar -xzf hadoop-3* # tar -xzf spark-3* -# mv hadoop-3* /usr/local/hadoop -# mv spark-3* /usr/local/spark +# sudo mkdir -p /urs/local/hadoop +# sudo mkdir -p /usr/local/spark +# sudo mv hadoop-3* /usr/local/hadoop +# sudo mv spark-3* /usr/local/spark # echo "export HADOOP_HOME=/usr/local/hadoop" >> ~/.bashrc # echo "export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin" >> ~/.bashrc From 3f0fbf663faa7537ecf5f0fd8de384728686b81f Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 21 Feb 2024 15:09:14 +0300 Subject: [PATCH 12/13] Fixed issues with JAVA_HOME, ssh connection and correct HDFS scripts execution --- spark/benchmark.sh | 53 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/spark/benchmark.sh 
b/spark/benchmark.sh index 6975bdab0..dc6d35d5c 100644 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -5,23 +5,28 @@ sudo apt-get -y install openjdk-8-jdk-headless ### If there is no HDFS and Spark on your system: +export HADOOP_DIR="/usr/local/hadoop" +export SPARK_DIR="/usr/local/spark" +export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" + # wget --continue https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz -# wget --continue https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz -# tar -xzf hadoop-3* -# tar -xzf spark-3* -# sudo mkdir -p /urs/local/hadoop -# sudo mkdir -p /usr/local/spark -# sudo mv hadoop-3* /usr/local/hadoop -# sudo mv spark-3* /usr/local/spark - -# echo "export HADOOP_HOME=/usr/local/hadoop" >> ~/.bashrc -# echo "export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin" >> ~/.bashrc -# echo "export SPARK_HOME=/usr/local/spark>" >> ~/.bashrc -# echo "export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin" >> ~/.bashrc +# wget --continue https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0.tgz +# tar -xzf hadoop-3*.tar.gz +# tar -xzf spark-3*.tgz +# sudo rm -f spark-3*.tgz hadoop-3*.tar.gz +# sudo mv spark-3* spark +# sudo mv hadoop-3* hadoop +# sudo mv spark $HADOOP_DIR +# sudo mv hadoop $SPARK_DIR + +echo "export HADOOP_HOME=$HADOOP_DIR" >> ~/.bashrc +echo "export SPARK_HOME=$SPARK_DIR" >> ~/.bashrc +echo "export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH" >> ~/.bashrc +# source ~/.bashrc ### To configure HDFS: -#cd /usr/local/hadoop/etc/hadoop +#cd $HADOOP_HOME/etc/hadoop #cp core-site.xml core-site.xml.bak #cp hdfs-site.xml hdfs-site.xml.bak # echo " @@ -36,23 +41,29 @@ sudo apt-get -y install openjdk-8-jdk-headless # 1 # # " > hdfs-site.xml +# echo "export JAVA_HOME=$JAVA_HOME" >> hadoop-env.sh ### To configure Spark: -# cd /usr/local/spark/conf +# cd $SPARK_HOME/conf # cp spark-env.sh.template spark-env.sh -# echo "export 
HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop" >> spark-env.sh +# echo "export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop" >> spark-env.sh # echo "export SPARK_MASTER_HOST=localhost" >> spark-env.sh -### To run Spark and HDFS: +### To set up passwordless SSH to localhost: -# start-master.sh -# start-slave.sh spark://localhost:7077 -# hdfs namenode -format -# start-dfs.sh +# ssh-keygen -t rsa +# cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +# chmod og-wx ~/.ssh/authorized_keys ### To run Spark and HDFS: -source ~/.bashrc +# cd $SPARK_HOME/sbin +# ./start-master.sh +# ./start-slave.sh spark://localhost:7077 +# cd $HADOOP_HOME/sbin +# hdfs namenode -format +# ./start-dfs.sh wget --continue 'https://datasets.clickhouse.com/hits_compatible/hits.tsv.gz' gzip -d hits.tsv.gz From cdbe14d41efdc735597d9611b31b1d3d76033d6d Mon Sep 17 00:00:00 2001 From: DoubleMindy Date: Wed, 21 Feb 2024 15:22:31 +0300 Subject: [PATCH 13/13] Minor refactoring --- spark/benchmark.sh | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/spark/benchmark.sh b/spark/benchmark.sh index dc6d35d5c..c59202f51 100644 --- a/spark/benchmark.sh +++ b/spark/benchmark.sh @@ -8,20 +8,25 @@ sudo apt-get -y install openjdk-8-jdk-headless export HADOOP_DIR="/usr/local/hadoop" export SPARK_DIR="/usr/local/spark" export JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64" +export HADOOP_VERSION="3.3.6" +export SPARK_VERSION="3.5.0" -# wget --continue https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz -# wget --continue https://downloads.apache.org/spark/spark-3.5.0/spark-3.5.0.tgz -# tar -xzf hadoop-3*.tar.gz -# tar -xzf spark-3*.tgz -# sudo rm -f spark-3*.tgz hadoop-3*.tar.gz -# sudo mv spark-3* spark -# sudo mv hadoop-3* hadoop -# sudo mv spark $HADOOP_DIR -# sudo mv hadoop $SPARK_DIR -echo "export HADOOP_HOME=$HADOOP_DIR" >> ~/.bashrc -echo "export SPARK_HOME=$SPARK_DIR" >> ~/.bashrc -echo "export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH" >> ~/.bashrc + +# export hadoop_file=hadoop-$HADOOP_VERSION.tar.gz +# export spark_file=spark-$SPARK_VERSION-bin-hadoop3.tgz +# wget --continue https://downloads.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/$hadoop_file +# wget --continue https://downloads.apache.org/spark/spark-$SPARK_VERSION/$spark_file +# sudo tar -xzf $hadoop_file && sudo tar -xzf $spark_file +# sudo rm -f $spark_file $hadoop_file +# sudo rm -rf $HADOOP_DIR $SPARK_DIR +# sudo mv spark-$SPARK_VERSION-bin-hadoop3 spark +# sudo mv spark $SPARK_DIR +# sudo mv hadoop-$HADOOP_VERSION hadoop +# sudo mv hadoop $HADOOP_DIR + +# echo "export HADOOP_HOME=$HADOOP_DIR" >> ~/.bashrc +# echo "export SPARK_HOME=$SPARK_DIR" >> ~/.bashrc +# echo "export PATH=$SPARK_HOME/bin:$SPARK_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH" >> ~/.bashrc # source ~/.bashrc ### To configure HDFS: