Skip to content

Commit ef896e2

Browse files
authored
Merge pull request #24 from shuiyisong/add-greptimedb-formal
Add GreptimeDB
2 parents c48c325 + df18246 commit ef896e2

19 files changed

Lines changed: 533 additions & 0 deletions

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ While the main benchmark uses a specific machine configuration for reproducibili
148148
- [ ] FerretDB
149149
- [ ] Apache Drill
150150
- [ ] GlareDB
151+
- [x] GreptimeDB
151152

152153
## Similar projects
153154

greptimedb/count.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Print the total row count of the `bluesky` table via GreptimeDB's HTTP SQL API.
#
# Output: the bare count on stdout (extracted from the JSON response).
set -euo pipefail

# --fail: curl exits non-zero on HTTP errors; with pipefail the script then fails loudly.
# The grep pattern anchors on the quoted JSON key so a stray "cnt" substring elsewhere
# in the response cannot match.
curl -s --fail http://localhost:4000/v1/sql \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "sql=select count(*) as cnt from bluesky" \
    -d "format=json" \
    | grep -o '"cnt":[0-9]*' | sed 's/"cnt"://'

greptimedb/data_size.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Print the on-disk SST data size (bytes) of the `bluesky` table, summed over
# its regions via information_schema, using GreptimeDB's HTTP SQL API.
#
# Output: the bare byte count on stdout (extracted from the JSON response).
set -euo pipefail

# --fail: curl exits non-zero on HTTP errors; with pipefail the script then fails loudly.
# The grep pattern anchors on the quoted JSON key to avoid accidental substring matches.
curl -s --fail http://localhost:4000/v1/sql \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "sql=SELECT sum(r.sst_size) as data_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \
    -d "format=json" \
    | grep -o '"data_size":[0-9]*' | sed 's/"data_size"://'

greptimedb/drop_tables.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Stop the GreptimeDB server (if running) and delete all of its on-disk data.
set -euo pipefail

echo "Stopping GreptimeDB"
# `|| true` so a non-running server does not abort the script under `set -e`.
pids=$(pidof greptime || true)
if [[ -n "$pids" ]]; then
    # $pids is intentionally unquoted: pidof may return several space-separated PIDs.
    kill $pids
    # Wait for the process(es) to actually exit so we do not delete the data
    # directory out from under a server that is still flushing to disk.
    while kill -0 $pids 2>/dev/null; do
        sleep 1
    done
fi

echo "Dropping all data"
rm -rf ./greptimedb_data

greptimedb/index_size.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Print the on-disk index size (bytes) of the `bluesky` table, summed over
# its regions via information_schema, using GreptimeDB's HTTP SQL API.
#
# Output: the bare byte count on stdout (extracted from the JSON response).
set -euo pipefail

# --fail: curl exits non-zero on HTTP errors; with pipefail the script then fails loudly.
# The grep pattern anchors on the quoted JSON key to avoid accidental substring matches.
curl -s --fail http://localhost:4000/v1/sql \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "sql=SELECT sum(r.index_size) as index_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \
    -d "format=json" \
    | grep -o '"index_size":[0-9]*' | sed 's/"index_size"://'

greptimedb/install.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Download the GreptimeDB release archive and place the `greptime` binary in
# the current directory. With strict mode a failed download or extraction
# aborts before we move/delete anything.
set -euo pipefail

readonly RELEASE_VERSION=v0.13.0-nightly-20250315
readonly ARCHIVE="greptime-linux-amd64-${RELEASE_VERSION}"

# -N: only re-download when the remote file is newer than the local copy.
wget -N "https://github.com/GreptimeTeam/greptimedb/releases/download/${RELEASE_VERSION}/${ARCHIVE}.tar.gz"
tar xzf "${ARCHIVE}.tar.gz"
mv "${ARCHIVE}/greptime" ./
rm -rf "${ARCHIVE}"

greptimedb/load_data.sh

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
#!/bin/bash
# Bulk-load gzipped NDJSON files into the GreptimeDB `bluesky` table through
# the HTTP log-ingestion API (pipeline `jsonbench`), then flush the table so
# on-disk size measurements are complete.
#
# Usage: ./load_data.sh <DATA_DIRECTORY> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>

# Check if the required arguments are provided
if [[ $# -lt 4 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
MAX_FILES="$2"
SUCCESS_LOG="$3"
ERROR_LOG="$4"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

pushd "$DATA_DIRECTORY" > /dev/null

counter=0
# Glob instead of parsing `ls`; the shell expands *.json.gz in sorted order.
for file in *.json.gz; do
    [[ -e "$file" ]] || break   # no matching files at all
    echo "Processing file: $file"

    # --fail makes curl return non-zero on HTTP errors, so server-side
    # failures actually reach the error log below.
    curl --fail "http://localhost:4000/v1/events/logs?table=bluesky&pipeline_name=jsonbench&ignore_errors=true" \
        -H "Content-Type: application/x-ndjson" \
        -H "Content-Encoding: gzip" \
        --data-binary @"$file"
    # Capture curl's exit status IMMEDIATELY — any command in between (even a
    # bare `echo`) would overwrite $? and make every import look successful.
    first_attempt=$?
    echo ""

    if [[ $first_attempt -eq 0 ]]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG"
    else
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed for $file. Giving up." >> "$ERROR_LOG"
    fi

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done

popd > /dev/null

# Flush memtables to SST files so the reported data/index sizes are complete.
curl -XPOST -H 'Content-Type: application/x-www-form-urlencoded' \
    http://localhost:4000/v1/sql \
    -d "sql=admin flush_table('bluesky')" \
    -d "format=json"

echo -e "\nLoaded $MAX_FILES data files from $DATA_DIRECTORY to GreptimeDB."

greptimedb/main.sh

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
#!/bin/bash
# Interactive driver for the GreptimeDB bluesky benchmark: installs the
# server, then for the chosen dataset size loads data, records size/count/
# query metrics into prefixed output files, and tears everything down.

# Default data directory
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Positional overrides: data dir, success log, error log, output-file prefix.
DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}"
SUCCESS_LOG="${2:-success.log}"
ERROR_LOG="${3:-error.log}"
OUTPUT_PREFIX="${4:-_m6i.8xlarge}"

# Bail out early when the data directory is missing.
if [[ ! -d "$DATA_DIRECTORY" ]]; then
  echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
  exit 1
fi

echo "Select the dataset size to benchmark:"
echo "1) 1m (default)"
echo "2) 10m"
echo "3) 100m"
echo "4) 1000m"
echo "5) all"
read -p "Enter the number corresponding to your choice: " choice

./install.sh

# Run one full benchmark round for <size> input files.
benchmark() {
  local size=$1

  # Refuse to run when the directory holds fewer files than requested.
  local file_count
  file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
  if (( file_count < size )); then
    echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
    exit 1
  fi

  ./start.sh
  ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"

  # Capture each metric both on screen and in a per-size output file.
  local out="${OUTPUT_PREFIX}_bluesky_${size}m"
  ./total_size.sh  | tee "${out}.total_size"
  ./data_size.sh   | tee "${out}.data_size"
  ./index_size.sh  | tee "${out}.index_size"
  ./count.sh       | tee "${out}.count"
  ./run_queries.sh | tee "${out}.results_runtime"

  ./drop_tables.sh
}

case $choice in
  2) benchmark 10 ;;
  3) benchmark 100 ;;
  4) benchmark 1000 ;;
  5)
    benchmark 1
    benchmark 10
    benchmark 100
    benchmark 1000
    ;;
  *) benchmark 1 ;;
esac

greptimedb/pipeline.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# GreptimeDB ingestion pipeline ("jsonbench") for the bluesky NDJSON events.
# Processors reshape each incoming record; transform declares column types
# and indexing for the resulting `bluesky` table.
processors:
  # Parse `time_us` as a microsecond-resolution epoch timestamp.
  - epoch:
      fields:
        - time_us
      resolution: microsecond
  # Extract commit["collection"] into a flat `commit_collection` field;
  # records without a commit object are passed through unchanged.
  - simple_extract:
      fields:
        - commit, commit_collection
      key: "collection"
      ignore_missing: true
  # Extract commit["operation"] into a flat `commit_operation` field.
  - simple_extract:
      fields:
        - commit, commit_operation
      key: "operation"
      ignore_missing: true

transform:
  - fields:
      - did
    type: string
  # Dimension columns used for filtering/grouping in the benchmark queries:
  # stored as tags with an inverted index.
  - fields:
      - kind
      - commit_collection
      - commit_operation
    type: string
    index: inverted
    tag: true
  # Keep the raw commit object as a JSON column; rows where the value cannot
  # be converted are ingested without it rather than rejected.
  - fields:
      - commit
    type: json
    on_failure: ignore
  # `time_us` (microsecond epoch) becomes the table's time index.
  - fields:
      - time_us
    type: epoch, us
    index: timestamp

greptimedb/queries.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
-- Benchmark queries against the `bluesky` table, one statement per line.
-- NOTE(review): keep each query on a single line — the query runner is assumed
-- to execute this file line-wise; verify it also skips `--` comment lines.
SELECT commit_collection AS event, count(1) AS cnt FROM bluesky GROUP BY event ORDER BY cnt DESC;
SELECT commit_collection AS event, count(1) AS cnt, count(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' GROUP BY event ORDER BY cnt DESC;
SELECT commit_collection AS event, date_part('hour', time_us) AS hour_of_day, count(1) AS cnt FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT did AS user_id, min(time_us) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
SELECT did AS user_id, date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;

0 commit comments

Comments
 (0)