Skip to content

Commit ef896e2

Browse files
authored
Merge pull request #24 from shuiyisong/add-greptimedb-formal
Add GreptimeDB
2 parents c48c325 + df18246 commit ef896e2

19 files changed

Lines changed: 533 additions & 0 deletions

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ While the main benchmark uses a specific machine configuration for reproducibili
148148
- [ ] FerretDB
149149
- [ ] Apache Drill
150150
- [ ] GlareDB
151+
- [x] GreptimeDB
151152

152153
## Similar projects
153154

greptimedb/count.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Print the total row count of the `bluesky` table via GreptimeDB's HTTP SQL API.
#
# Output: the bare count on stdout (extracted from the JSON response).
set -euo pipefail

# --fail: curl exits non-zero on HTTP errors; with pipefail the script then fails loudly.
# The grep pattern anchors on the quoted JSON key so a stray "cnt" substring elsewhere
# in the response cannot match.
curl -s --fail http://localhost:4000/v1/sql \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "sql=select count(*) as cnt from bluesky" \
    -d "format=json" \
    | grep -o '"cnt":[0-9]*' | sed 's/"cnt"://'

greptimedb/data_size.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Print the on-disk SST data size (bytes) of the `bluesky` table, summed over
# its regions via information_schema, using GreptimeDB's HTTP SQL API.
#
# Output: the bare byte count on stdout (extracted from the JSON response).
set -euo pipefail

# --fail: curl exits non-zero on HTTP errors; with pipefail the script then fails loudly.
# The grep pattern anchors on the quoted JSON key to avoid accidental substring matches.
curl -s --fail http://localhost:4000/v1/sql \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "sql=SELECT sum(r.sst_size) as data_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \
    -d "format=json" \
    | grep -o '"data_size":[0-9]*' | sed 's/"data_size"://'

greptimedb/drop_tables.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Stop the GreptimeDB server (if running) and delete all of its on-disk data.
set -euo pipefail

echo "Stopping GreptimeDB"
# `|| true` so a non-running server does not abort the script under `set -e`.
pids=$(pidof greptime || true)
if [[ -n "$pids" ]]; then
    # $pids is intentionally unquoted: pidof may return several space-separated PIDs.
    kill $pids
    # Wait for the process(es) to actually exit so we do not delete the data
    # directory out from under a server that is still flushing to disk.
    while kill -0 $pids 2>/dev/null; do
        sleep 1
    done
fi

echo "Dropping all data"
rm -rf ./greptimedb_data

greptimedb/index_size.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Print the on-disk index size (bytes) of the `bluesky` table, summed over
# its regions via information_schema, using GreptimeDB's HTTP SQL API.
#
# Output: the bare byte count on stdout (extracted from the JSON response).
set -euo pipefail

# --fail: curl exits non-zero on HTTP errors; with pipefail the script then fails loudly.
# The grep pattern anchors on the quoted JSON key to avoid accidental substring matches.
curl -s --fail http://localhost:4000/v1/sql \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -d "sql=SELECT sum(r.index_size) as index_size FROM information_schema.REGION_STATISTICS r LEFT JOIN information_schema.TABLES t on r.table_id = t.table_id WHERE t.table_name = 'bluesky'" \
    -d "format=json" \
    | grep -o '"index_size":[0-9]*' | sed 's/"index_size"://'

greptimedb/install.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Download the GreptimeDB release archive and place the `greptime` binary in
# the current directory. With strict mode a failed download or extraction
# aborts before we move/delete anything.
set -euo pipefail

readonly RELEASE_VERSION=v0.13.0-nightly-20250315
readonly ARCHIVE="greptime-linux-amd64-${RELEASE_VERSION}"

# -N: only re-download when the remote file is newer than the local copy.
wget -N "https://github.com/GreptimeTeam/greptimedb/releases/download/${RELEASE_VERSION}/${ARCHIVE}.tar.gz"
tar xzf "${ARCHIVE}.tar.gz"
mv "${ARCHIVE}/greptime" ./
rm -rf "${ARCHIVE}"

greptimedb/load_data.sh

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
#!/bin/bash
# Bulk-load gzipped NDJSON files into the GreptimeDB `bluesky` table through
# the HTTP log-ingestion API (pipeline `jsonbench`), then flush the table so
# on-disk size measurements are complete.
#
# Usage: ./load_data.sh <DATA_DIRECTORY> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>

# Check if the required arguments are provided
if [[ $# -lt 4 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
MAX_FILES="$2"
SUCCESS_LOG="$3"
ERROR_LOG="$4"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

pushd "$DATA_DIRECTORY" > /dev/null

counter=0
# Glob instead of parsing `ls`; the shell expands *.json.gz in sorted order.
for file in *.json.gz; do
    [[ -e "$file" ]] || break   # no matching files at all
    echo "Processing file: $file"

    # --fail makes curl return non-zero on HTTP errors, so server-side
    # failures actually reach the error log below.
    curl --fail "http://localhost:4000/v1/events/logs?table=bluesky&pipeline_name=jsonbench&ignore_errors=true" \
        -H "Content-Type: application/x-ndjson" \
        -H "Content-Encoding: gzip" \
        --data-binary @"$file"
    # Capture curl's exit status IMMEDIATELY — any command in between (even a
    # bare `echo`) would overwrite $? and make every import look successful.
    first_attempt=$?
    echo ""

    if [[ $first_attempt -eq 0 ]]; then
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file." >> "$SUCCESS_LOG"
    else
        echo "[$(date '+%Y-%m-%d %H:%M:%S')] Failed for $file. Giving up." >> "$ERROR_LOG"
    fi

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done

popd > /dev/null

# Flush memtables to SST files so the reported data/index sizes are complete.
curl -XPOST -H 'Content-Type: application/x-www-form-urlencoded' \
    http://localhost:4000/v1/sql \
    -d "sql=admin flush_table('bluesky')" \
    -d "format=json"

echo -e "\nLoaded $MAX_FILES data files from $DATA_DIRECTORY to GreptimeDB."

greptimedb/main.sh

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
#!/bin/bash
# Interactive driver for the GreptimeDB bluesky benchmark: installs the
# server, then for the chosen dataset size loads data, records size/count/
# query metrics into prefixed output files, and tears everything down.

# Default data directory
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Positional overrides: data dir, success log, error log, output-file prefix.
DATA_DIRECTORY="${1:-$DEFAULT_DATA_DIRECTORY}"
SUCCESS_LOG="${2:-success.log}"
ERROR_LOG="${3:-error.log}"
OUTPUT_PREFIX="${4:-_m6i.8xlarge}"

# Bail out early when the data directory is missing.
if [[ ! -d "$DATA_DIRECTORY" ]]; then
  echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
  exit 1
fi

echo "Select the dataset size to benchmark:"
echo "1) 1m (default)"
echo "2) 10m"
echo "3) 100m"
echo "4) 1000m"
echo "5) all"
read -p "Enter the number corresponding to your choice: " choice

./install.sh

# Run one full benchmark round for <size> input files.
benchmark() {
  local size=$1

  # Refuse to run when the directory holds fewer files than requested.
  local file_count
  file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
  if (( file_count < size )); then
    echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
    exit 1
  fi

  ./start.sh
  ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"

  # Capture each metric both on screen and in a per-size output file.
  local out="${OUTPUT_PREFIX}_bluesky_${size}m"
  ./total_size.sh  | tee "${out}.total_size"
  ./data_size.sh   | tee "${out}.data_size"
  ./index_size.sh  | tee "${out}.index_size"
  ./count.sh       | tee "${out}.count"
  ./run_queries.sh | tee "${out}.results_runtime"

  ./drop_tables.sh
}

case $choice in
  2) benchmark 10 ;;
  3) benchmark 100 ;;
  4) benchmark 1000 ;;
  5)
    benchmark 1
    benchmark 10
    benchmark 100
    benchmark 1000
    ;;
  *) benchmark 1 ;;
esac

greptimedb/pipeline.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# GreptimeDB ingestion pipeline ("jsonbench") for the bluesky NDJSON events.
# Processors reshape each incoming record; transform declares column types
# and indexing for the resulting `bluesky` table.
processors:
  # Parse `time_us` as a microsecond-resolution epoch timestamp.
  - epoch:
      fields:
        - time_us
      resolution: microsecond
  # Extract commit["collection"] into a flat `commit_collection` field;
  # records without a commit object are passed through unchanged.
  - simple_extract:
      fields:
        - commit, commit_collection
      key: "collection"
      ignore_missing: true
  # Extract commit["operation"] into a flat `commit_operation` field.
  - simple_extract:
      fields:
        - commit, commit_operation
      key: "operation"
      ignore_missing: true

transform:
  - fields:
      - did
    type: string
  # Dimension columns used for filtering/grouping in the benchmark queries:
  # stored as tags with an inverted index.
  - fields:
      - kind
      - commit_collection
      - commit_operation
    type: string
    index: inverted
    tag: true
  # Keep the raw commit object as a JSON column; rows where the value cannot
  # be converted are ingested without it rather than rejected.
  - fields:
      - commit
    type: json
    on_failure: ignore
  # `time_us` (microsecond epoch) becomes the table's time index.
  - fields:
      - time_us
    type: epoch, us
    index: timestamp

greptimedb/queries.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
-- Benchmark queries against the `bluesky` table, one statement per line.
-- NOTE(review): keep each query on a single line — the query runner is assumed
-- to execute this file line-wise; verify it also skips `--` comment lines.
SELECT commit_collection AS event, count(1) AS cnt FROM bluesky GROUP BY event ORDER BY cnt DESC;
SELECT commit_collection AS event, count(1) AS cnt, count(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' GROUP BY event ORDER BY cnt DESC;
SELECT commit_collection AS event, date_part('hour', time_us) AS hour_of_day, count(1) AS cnt FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT did AS user_id, min(time_us) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
SELECT did AS user_id, date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;

0 commit comments

Comments
 (0)