Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions greptimedb/compare_query_results.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

if [[ $# -lt 1 ]]; then
echo "Usage: $0 <DATASET>" >&2
exit 1
fi

DATASET="$1"
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
EXPECTED_FILE="${SCRIPT_DIR}/results/_query_results/expected_${DATASET}"
ACTUAL_FILE="${SCRIPT_DIR}/results/_query_results/actual_${DATASET}"

if [[ ! -f "$EXPECTED_FILE" ]]; then
echo "Error: expected query results file not found: $EXPECTED_FILE" >&2
exit 1
fi

echo "Comparing query results:" >&2
echo " expected: $EXPECTED_FILE" >&2
echo " actual: $ACTUAL_FILE" >&2

if ! cmp -s "$EXPECTED_FILE" "$ACTUAL_FILE"; then
echo "Error: actual query results differ from expected query results." >&2
diff -u "$EXPECTED_FILE" "$ACTUAL_FILE" >&2 || true
exit 1
fi

echo "Query results match expected output." >&2
11 changes: 11 additions & 0 deletions greptimedb/create.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

echo "Creating pipeline \"jsonbench\" ..."
curl -i -X POST 'http://localhost:4000/v1/events/pipelines/jsonbench' -F 'file=@pipeline.yaml'

echo "Creating table \"bluesky\" ..."
DDL_SQL=$(tr '\n' ' ' < ddl.sql)
curl -i -X POST -H 'Content-Type: application/x-www-form-urlencoded' \
http://localhost:4000/v1/sql \
-d "sql=$DDL_SQL" \
-d "format=json"
7 changes: 7 additions & 0 deletions greptimedb/ddl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
CREATE TABLE bluesky (
"data" JSON2,
time_us TimestampMicrosecond TIME INDEX
) WITH (
'append_mode' = 'true',
'sst_format' = 'flat'
)
4 changes: 2 additions & 2 deletions greptimedb/load_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ counter=0
for file in $(ls *.json.gz | head -n $MAX_FILES); do
echo "Processing file: $file"

curl "http://localhost:4000/v1/events/logs?table=bluesky&pipeline_name=jsonbench&ignore_errors=true" \
-H "Content-Type: application/x-ndjson" \
curl -i "http://localhost:4000/v1/ingest?table=bluesky&pipeline_name=jsonbench&skip_error=true" \
-H "Content-Type: text/plain" \
-H "Content-Encoding: gzip" \
--data-binary @$file
echo ""
Expand Down
17 changes: 16 additions & 1 deletion greptimedb/main.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

set -o pipefail

DEFAULT_CHOICE=ask
DEFAULT_DATA_DIRECTORY=~/data/bluesky

Expand All @@ -16,6 +18,14 @@ ERROR_LOG="${4:-error.log}"
# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Run install.sh unless explicitly disabled.
INSTALL="${6:-true}"

if [[ "$INSTALL" != "true" && "$INSTALL" != "false" ]]; then
echo "Error: install must be either 'true' or 'false'."
exit 1
fi

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
Expand All @@ -32,7 +42,9 @@ if [ "$CHOICE" = "ask" ]; then
read -p "Enter the number corresponding to your choice: " CHOICE
fi

./install.sh
if [[ "$INSTALL" == "true" ]]; then
./install.sh
fi

benchmark() {
local size=$1
Expand All @@ -44,12 +56,15 @@ benchmark() {
fi

./start.sh
./create.sh
./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size"
./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size"
./index_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size"
./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count"
./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime"
./query_results.sh "${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results"
./compare_query_results.sh "${size}m" || return 1
./drop_tables.sh
}

Expand Down
37 changes: 14 additions & 23 deletions greptimedb/pipeline.yaml
Original file line number Diff line number Diff line change
@@ -1,34 +1,25 @@
version: 2

processors:
- json_parse:
fields:
- message, data
ignore_missing: true
- simple_extract:
fields:
- data, time_us
key: "time_us"
ignore_missing: false
- epoch:
fields:
- time_us
resolution: microsecond
- simple_extract:
fields:
- commit, commit_collection
key: "collection"
ignore_missing: true
- simple_extract:
- select:
fields:
- commit, commit_operation
key: "operation"
ignore_missing: true
- time_us
- data

transform:
- fields:
- did
type: string
- fields:
- kind
- commit_collection
- commit_operation
type: string
index: inverted
tag: true
- fields:
- commit
type: json
on_failure: ignore
- fields:
- time_us
type: epoch, us
Expand Down
10 changes: 5 additions & 5 deletions greptimedb/queries.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SELECT commit_collection AS event, count(1) AS cnt FROM bluesky GROUP BY event ORDER BY cnt DESC;
SELECT commit_collection AS event, count(1) AS cnt, count(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' GROUP BY event ORDER BY cnt DESC;
SELECT commit_collection AS event, date_part('hour', time_us) AS hour_of_day, count(1) AS cnt FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT did AS user_id, min(time_us) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
SELECT did AS user_id, date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
SELECT data.commit.collection AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC, event ASC;
SELECT data.commit.collection AS event, count() AS count, count(DISTINCT data.did) AS users FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event ORDER BY count DESC, event ASC;
SELECT data.commit.collection AS event, date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day, count() AS count FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT data.did::String as user_id, min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC, user_id DESC LIMIT 3;
SELECT data.did::String as user_id, date_part('epoch', max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) - min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))) AS activity_span FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC, user_id DESC LIMIT 3;
65 changes: 34 additions & 31 deletions greptimedb/queries_formatted.sql
Original file line number Diff line number Diff line change
@@ -1,59 +1,62 @@
------------------------------------------------------------------------------------------------------------------------
-- Q1 - Top event types
------------------------------------------------------------------------------------------------------------------------
SELECT commit_collection AS event,
count(1) AS cnt
SELECT data.commit.collection AS event,
count() AS count
FROM bluesky
GROUP BY event
ORDER BY cnt DESC;
ORDER BY count DESC, event ASC;

------------------------------------------------------------------------------------------------------------------------
-- Q2 - Top event types together with unique users per event type
------------------------------------------------------------------------------------------------------------------------
SELECT commit_collection AS event,
count(1) AS cnt,
count(DISTINCT did) AS users
SELECT data.commit.collection AS event,
count() AS count,
count(DISTINCT data.did) AS users
FROM bluesky
WHERE kind = 'commit'
AND commit_operation = 'create'
WHERE data.kind = 'commit' AND data.commit.operation = 'create'
GROUP BY event
ORDER BY cnt DESC;
ORDER BY count DESC, event ASC;

------------------------------------------------------------------------------------------------------------------------
-- Q3 - When do people use BlueSky
------------------------------------------------------------------------------------------------------------------------
SELECT commit_collection AS event,
date_part('hour', time_us) AS hour_of_day,
count(1) AS cnt
SELECT data.did::String as user_id,
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
FROM bluesky
WHERE kind = 'commit'
AND commit_operation = 'create'
AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
GROUP BY event,
hour_of_day
ORDER BY hour_of_day,
event;
WHERE data.kind = 'commit'
AND data.commit.operation = 'create'
AND data.commit.collection = 'app.bsky.feed.post'
GROUP BY user_id
ORDER BY first_post_ts ASC, user_id DESC
LIMIT 3;

------------------------------------------------------------------------------------------------------------------------
-- Q4 - top 3 post veterans
------------------------------------------------------------------------------------------------------------------------
SELECT did AS user_id,
min(time_us) AS first_post_ts
SELECT data.did::String as user_id,
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
FROM bluesky
WHERE kind = 'commit'
AND commit_operation = 'create'
AND commit_collection = 'app.bsky.feed.post'
WHERE data.kind = 'commit'
AND data.commit.operation = 'create'
AND data.commit.collection = 'app.bsky.feed.post'
GROUP BY user_id
ORDER BY first_post_ts ASC LIMIT 3;
ORDER BY first_post_ts ASC, user_id DESC
LIMIT 3;

------------------------------------------------------------------------------------------------------------------------
-- Q5 - top 3 users with longest activity
------------------------------------------------------------------------------------------------------------------------
SELECT did AS user_id,
date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span
SELECT data.did::String as user_id,
date_part(
'epoch',
max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
) AS activity_span
FROM bluesky
WHERE kind = 'commit'
AND commit_operation = 'create'
AND commit_collection = 'app.bsky.feed.post'
WHERE data.kind = 'commit'
AND data.commit.operation = 'create'
AND data.commit.collection = 'app.bsky.feed.post'
GROUP BY user_id
ORDER BY activity_span DESC LIMIT 3;
ORDER BY activity_span DESC, user_id DESC
LIMIT 3;
36 changes: 36 additions & 0 deletions greptimedb/query_results.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

if [[ $# -lt 1 ]]; then
echo "Usage: $0 <DATASET>" >&2
exit 1
fi

DATASET="$1"
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
OUTPUT_FILE="${SCRIPT_DIR}/results/_query_results/actual_${DATASET}"

mkdir -p "$(dirname "$OUTPUT_FILE")"

QUERY_NUM=1

set -f
{
while IFS= read -r query; do
if [[ "$QUERY_NUM" != "1" ]]; then
echo
fi

echo "------------------------------------------------------------------------------------------------------------------------"
echo "Result for query Q$QUERY_NUM:"
echo

curl -s --fail "http://localhost:4000/v1/sql?db=public&format=table" \
--data-urlencode "sql=$query"
exit_code=$?
if [[ "$exit_code" != "0" ]]; then
echo "Error: query Q$QUERY_NUM failed with exit code $exit_code"
fi

QUERY_NUM=$((QUERY_NUM + 1))
done < "${SCRIPT_DIR}/queries.sql"
} | tee "$OUTPUT_FILE"
73 changes: 73 additions & 0 deletions greptimedb/results/_query_results/expected_10m
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
------------------------------------------------------------------------------------------------------------------------
Result for query Q1:

┌─event────────────────────────┬─count───┐
│ "app.bsky.feed.like" │ 4673017 │
│ "app.bsky.graph.follow" │ 3387006 │
│ "app.bsky.feed.post" │ 930269 │
│ "app.bsky.feed.repost" │ 597193 │
│ "app.bsky.graph.block" │ 142218 │
│ "app.bsky.actor.profile" │ 118646 │
│ "app.bsky.graph.listitem" │ 75984 │
│ null │ 53826 │
│ "app.bsky.graph.listblock" │ 10467 │
│ "app.bsky.graph.starterpack" │ 3619 │
│ "app.bsky.graph.list" │ 3414 │
│ "app.bsky.feed.threadgate" │ 2554 │
│ "app.bsky.feed.postgate" │ 1076 │
│ "app.bsky.feed.generator" │ 686 │
│ "app.bsky.labeler.service" │ 13 │
│ "app.kollective.catalog" │ 6 │
└──────────────────────────────┴─────────┘

------------------------------------------------------------------------------------------------------------------------
Result for query Q2:

┌─event────────────────────────┬─count───┬─users──┐
│ "app.bsky.feed.like" │ 4631335 │ 694849 │
│ "app.bsky.graph.follow" │ 3220534 │ 463192 │
│ "app.bsky.feed.post" │ 895614 │ 327190 │
│ "app.bsky.feed.repost" │ 582487 │ 188549 │
│ "app.bsky.graph.block" │ 139918 │ 45808 │
│ "app.bsky.graph.listitem" │ 72005 │ 7097 │
│ "app.bsky.actor.profile" │ 57530 │ 54333 │
│ "app.bsky.graph.listblock" │ 9994 │ 5104 │
│ "app.bsky.graph.list" │ 2495 │ 1954 │
│ "app.bsky.feed.threadgate" │ 2295 │ 1608 │
│ "app.bsky.feed.postgate" │ 1052 │ 646 │
│ "app.bsky.graph.starterpack" │ 905 │ 840 │
│ "app.bsky.feed.generator" │ 150 │ 74 │
└──────────────────────────────┴─────────┴────────┘

------------------------------------------------------------------------------------------------------------------------
Result for query Q3:

┌─event──────────────────┬─hour_of_day─┬─count───┐
│ "app.bsky.feed.like" │ 16 │ 1024657 │
│ "app.bsky.feed.post" │ 16 │ 201824 │
│ "app.bsky.feed.repost" │ 16 │ 133098 │
│ "app.bsky.feed.like" │ 17 │ 2170794 │
│ "app.bsky.feed.post" │ 17 │ 423474 │
│ "app.bsky.feed.repost" │ 17 │ 270421 │
│ "app.bsky.feed.like" │ 18 │ 1435884 │
│ "app.bsky.feed.post" │ 18 │ 270316 │
│ "app.bsky.feed.repost" │ 18 │ 178968 │
└────────────────────────┴─────────────┴─────────┘

------------------------------------------------------------------------------------------------------------------------
Result for query Q4:

┌─user_id────────────────────────────┬─first_post_ts────┐
│ "did:plc:yj3sjq3blzpynh27cumnp5ks" │ 1732206349000167 │
│ "did:plc:l5o3qjrmfztir54cpwlv2eme" │ 1732206349001905 │
│ "did:plc:s4bwqchfzm6gjqfeb6mexgbu" │ 1732206349003907 │
└────────────────────────────────────┴──────────────────┘

------------------------------------------------------------------------------------------------------------------------
Result for query Q5:

┌─user_id────────────────────────────┬─activity_span─┐
│ "did:plc:doxhhgtxqiv47tmcovpbcqai" │ 7902.0 │
│ "did:plc:oqmcuym3iqvl5cpaxjw4idom" │ 7901.0 │
│ "did:plc:k3aqnyog5k4xk4p6p2xxeirl" │ 7900.0 │
└────────────────────────────────────┴───────────────┘
3 changes: 0 additions & 3 deletions greptimedb/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,3 @@ do
done
echo "Started GreptimeDB."

# init pipeline
curl -s -XPOST 'http://localhost:4000/v1/events/pipelines/jsonbench' -F 'file=@pipeline.yaml'
echo -e "\nPipeline initialized."