From 5e62f6a72da7c37e1f396302ea697d2a6ce26734 Mon Sep 17 00:00:00 2001 From: luofucong Date: Tue, 2 Jun 2026 10:41:30 +0800 Subject: [PATCH] test greptimedb's new json implementation --- greptimedb/compare_query_results.sh | 28 +++++++ greptimedb/create.sh | 11 +++ greptimedb/ddl.sql | 7 ++ greptimedb/load_data.sh | 4 +- greptimedb/main.sh | 17 ++++- greptimedb/pipeline.yaml | 37 ++++------ greptimedb/queries.sql | 10 +-- greptimedb/queries_formatted.sql | 65 +++++++++-------- greptimedb/query_results.sh | 36 +++++++++ .../results/_query_results/expected_10m | 73 +++++++++++++++++++ greptimedb/start.sh | 3 - 11 files changed, 226 insertions(+), 65 deletions(-) create mode 100755 greptimedb/compare_query_results.sh create mode 100755 greptimedb/create.sh create mode 100644 greptimedb/ddl.sql create mode 100755 greptimedb/query_results.sh create mode 100644 greptimedb/results/_query_results/expected_10m diff --git a/greptimedb/compare_query_results.sh b/greptimedb/compare_query_results.sh new file mode 100755 index 0000000..1e5c4e0 --- /dev/null +++ b/greptimedb/compare_query_results.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +DATASET="$1" +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +EXPECTED_FILE="${SCRIPT_DIR}/results/_query_results/expected_${DATASET}" +ACTUAL_FILE="${SCRIPT_DIR}/results/_query_results/actual_${DATASET}" + +if [[ ! -f "$EXPECTED_FILE" ]]; then + echo "Error: expected query results file not found: $EXPECTED_FILE" >&2 + exit 1 +fi + +echo "Comparing query results:" >&2 +echo " expected: $EXPECTED_FILE" >&2 +echo " actual: $ACTUAL_FILE" >&2 + +if ! cmp -s "$EXPECTED_FILE" "$ACTUAL_FILE"; then + echo "Error: actual query results differ from expected query results." >&2 + diff -u "$EXPECTED_FILE" "$ACTUAL_FILE" >&2 || true + exit 1 +fi + +echo "Query results match expected output." >&2 diff --git a/greptimedb/create.sh b/greptimedb/create.sh new file mode 100755 index 0000000..2937923 --- /dev/null +++ b/greptimedb/create.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Creating pipeline \"jsonbench\" ..." +curl -i -X POST 'http://localhost:4000/v1/events/pipelines/jsonbench' -F 'file=@pipeline.yaml' + +echo "Creating table \"bluesky\" ..." +DDL_SQL=$(tr '\n' ' ' < ddl.sql) +curl -i -X POST -H 'Content-Type: application/x-www-form-urlencoded' \ + http://localhost:4000/v1/sql \ + -d "sql=$DDL_SQL" \ + -d "format=json" diff --git a/greptimedb/ddl.sql b/greptimedb/ddl.sql new file mode 100644 index 0000000..04e96ae --- /dev/null +++ b/greptimedb/ddl.sql @@ -0,0 +1,7 @@ +CREATE TABLE bluesky ( + "data" JSON2, + time_us TimestampMicrosecond TIME INDEX +) WITH ( + 'append_mode' = 'true', + 'sst_format' = 'flat' +) diff --git a/greptimedb/load_data.sh b/greptimedb/load_data.sh index aa20914..0490ae6 100755 --- a/greptimedb/load_data.sh +++ b/greptimedb/load_data.sh @@ -21,8 +21,8 @@ counter=0 for file in $(ls *.json.gz | head -n $MAX_FILES); do echo "Processing file: $file" - curl "http://localhost:4000/v1/events/logs?table=bluesky&pipeline_name=jsonbench&ignore_errors=true" \ - -H "Content-Type: application/x-ndjson" \ + curl -i "http://localhost:4000/v1/ingest?table=bluesky&pipeline_name=jsonbench&skip_error=true" \ + -H "Content-Type: text/plain" \ -H "Content-Encoding: gzip" \ --data-binary @$file echo "" diff --git a/greptimedb/main.sh b/greptimedb/main.sh index 200849a..0228e08 100755 --- a/greptimedb/main.sh +++ b/greptimedb/main.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -o pipefail + DEFAULT_CHOICE=ask DEFAULT_DATA_DIRECTORY=~/data/bluesky @@ -16,6 +18,14 @@ ERROR_LOG="${4:-error.log}" # Define prefix for output files OUTPUT_PREFIX="${5:-_m6i.8xlarge}" +# Run install.sh unless explicitly disabled. +INSTALL="${6:-true}" + +if [[ "$INSTALL" != "true" && "$INSTALL" != "false" ]]; then + echo "Error: install must be either 'true' or 'false'." + exit 1 +fi + # Check if the directory exists if [[ ! -d "$DATA_DIRECTORY" ]]; then echo "Error: Data directory '$DATA_DIRECTORY' does not exist." @@ -32,7 +42,9 @@ if [ "$CHOICE" = "ask" ]; then read -p "Enter the number corresponding to your choice: " CHOICE fi -./install.sh +if [[ "$INSTALL" == "true" ]]; then + ./install.sh +fi benchmark() { local size=$1 @@ -44,12 +56,15 @@ benchmark() { fi ./start.sh + ./create.sh ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG" ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size" ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size" ./index_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.index_size" ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count" ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" + ./query_results.sh "${size}m" | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results" + ./compare_query_results.sh "${size}m" || return 1 ./drop_tables.sh } diff --git a/greptimedb/pipeline.yaml b/greptimedb/pipeline.yaml index 0ce1655..7e375f2 100644 --- a/greptimedb/pipeline.yaml +++ b/greptimedb/pipeline.yaml @@ -1,34 +1,25 @@ +version: 2 + processors: + - json_parse: + fields: + - message, data + ignore_missing: true + - simple_extract: + fields: + - data, time_us + key: "time_us" + ignore_missing: false - epoch: fields: - time_us resolution: microsecond - - simple_extract: - fields: - - commit, commit_collection - key: "collection" - ignore_missing: true - - simple_extract: + - select: fields: - - commit, commit_operation - key: "operation" - ignore_missing: true + - time_us + - data transform: - - fields: - - did - type: string - - fields: - - kind - - commit_collection - - commit_operation - type: string - index: inverted - tag: true - - fields: - - commit - type: json - on_failure: ignore - fields: - time_us type: epoch, us diff --git a/greptimedb/queries.sql b/greptimedb/queries.sql index bfecb84..adadf5a 100644 --- a/greptimedb/queries.sql +++ b/greptimedb/queries.sql @@ -1,5 +1,5 @@ -SELECT commit_collection AS event, count(1) AS cnt FROM bluesky GROUP BY event ORDER BY cnt DESC; -SELECT commit_collection AS event, count(1) AS cnt, count(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' GROUP BY event ORDER BY cnt DESC; -SELECT commit_collection AS event, date_part('hour', time_us) AS hour_of_day, count(1) AS cnt FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; -SELECT did AS user_id, min(time_us) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3; -SELECT did AS user_id, date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND commit_operation = 'create' AND commit_collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3; +SELECT data.commit.collection AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC, event ASC; +SELECT data.commit.collection AS event, count() AS count, count(DISTINCT data.did) AS users FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event ORDER BY count DESC, event ASC; +SELECT data.commit.collection AS event, date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day, count() AS count FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event; +SELECT data.did::String as user_id, min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC, user_id DESC LIMIT 3; +SELECT data.did::String as user_id, date_part('epoch', max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) - min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))) AS activity_span FROM bluesky WHERE data.kind = 'commit' AND data.commit.operation = 'create' AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC, user_id DESC LIMIT 3; diff --git a/greptimedb/queries_formatted.sql b/greptimedb/queries_formatted.sql index 7db5eed..6a15b45 100644 --- a/greptimedb/queries_formatted.sql +++ b/greptimedb/queries_formatted.sql @@ -1,59 +1,62 @@ ------------------------------------------------------------------------------------------------------------------------ -- Q1 - Top event types ------------------------------------------------------------------------------------------------------------------------ -SELECT commit_collection AS event, - count(1) AS cnt +SELECT data.commit.collection AS event, + count() AS count FROM bluesky GROUP BY event -ORDER BY cnt DESC; +ORDER BY count DESC, event ASC; ------------------------------------------------------------------------------------------------------------------------ -- Q2 - Top event types together with unique users per event type ------------------------------------------------------------------------------------------------------------------------ -SELECT commit_collection AS event, - count(1) AS cnt, - count(DISTINCT did) AS users +SELECT data.commit.collection AS event, + count() AS count, + count(DISTINCT data.did) AS users FROM bluesky -WHERE kind = 'commit' - AND commit_operation = 'create' +WHERE data.kind = 'commit' AND data.commit.operation = 'create' GROUP BY event -ORDER BY cnt DESC; +ORDER BY count DESC, event ASC; ------------------------------------------------------------------------------------------------------------------------ -- Q3 - When do people use BlueSky ------------------------------------------------------------------------------------------------------------------------ -SELECT commit_collection AS event, - date_part('hour', time_us) AS hour_of_day, - count(1) AS cnt +SELECT data.did::String as user_id, + min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts FROM bluesky -WHERE kind = 'commit' - AND commit_operation = 'create' - AND commit_collection IN('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') -GROUP BY event, - hour_of_day -ORDER BY hour_of_day, - event; +WHERE data.kind = 'commit' + AND data.commit.operation = 'create' + AND data.commit.collection = 'app.bsky.feed.post' +GROUP BY user_id +ORDER BY first_post_ts ASC, user_id DESC +LIMIT 3; ------------------------------------------------------------------------------------------------------------------------ -- Q4 - top 3 post veterans ------------------------------------------------------------------------------------------------------------------------ -SELECT did AS user_id, - min(time_us) AS first_post_ts +SELECT data.did::String as user_id, + min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts FROM bluesky -WHERE kind = 'commit' - AND commit_operation = 'create' - AND commit_collection = 'app.bsky.feed.post' +WHERE data.kind = 'commit' + AND data.commit.operation = 'create' + AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id -ORDER BY first_post_ts ASC LIMIT 3; +ORDER BY first_post_ts ASC, user_id DESC +LIMIT 3; ------------------------------------------------------------------------------------------------------------------------ -- Q5 - top 3 users with longest activity ------------------------------------------------------------------------------------------------------------------------ -SELECT did AS user_id, - date_part('millisecond',(max(time_us) - min(time_us))) AS activity_span +SELECT data.did::String as user_id, + date_part( + 'epoch', + max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) - + min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) + ) AS activity_span FROM bluesky -WHERE kind = 'commit' - AND commit_operation = 'create' - AND commit_collection = 'app.bsky.feed.post' +WHERE data.kind = 'commit' + AND data.commit.operation = 'create' + AND data.commit.collection = 'app.bsky.feed.post' GROUP BY user_id -ORDER BY activity_span DESC LIMIT 3; +ORDER BY activity_span DESC, user_id DESC +LIMIT 3; diff --git a/greptimedb/query_results.sh b/greptimedb/query_results.sh new file mode 100755 index 0000000..0a57691 --- /dev/null +++ b/greptimedb/query_results.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +DATASET="$1" +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +OUTPUT_FILE="${SCRIPT_DIR}/results/_query_results/actual_${DATASET}" + +mkdir -p "$(dirname "$OUTPUT_FILE")" + +QUERY_NUM=1 + +set -f +{ +while IFS= read -r query; do + if [[ "$QUERY_NUM" != "1" ]]; then + echo + fi + + echo "------------------------------------------------------------------------------------------------------------------------" + echo "Result for query Q$QUERY_NUM:" + echo + + curl -s --fail "http://localhost:4000/v1/sql?db=public&format=table" \ + --data-urlencode "sql=$query" + exit_code=$? + if [[ "$exit_code" != "0" ]]; then + echo "Error: query Q$QUERY_NUM failed with exit code $exit_code" + fi + + QUERY_NUM=$((QUERY_NUM + 1)) +done < "${SCRIPT_DIR}/queries.sql" +} | tee "$OUTPUT_FILE" diff --git a/greptimedb/results/_query_results/expected_10m b/greptimedb/results/_query_results/expected_10m new file mode 100644 index 0000000..c2ee672 --- /dev/null +++ b/greptimedb/results/_query_results/expected_10m @@ -0,0 +1,73 @@ +------------------------------------------------------------------------------------------------------------------------ +Result for query Q1: + +┌─event────────────────────────┬─count───┐ +│ "app.bsky.feed.like" │ 4673017 │ +│ "app.bsky.graph.follow" │ 3387006 │ +│ "app.bsky.feed.post" │ 930269 │ +│ "app.bsky.feed.repost" │ 597193 │ +│ "app.bsky.graph.block" │ 142218 │ +│ "app.bsky.actor.profile" │ 118646 │ +│ "app.bsky.graph.listitem" │ 75984 │ +│ null │ 53826 │ +│ "app.bsky.graph.listblock" │ 10467 │ +│ "app.bsky.graph.starterpack" │ 3619 │ +│ "app.bsky.graph.list" │ 3414 │ +│ "app.bsky.feed.threadgate" │ 2554 │ +│ "app.bsky.feed.postgate" │ 1076 │ +│ "app.bsky.feed.generator" │ 686 │ +│ "app.bsky.labeler.service" │ 13 │ +│ "app.kollective.catalog" │ 6 │ +└──────────────────────────────┴─────────┘ + +------------------------------------------------------------------------------------------------------------------------ +Result for query Q2: + +┌─event────────────────────────┬─count───┬─users──┐ +│ "app.bsky.feed.like" │ 4631335 │ 694849 │ +│ "app.bsky.graph.follow" │ 3220534 │ 463192 │ +│ "app.bsky.feed.post" │ 895614 │ 327190 │ +│ "app.bsky.feed.repost" │ 582487 │ 188549 │ +│ "app.bsky.graph.block" │ 139918 │ 45808 │ +│ "app.bsky.graph.listitem" │ 72005 │ 7097 │ +│ "app.bsky.actor.profile" │ 57530 │ 54333 │ +│ "app.bsky.graph.listblock" │ 9994 │ 5104 │ +│ "app.bsky.graph.list" │ 2495 │ 1954 │ +│ "app.bsky.feed.threadgate" │ 2295 │ 1608 │ +│ "app.bsky.feed.postgate" │ 1052 │ 646 │ +│ "app.bsky.graph.starterpack" │ 905 │ 840 │ +│ "app.bsky.feed.generator" │ 150 │ 74 │ +└──────────────────────────────┴─────────┴────────┘ + +------------------------------------------------------------------------------------------------------------------------ +Result for query Q3: + +┌─event──────────────────┬─hour_of_day─┬─count───┐ +│ "app.bsky.feed.like" │ 16 │ 1024657 │ +│ "app.bsky.feed.post" │ 16 │ 201824 │ +│ "app.bsky.feed.repost" │ 16 │ 133098 │ +│ "app.bsky.feed.like" │ 17 │ 2170794 │ +│ "app.bsky.feed.post" │ 17 │ 423474 │ +│ "app.bsky.feed.repost" │ 17 │ 270421 │ +│ "app.bsky.feed.like" │ 18 │ 1435884 │ +│ "app.bsky.feed.post" │ 18 │ 270316 │ +│ "app.bsky.feed.repost" │ 18 │ 178968 │ +└────────────────────────┴─────────────┴─────────┘ + +------------------------------------------------------------------------------------------------------------------------ +Result for query Q4: + +┌─user_id────────────────────────────┬─first_post_ts────┐ +│ "did:plc:yj3sjq3blzpynh27cumnp5ks" │ 1732206349000167 │ +│ "did:plc:l5o3qjrmfztir54cpwlv2eme" │ 1732206349001905 │ +│ "did:plc:s4bwqchfzm6gjqfeb6mexgbu" │ 1732206349003907 │ +└────────────────────────────────────┴──────────────────┘ + +------------------------------------------------------------------------------------------------------------------------ +Result for query Q5: + +┌─user_id────────────────────────────┬─activity_span─┐ +│ "did:plc:doxhhgtxqiv47tmcovpbcqai" │ 7902.0 │ +│ "did:plc:oqmcuym3iqvl5cpaxjw4idom" │ 7901.0 │ +│ "did:plc:k3aqnyog5k4xk4p6p2xxeirl" │ 7900.0 │ +└────────────────────────────────────┴───────────────┘ diff --git a/greptimedb/start.sh b/greptimedb/start.sh index b6020ca..4b4e89c 100755 --- a/greptimedb/start.sh +++ b/greptimedb/start.sh @@ -21,6 +21,3 @@ do done echo "Started GreptimeDB." -# init pipeline -curl -s -XPOST 'http://localhost:4000/v1/events/pipelines/jsonbench' -F 'file=@pipeline.yaml' -echo -e "\nPipeline initialized."