apache · kokila-19 · May 11, 2026 · Jun 7, 2026
diff --git a/hplsql/pom.xml b/hplsql/pom.xml
@@ -106,6 +106,10 @@
       <artifactId>junit</artifactId>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.jetbrains</groupId>
+      <artifactId>annotations</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.junit.jupiter</groupId>
       <artifactId>junit-jupiter-engine</artifactId>

diff --git a/pom.xml b/pom.xml
@@ -164,6 +164,7 @@
     <jettison.version>1.5.4</jettison.version>
     <jetty.version>9.4.57.v20241219</jetty.version>
     <jersey.version>1.19.4</jersey.version>
+    <jetbrains-annotations.version>26.0.2</jetbrains-annotations.version>
     <!-- HIVE-28992: only upgrade to newer than 3.25.0 if you tested the prompt -->
     <jline.version>3.25.0</jline.version>
     <jms.version>2.0.2</jms.version>
@@ -187,7 +188,7 @@
     <postgres.version>42.7.3</postgres.version>
     <oracle.version>21.3.0.0</oracle.version>
     <opencsv.version>5.9</opencsv.version>
-    <orc.version>2.1.2</orc.version>
+    <orc.version>2.3.0</orc.version>
     <otel.version>1.60.1</otel.version>
     <mockito-core.version>5.17.0</mockito-core.version>
     <mockito-inline.version>5.2.0</mockito-inline.version>
@@ -473,6 +474,11 @@
         <artifactId>javolution</artifactId>
         <version>${javolution.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.jetbrains</groupId>
+        <artifactId>annotations</artifactId>
+        <version>${jetbrains-annotations.version}</version>
+      </dependency>
       <dependency>
         <groupId>org.jline</groupId>
         <artifactId>jline</artifactId>

diff --git a/ql/pom.xml b/ql/pom.xml
@@ -424,6 +424,10 @@
       <groupId>org.codehaus.groovy</groupId>
       <artifactId>groovy-all</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.jetbrains</groupId>
+      <artifactId>annotations</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.jodd</groupId>
       <artifactId>jodd-util</artifactId>

diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFixAcidKeyIndex.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFixAcidKeyIndex.java
@@ -32,6 +32,7 @@
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
+import org.apache.orc.OrcConf;
 import org.apache.orc.OrcFile.WriterContext;
 import org.apache.orc.impl.OrcAcidUtils;
 import org.junit.Before;
@@ -86,6 +87,8 @@ void createTestAcidFile(Path path,
     FileSystem fs = path.getFileSystem(conf);
     fs.delete(path, true);
     TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
+    OrcConf.STRIPE_SIZE_CHECKRATIO.setDouble(conf, 0);
+    OrcConf.STRIPE_ROW_COUNT.setLong(conf, 5000);
     Writer writer = OrcFile.createWriter(path,
         OrcFile.writerOptions(conf)
             .fileSystem(fs)

diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -104,6 +104,7 @@
 import org.apache.hadoop.hive.serde2.objectinspector.StructField;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
 import org.apache.hadoop.hive.shims.CombineHiveKey;
@@ -2624,7 +2625,7 @@ public void testCombinationInputFormatWithAcid() throws Exception {
     assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000",
         split.getPath().toString());
     assertEquals(0, split.getStart());
-    assertEquals(784, split.getLength());
+    assertEquals(791, split.getLength());
     split = (HiveInputFormat.HiveInputSplit) splits[1];
     assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
         split.inputFormatClassName());
@@ -4172,44 +4173,64 @@ public void testAcidReadPastLastStripeOffset() throws Exception {
       // d
       ((BytesColumnVector) scv.fields[2]).setVal(r,
           Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
-      indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION,
-          1, (int)(((LongColumnVector) batch.cols[2]).vector[0]), r);
     }
 
-    // Minimum 5000 rows per stripe.
+    // Match OrcRecordUpdater: addKey before rows are written. If addKey runs after addRowBatch,
+    // a mid-batch flush can snapshot the index with the previous statement id while the stripe
+    // already contains rows from the current batch.
     for (int idx = 0; idx < 8; ++idx) {
-      writer.addRowBatch(batch);
-      // bucket
-      batch.cols[2].isRepeating = true;
-      ((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(new AcidOutputFormat
-          .Options(conf).bucket(0).statementId(idx + 1));
-      for(long row_id : ((LongColumnVector) batch.cols[3]).vector) {
-        indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION,
-            1, (int)(((LongColumnVector) batch.cols[2]).vector[0]), row_id);
+      final int bucketForBatch = (int) ((LongColumnVector) batch.cols[2]).vector[0];
+      for (long row_id : ((LongColumnVector) batch.cols[3]).vector) {
+        indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION, 1, bucketForBatch, row_id);
       }
+      writer.addRowBatch(batch);
+      ((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(
+          new AcidOutputFormat.Options(conf).bucket(0).statementId(idx + 1));
     }
     writer.close();
     long fileLength = fs.getFileStatus(testFilePath).getLen();
 
     // Find the last stripe.
     List<StripeInformation> stripes;
     RecordIdentifier[] keyIndex;
-    try (Reader orcReader = OrcFile.createReader(fs, testFilePath)) {
+    try (Reader orcReader = OrcFile.createReader(fs, testFilePath);
+        RecordReader rr = orcReader.rows()) {
       stripes = orcReader.getStripes();
       keyIndex = OrcRecordUpdater.parseKeyIndex(orcReader);
+
+      StructObjectInspector soi = (StructObjectInspector) orcReader.getObjectInspector();
+      List<? extends StructField> structFields = soi.getAllStructFieldRefs();
+      StructField transactionField = structFields.get(1);
+      LongObjectInspector transactionOI =
+          (LongObjectInspector) transactionField.getFieldObjectInspector();
+      StructField bucketField = structFields.get(2);
+      IntObjectInspector bucketOI =
+          (IntObjectInspector) bucketField.getFieldObjectInspector();
+      StructField rowIdField = structFields.get(3);
+      LongObjectInspector rowIdOI =
+          (LongObjectInspector) rowIdField.getFieldObjectInspector();
+
+      Assert.assertEquals("Index length doesn't match number of stripes",
+          stripes.size(), keyIndex.length);
+      long rowsProcessed = 0;
+      for (int i = 0; i < stripes.size(); i++) {
+        rowsProcessed += stripes.get(i).getNumberOfRows();
+        rr.seekToRow(rowsProcessed - 1);
+        OrcStruct row = (OrcStruct) rr.next(null);
+        long lastTransaction =
+            transactionOI.get(soi.getStructFieldData(row, transactionField));
+        int lastBucket = bucketOI.get(soi.getStructFieldData(row, bucketField));
+        long lastRowId = rowIdOI.get(soi.getStructFieldData(row, rowIdField));
+        RecordIdentifier expected =
+            new RecordIdentifier(lastTransaction, lastBucket, lastRowId);
+        Assert.assertEquals("Index entry mismatch for stripe " + i, expected, keyIndex[i]);
+      }
     }
 
     StripeInformation lastStripe = stripes.get(stripes.size() - 1);
     long lastStripeOffset = lastStripe.getOffset();
     long lastStripeLength = lastStripe.getLength();
 
-    Assert.assertEquals("Index length doesn't match number of stripes",
-        stripes.size(), keyIndex.length);
-    Assert.assertEquals("1st Index entry mismatch",
-        new RecordIdentifier(1, 536870916, 999), keyIndex[0]);
-    Assert.assertEquals("2nd Index entry mismatch",
-        new RecordIdentifier(1, 536870920, 999), keyIndex[1]);
-
     // test with same schema with include
     conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
     conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");

diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFileStripeMergeRecordReader.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFileStripeMergeRecordReader.java
@@ -28,6 +28,8 @@
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.FileSplit;
+import org.apache.orc.CompressionKind;
+import org.apache.orc.OrcConf;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -37,7 +39,9 @@
 
 public class TestOrcFileStripeMergeRecordReader {
 
-  private static final int TEST_STRIPE_SIZE = 5000;
+  private static final int MAX_ROWS_PER_STRIPE = 5000;
+
+  private static final long STRIPE_SIZE_BYTES = 128;
 
   private OrcFileKeyWrapper key;
   private OrcFileValueWrapper value;
@@ -51,11 +55,16 @@ public class TestOrcFileStripeMergeRecordReader {
   @Before
   public void setup() throws IOException {
     conf = new Configuration();
+    // ORC ≥2.x: orc.stripe.size.check.ratio triggers flushes when buffered tree bytes exceed ratio × orc.stripe.size
+    // Setting it to 0 disables it.
+    OrcConf.STRIPE_SIZE_CHECKRATIO.setDouble(conf, 0);
+    // Maximum number of rows a Stripe can hold in ORC file.
+    OrcConf.STRIPE_ROW_COUNT.setLong(conf, MAX_ROWS_PER_STRIPE);
     fs = FileSystem.getLocal(conf);
     key = new OrcFileKeyWrapper();
     value = new OrcFileValueWrapper();
     tmpPath  = prepareTmpPath();
-    createOrcFile(TEST_STRIPE_SIZE, TEST_STRIPE_SIZE + 1);
+    createOrcFile(MAX_ROWS_PER_STRIPE + 1);
   }
 
   @After
@@ -86,7 +95,7 @@ public void testSplitStartsWithZeroOffset() throws IOException {
     // both stripes will be processed, first stripe has 5000 rows and second stripe has 1 row
     reader.next(key, value);
     Assert.assertEquals("InputPath", tmpPath, key.getInputPath());
-    Assert.assertEquals("NumberOfValues", TEST_STRIPE_SIZE,
+    Assert.assertEquals("NumberOfValues", MAX_ROWS_PER_STRIPE,
         value.getStripeStatistics().getColStats(0).getNumberOfValues());
     reader.next(key, value);
     Assert.assertEquals("InputPath", tmpPath, key.getInputPath());
@@ -96,7 +105,7 @@ public void testSplitStartsWithZeroOffset() throws IOException {
     reader.close();
   }
 
-  private void createOrcFile(int stripSize, int numberOfRows) throws IOException {
+  private void createOrcFile(int numberOfRows) throws IOException {
     ObjectInspector inspector;
     synchronized (TestOrcFileStripeMergeRecordReader.class) {
       inspector = ObjectInspectorFactory.getReflectionObjectInspector
@@ -106,7 +115,7 @@ private void createOrcFile(int stripSize, int numberOfRows) throws IOException {
     Writer writer = OrcFile.createWriter(tmpPath,
         OrcFile.writerOptions(conf)
             .inspector(inspector)
-            .stripeSize(stripSize)
+            .stripeSize(STRIPE_SIZE_BYTES)
             .compress(CompressionKind.ZLIB)
             .bufferSize(5000)
             .rowIndexStride(1000));

diff --git a/ql/src/test/queries/clientpositive/orc_merge10.q b/ql/src/test/queries/clientpositive/orc_merge10.q
@@ -1,6 +1,7 @@
 --! qt:dataset:src
 --! qt:dataset:part
 --! qt:replace:/(File Version:)(.+)/$1#Masked#/
+--! qt:replace:/(File length:\s+)\S+(\s+bytes)/$1#Masked#$2/
 set hive.vectorized.execution.enabled=false;
 set hive.compute.query.using.stats=false;
 set hive.mapred.mode=nonstrict;

diff --git a/ql/src/test/results/clientpositive/llap/explainanalyze_acid_with_direct_insert.q.out b/ql/src/test/results/clientpositive/llap/explainanalyze_acid_with_direct_insert.q.out
@@ -825,17 +825,17 @@ STAGE PLANS:
                 TableScan
                   alias: analyze_part_table
                   filterExpr: (b = 1) (type: boolean)
-                  Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
                       null sort order: z
                       sort order: +
                       Map-reduce partition columns: UDFToInteger(_col0) (type: int)
-                      Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                       value expressions: 1 (type: int)
             Execution mode: vectorized, llap
             LLAP IO: may be used (ACID table)
@@ -845,10 +845,10 @@ STAGE PLANS:
               Select Operator
                 expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: int)
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
                       output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
@@ -1795,17 +1795,17 @@ STAGE PLANS:
                 TableScan
                   alias: analyze_part_table
                   filterExpr: (b = 1) (type: boolean)
-                  Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                   Select Operator
                     expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
                     outputColumnNames: _col0
-                    Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                    Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                     Reduce Output Operator
                       key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
                       null sort order: z
                       sort order: +
                       Map-reduce partition columns: UDFToInteger(_col0) (type: int)
-                      Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                      Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                       value expressions: 1 (type: int)
             Execution mode: vectorized, llap
             LLAP IO: may be used (ACID table)
@@ -1815,10 +1815,10 @@ STAGE PLANS:
               Select Operator
                 expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: int)
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
                   table:
                       input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
                       output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat

diff --git a/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_10.q.out b/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_10.q.out
@@ -79,7 +79,7 @@ STAGE PLANS:
                 TableScan
                   alias: t1
                   filterExpr: (b = 1) (type: boolean)
-                  Statistics: Num rows: 69 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
+                  Statistics: Num rows: 70 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
                     predicate: (b = 1) (type: boolean)
                     Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE

diff --git a/ql/src/test/results/clientpositive/llap/orc_merge10.q.out b/ql/src/test/results/clientpositive/llap/orc_merge10.q.out
@@ -761,22 +761,22 @@ Stripe Statistics:
   Stripe 1:
     Column 0: count: 242 hasNull: false
     Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
-    Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
+    Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646
 
 File Statistics:
   Column 0: count: 242 hasNull: false
   Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
-  Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
+  Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646
 
 Stripes:
-  Stripe: offset: 3 data: 1399 rows: 242 tail: 73 index: 77
+  Stripe: offset: 3 data: 1402 rows: 242 tail: 73 index: 77
     Stream: column 0 section ROW_INDEX start: 3 length 12
     Stream: column 1 section ROW_INDEX start: 15 length 28
     Stream: column 2 section ROW_INDEX start: 43 length 37
     Stream: column 1 section DATA start: 80 length 489
     Stream: column 2 section DATA start: 569 length 247
     Stream: column 2 section LENGTH start: 816 length 71
-    Stream: column 2 section DICTIONARY_DATA start: 887 length 592
+    Stream: column 2 section DICTIONARY_DATA start: 887 length 595
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DICTIONARY_V2[153]
@@ -787,7 +787,7 @@ Stripes:
     Row group indices for column 2:
       Entry 0: count: 242 hasNull: false min: val_0 max: val_97 sum: 1646 positions: 0,0,0
 
-File length: 1763 bytes
+File length: #Masked# bytes
 File raw data size: #Masked#
 Padding length: 0 bytes
 Padding ratio: 0%
@@ -813,22 +813,22 @@ Stripe Statistics:
   Stripe 1:
     Column 0: count: 242 hasNull: false
     Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
-    Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
+    Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646
 
 File Statistics:
   Column 0: count: 242 hasNull: false
   Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
-  Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
+  Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646
 
 Stripes:
-  Stripe: offset: 3 data: 1399 rows: 242 tail: 73 index: 77
+  Stripe: offset: 3 data: 1402 rows: 242 tail: 73 index: 77
     Stream: column 0 section ROW_INDEX start: 3 length 12
     Stream: column 1 section ROW_INDEX start: 15 length 28
     Stream: column 2 section ROW_INDEX start: 43 length 37
     Stream: column 1 section DATA start: 80 length 489
     Stream: column 2 section DATA start: 569 length 247
     Stream: column 2 section LENGTH start: 816 length 71
-    Stream: column 2 section DICTIONARY_DATA start: 887 length 592
+    Stream: column 2 section DICTIONARY_DATA start: 887 length 595
     Encoding column 0: DIRECT
     Encoding column 1: DIRECT_V2
     Encoding column 2: DICTIONARY_V2[153]
@@ -839,7 +839,7 @@ Stripes:
     Row group indices for column 2:
       Entry 0: count: 242 hasNull: false min: val_0 max: val_97 sum: 1646 positions: 0,0,0
 
-File length: 1763 bytes
+File length: #Masked# bytes
 File raw data size: #Masked#
 Padding length: 0 bytes
 Padding ratio: 0%