Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions hplsql/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jetbrains</groupId>
<artifactId>annotations</artifactId>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
Expand Down
8 changes: 7 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@
<jettison.version>1.5.4</jettison.version>
<jetty.version>9.4.57.v20241219</jetty.version>
<jersey.version>1.19.4</jersey.version>
<jetbrains-annotations.version>26.0.2</jetbrains-annotations.version>
<!-- HIVE-28992: only upgrade to newer than 3.25.0 if you tested the prompt -->
<jline.version>3.25.0</jline.version>
<jms.version>2.0.2</jms.version>
Expand All @@ -187,7 +188,7 @@
<postgres.version>42.7.3</postgres.version>
<oracle.version>21.3.0.0</oracle.version>
<opencsv.version>5.9</opencsv.version>
<orc.version>2.1.2</orc.version>
<orc.version>2.3.0</orc.version>
<otel.version>1.60.1</otel.version>
<mockito-core.version>5.17.0</mockito-core.version>
<mockito-inline.version>5.2.0</mockito-inline.version>
Expand Down Expand Up @@ -473,6 +474,11 @@
<artifactId>javolution</artifactId>
<version>${javolution.version}</version>
</dependency>
<dependency>
<groupId>org.jetbrains</groupId>
<artifactId>annotations</artifactId>
<version>${jetbrains-annotations.version}</version>
</dependency>
<dependency>
<groupId>org.jline</groupId>
<artifactId>jline</artifactId>
Expand Down
4 changes: 4 additions & 0 deletions ql/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,10 @@
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
</dependency>
<dependency>
<groupId>org.jetbrains</groupId>
<artifactId>annotations</artifactId>
</dependency>
<dependency>
<groupId>org.jodd</groupId>
<artifactId>jodd-util</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcFile.WriterContext;
import org.apache.orc.impl.OrcAcidUtils;
import org.junit.Before;
Expand Down Expand Up @@ -86,6 +87,8 @@ void createTestAcidFile(Path path,
FileSystem fs = path.getFileSystem(conf);
fs.delete(path, true);
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeStr);
OrcConf.STRIPE_SIZE_CHECKRATIO.setDouble(conf, 0);
OrcConf.STRIPE_ROW_COUNT.setLong(conf, 5000);
Writer writer = OrcFile.createWriter(path,
OrcFile.writerOptions(conf)
.fileSystem(fs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.shims.CombineHiveKey;
Expand Down Expand Up @@ -2624,7 +2625,7 @@ public void testCombinationInputFormatWithAcid() throws Exception {
assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000",
split.getPath().toString());
assertEquals(0, split.getStart());
assertEquals(784, split.getLength());
assertEquals(791, split.getLength());
split = (HiveInputFormat.HiveInputSplit) splits[1];
assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
split.inputFormatClassName());
Expand Down Expand Up @@ -4172,44 +4173,64 @@ public void testAcidReadPastLastStripeOffset() throws Exception {
// d
((BytesColumnVector) scv.fields[2]).setVal(r,
Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION,
1, (int)(((LongColumnVector) batch.cols[2]).vector[0]), r);
}

// Minimum 5000 rows per stripe.
// Match OrcRecordUpdater: addKey before rows are written. If addKey runs after addRowBatch,
// a mid-batch flush can snapshot the index with the previous statement id while the stripe
// already contains rows from the current batch.
for (int idx = 0; idx < 8; ++idx) {
writer.addRowBatch(batch);
// bucket
batch.cols[2].isRepeating = true;
((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(new AcidOutputFormat
.Options(conf).bucket(0).statementId(idx + 1));
for(long row_id : ((LongColumnVector) batch.cols[3]).vector) {
indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION,
1, (int)(((LongColumnVector) batch.cols[2]).vector[0]), row_id);
final int bucketForBatch = (int) ((LongColumnVector) batch.cols[2]).vector[0];
for (long row_id : ((LongColumnVector) batch.cols[3]).vector) {
indexBuilder.addKey(OrcRecordUpdater.INSERT_OPERATION, 1, bucketForBatch, row_id);
}
writer.addRowBatch(batch);
((LongColumnVector) batch.cols[2]).vector[0] = BucketCodec.V1.encode(
new AcidOutputFormat.Options(conf).bucket(0).statementId(idx + 1));
}
writer.close();
long fileLength = fs.getFileStatus(testFilePath).getLen();

// Find the last stripe.
List<StripeInformation> stripes;
RecordIdentifier[] keyIndex;
try (Reader orcReader = OrcFile.createReader(fs, testFilePath)) {
try (Reader orcReader = OrcFile.createReader(fs, testFilePath);
RecordReader rr = orcReader.rows()) {
stripes = orcReader.getStripes();
keyIndex = OrcRecordUpdater.parseKeyIndex(orcReader);

StructObjectInspector soi = (StructObjectInspector) orcReader.getObjectInspector();
List<? extends StructField> structFields = soi.getAllStructFieldRefs();
StructField transactionField = structFields.get(1);
LongObjectInspector transactionOI =
(LongObjectInspector) transactionField.getFieldObjectInspector();
StructField bucketField = structFields.get(2);
IntObjectInspector bucketOI =
(IntObjectInspector) bucketField.getFieldObjectInspector();
StructField rowIdField = structFields.get(3);
LongObjectInspector rowIdOI =
(LongObjectInspector) rowIdField.getFieldObjectInspector();

Assert.assertEquals("Index length doesn't match number of stripes",
stripes.size(), keyIndex.length);
long rowsProcessed = 0;
for (int i = 0; i < stripes.size(); i++) {
rowsProcessed += stripes.get(i).getNumberOfRows();
rr.seekToRow(rowsProcessed - 1);
OrcStruct row = (OrcStruct) rr.next(null);
long lastTransaction =
transactionOI.get(soi.getStructFieldData(row, transactionField));
int lastBucket = bucketOI.get(soi.getStructFieldData(row, bucketField));
long lastRowId = rowIdOI.get(soi.getStructFieldData(row, rowIdField));
RecordIdentifier expected =
new RecordIdentifier(lastTransaction, lastBucket, lastRowId);
Assert.assertEquals("Index entry mismatch for stripe " + i, expected, keyIndex[i]);
}
}

StripeInformation lastStripe = stripes.get(stripes.size() - 1);
long lastStripeOffset = lastStripe.getOffset();
long lastStripeLength = lastStripe.getLength();

Assert.assertEquals("Index length doesn't match number of stripes",
stripes.size(), keyIndex.length);
Assert.assertEquals("1st Index entry mismatch",
new RecordIdentifier(1, 536870916, 999), keyIndex[0]);
Assert.assertEquals("2nd Index entry mismatch",
new RecordIdentifier(1, 536870920, 999), keyIndex[1]);

// test with same schema with include
conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.orc.CompressionKind;
import org.apache.orc.OrcConf;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
Expand All @@ -37,7 +39,9 @@

public class TestOrcFileStripeMergeRecordReader {

private static final int TEST_STRIPE_SIZE = 5000;
private static final int MAX_ROWS_PER_STRIPE = 5000;

private static final long STRIPE_SIZE_BYTES = 128;

private OrcFileKeyWrapper key;
private OrcFileValueWrapper value;
Expand All @@ -51,11 +55,16 @@ public class TestOrcFileStripeMergeRecordReader {
@Before
public void setup() throws IOException {
conf = new Configuration();
// ORC ≥2.x: orc.stripe.size.check.ratio triggers flushes when buffered tree bytes exceed ratio × orc.stripe.size
// Setting it to 0 disables it.
OrcConf.STRIPE_SIZE_CHECKRATIO.setDouble(conf, 0);
// Maximum number of rows a Stripe can hold in ORC file.
OrcConf.STRIPE_ROW_COUNT.setLong(conf, MAX_ROWS_PER_STRIPE);
fs = FileSystem.getLocal(conf);
key = new OrcFileKeyWrapper();
value = new OrcFileValueWrapper();
tmpPath = prepareTmpPath();
createOrcFile(TEST_STRIPE_SIZE, TEST_STRIPE_SIZE + 1);
createOrcFile(MAX_ROWS_PER_STRIPE + 1);
}

@After
Expand Down Expand Up @@ -86,7 +95,7 @@ public void testSplitStartsWithZeroOffset() throws IOException {
// both stripes will be processed, first stripe has 5000 rows and second stripe has 1 row
reader.next(key, value);
Assert.assertEquals("InputPath", tmpPath, key.getInputPath());
Assert.assertEquals("NumberOfValues", TEST_STRIPE_SIZE,
Assert.assertEquals("NumberOfValues", MAX_ROWS_PER_STRIPE,
value.getStripeStatistics().getColStats(0).getNumberOfValues());
reader.next(key, value);
Assert.assertEquals("InputPath", tmpPath, key.getInputPath());
Expand All @@ -96,7 +105,7 @@ public void testSplitStartsWithZeroOffset() throws IOException {
reader.close();
}

private void createOrcFile(int stripSize, int numberOfRows) throws IOException {
private void createOrcFile(int numberOfRows) throws IOException {
ObjectInspector inspector;
synchronized (TestOrcFileStripeMergeRecordReader.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
Expand All @@ -106,7 +115,7 @@ private void createOrcFile(int stripSize, int numberOfRows) throws IOException {
Writer writer = OrcFile.createWriter(tmpPath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(stripSize)
.stripeSize(STRIPE_SIZE_BYTES)
.compress(CompressionKind.ZLIB)
.bufferSize(5000)
.rowIndexStride(1000));
Expand Down
1 change: 1 addition & 0 deletions ql/src/test/queries/clientpositive/orc_merge10.q
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
--! qt:dataset:src
--! qt:dataset:part
--! qt:replace:/(File Version:)(.+)/$1#Masked#/
--! qt:replace:/(File length:\s+)\S+(\s+bytes)/$1#Masked#$2/
set hive.vectorized.execution.enabled=false;
set hive.compute.query.using.stats=false;
set hive.mapred.mode=nonstrict;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -825,17 +825,17 @@ STAGE PLANS:
TableScan
alias: analyze_part_table
filterExpr: (b = 1) (type: boolean)
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
outputColumnNames: _col0
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
null sort order: z
sort order: +
Map-reduce partition columns: UDFToInteger(_col0) (type: int)
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
value expressions: 1 (type: int)
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
Expand All @@ -845,10 +845,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: int)
outputColumnNames: _col0, _col1
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
Expand Down Expand Up @@ -1795,17 +1795,17 @@ STAGE PLANS:
TableScan
alias: analyze_part_table
filterExpr: (b = 1) (type: boolean)
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
outputColumnNames: _col0
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
null sort order: z
sort order: +
Map-reduce partition columns: UDFToInteger(_col0) (type: int)
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
value expressions: 1 (type: int)
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
Expand All @@ -1815,10 +1815,10 @@ STAGE PLANS:
Select Operator
expressions: KEY.reducesinkkey0 (type: struct<writeid:bigint,bucketid:int,rowid:bigint>), VALUE._col0 (type: int)
outputColumnNames: _col0, _col1
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
Statistics: Num rows: 2/2 Data size: 6376 Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 2/2 Data size: 6386 Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ STAGE PLANS:
TableScan
alias: t1
filterExpr: (b = 1) (type: boolean)
Statistics: Num rows: 69 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
Statistics: Num rows: 70 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: (b = 1) (type: boolean)
Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
Expand Down
20 changes: 10 additions & 10 deletions ql/src/test/results/clientpositive/llap/orc_merge10.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -761,22 +761,22 @@ Stripe Statistics:
Stripe 1:
Column 0: count: 242 hasNull: false
Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646

File Statistics:
Column 0: count: 242 hasNull: false
Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646

Stripes:
Stripe: offset: 3 data: 1399 rows: 242 tail: 73 index: 77
Stripe: offset: 3 data: 1402 rows: 242 tail: 73 index: 77
Stream: column 0 section ROW_INDEX start: 3 length 12
Stream: column 1 section ROW_INDEX start: 15 length 28
Stream: column 2 section ROW_INDEX start: 43 length 37
Stream: column 1 section DATA start: 80 length 489
Stream: column 2 section DATA start: 569 length 247
Stream: column 2 section LENGTH start: 816 length 71
Stream: column 2 section DICTIONARY_DATA start: 887 length 592
Stream: column 2 section DICTIONARY_DATA start: 887 length 595
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DICTIONARY_V2[153]
Expand All @@ -787,7 +787,7 @@ Stripes:
Row group indices for column 2:
Entry 0: count: 242 hasNull: false min: val_0 max: val_97 sum: 1646 positions: 0,0,0

File length: 1763 bytes
File length: #Masked# bytes
File raw data size: #Masked#
Padding length: 0 bytes
Padding ratio: 0%
Expand All @@ -813,22 +813,22 @@ Stripe Statistics:
Stripe 1:
Column 0: count: 242 hasNull: false
Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646

File Statistics:
Column 0: count: 242 hasNull: false
Column 1: count: 242 hasNull: false bytesOnDisk: 489 min: 0 max: 497 sum: 60770
Column 2: count: 242 hasNull: false bytesOnDisk: 910 min: val_0 max: val_97 sum: 1646
Column 2: count: 242 hasNull: false bytesOnDisk: 913 min: val_0 max: val_97 sum: 1646

Stripes:
Stripe: offset: 3 data: 1399 rows: 242 tail: 73 index: 77
Stripe: offset: 3 data: 1402 rows: 242 tail: 73 index: 77
Stream: column 0 section ROW_INDEX start: 3 length 12
Stream: column 1 section ROW_INDEX start: 15 length 28
Stream: column 2 section ROW_INDEX start: 43 length 37
Stream: column 1 section DATA start: 80 length 489
Stream: column 2 section DATA start: 569 length 247
Stream: column 2 section LENGTH start: 816 length 71
Stream: column 2 section DICTIONARY_DATA start: 887 length 592
Stream: column 2 section DICTIONARY_DATA start: 887 length 595
Encoding column 0: DIRECT
Encoding column 1: DIRECT_V2
Encoding column 2: DICTIONARY_V2[153]
Expand All @@ -839,7 +839,7 @@ Stripes:
Row group indices for column 2:
Entry 0: count: 242 hasNull: false min: val_0 max: val_97 sum: 1646 positions: 0,0,0

File length: 1763 bytes
File length: #Masked# bytes
File raw data size: #Masked#
Padding length: 0 bytes
Padding ratio: 0%
Expand Down
Loading
Loading