-
Notifications
You must be signed in to change notification settings - Fork 1.3k
[flink] Add restore_as_latest procedure #8139
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
1492902
1c523ac
b2a52ed
255ecf7
41aacdb
7b99d1f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,6 +107,7 @@ | |
| import static org.apache.paimon.partition.PartitionPredicate.createBinaryPartitions; | ||
| import static org.apache.paimon.partition.PartitionPredicate.createPartitionPredicate; | ||
| import static org.apache.paimon.utils.Preconditions.checkArgument; | ||
| import static org.apache.paimon.utils.Preconditions.checkNotNull; | ||
|
|
||
| /** | ||
| * Default implementation of {@link FileStoreCommit}. | ||
|
|
@@ -1164,6 +1165,160 @@ public boolean replaceManifestList( | |
| return commitSnapshotImpl(newSnapshot, emptyList()); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean restoreAsLatest(Snapshot targetSnapshot) { | ||
| Snapshot latest = | ||
| checkNotNull( | ||
| snapshotManager.latestSnapshot(), | ||
| "Latest snapshot is null, can not restore."); | ||
|
|
||
| Map<FileEntry.Identifier, ManifestEntry> latestEntries = new HashMap<>(); | ||
| FileEntry.mergeEntries( | ||
| manifestFile, | ||
| manifestList.readDataManifests(latest), | ||
| latestEntries, | ||
| options.scanManifestParallelism()); | ||
|
|
||
| latestEntries.entrySet().removeIf(entry -> entry.getValue().kind() != FileKind.ADD); | ||
|
|
||
| Map<FileEntry.Identifier, ManifestEntry> targetEntries = new HashMap<>(); | ||
| FileEntry.mergeEntries( | ||
| manifestFile, | ||
| manifestList.readDataManifests(targetSnapshot), | ||
| targetEntries, | ||
| options.scanManifestParallelism()); | ||
| targetEntries.entrySet().removeIf(entry -> entry.getValue().kind() != FileKind.ADD); | ||
|
|
||
| List<ManifestEntry> deltaFiles = new ArrayList<>(); | ||
| for (Map.Entry<FileEntry.Identifier, ManifestEntry> entry : latestEntries.entrySet()) { | ||
| if (!targetEntries.containsKey(entry.getKey())) { | ||
| ManifestEntry manifestEntry = entry.getValue(); | ||
| deltaFiles.add( | ||
| ManifestEntry.create( | ||
| FileKind.DELETE, | ||
| manifestEntry.partition(), | ||
| manifestEntry.bucket(), | ||
| manifestEntry.totalBuckets(), | ||
| manifestEntry.file())); | ||
| } | ||
| } | ||
| for (Map.Entry<FileEntry.Identifier, ManifestEntry> entry : targetEntries.entrySet()) { | ||
| if (!latestEntries.containsKey(entry.getKey())) { | ||
| ManifestEntry manifestEntry = entry.getValue(); | ||
| deltaFiles.add( | ||
| ManifestEntry.create( | ||
| FileKind.ADD, | ||
| manifestEntry.partition(), | ||
| manifestEntry.bucket(), | ||
| manifestEntry.totalBuckets(), | ||
| manifestEntry.file())); | ||
| } | ||
| } | ||
|
|
||
| Pair<String, Long> baseManifestList = | ||
| manifestList.write(manifestFile.write(new ArrayList<>(latestEntries.values()))); | ||
| Pair<String, Long> deltaManifestList = manifestList.write(manifestFile.write(deltaFiles)); | ||
| // For row-tracking tables nextRowId must stay monotonic: restoring an older snapshot must | ||
| // not move it backwards, otherwise new appends would reuse row ids already assigned by the | ||
| // snapshots between the target and the previous latest, breaking the global uniqueness of | ||
| // _ROW_ID. Keep the larger of the previous latest and the target nextRowId. | ||
| Long nextRowId = maxNextRowId(latest.nextRowId(), targetSnapshot.nextRowId()); | ||
| Snapshot newSnapshot = | ||
| new Snapshot( | ||
| latest.id() + 1, | ||
| targetSnapshot.schemaId(), | ||
| baseManifestList.getKey(), | ||
| baseManifestList.getRight(), | ||
| deltaManifestList.getKey(), | ||
| deltaManifestList.getRight(), | ||
| null, | ||
| null, | ||
| targetSnapshot.indexManifest(), | ||
| commitUser, | ||
| Long.MAX_VALUE, | ||
| CommitKind.OVERWRITE, | ||
| System.currentTimeMillis(), | ||
| targetSnapshot.totalRecordCount(), | ||
| recordCountAdd(deltaFiles) - recordCountDelete(deltaFiles), | ||
| null, | ||
| targetSnapshot.watermark(), | ||
| targetSnapshot.statistics(), | ||
| targetSnapshot.properties(), | ||
| nextRowId); | ||
|
|
||
| // The restore is an overwrite from the previous latest to the target, so the base files, | ||
| // delta files and index changes describe the transition the callbacks need. These are | ||
| // shared by the pre- and post-commit callbacks below. | ||
| List<SimpleFileEntry> baseFiles = | ||
| SimpleFileEntry.from(new ArrayList<>(latestEntries.values())); | ||
| List<IndexManifestEntry> indexChanges = restoreIndexChanges(latest, targetSnapshot); | ||
|
|
||
| // Like a regular commit, run the pre-commit callbacks before the snapshot becomes visible. | ||
| // They may veto the restore by throwing (e.g. a chain-table snapshot branch rejects a | ||
| // pure-DELETE overwrite that would drop a snapshot partition still anchoring delta | ||
| // partitions), in which case the restore snapshot is never created. | ||
| commitPreCallbacks.forEach( | ||
| callback -> callback.call(baseFiles, deltaFiles, indexChanges, newSnapshot)); | ||
|
|
||
| boolean success = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This restore path still bypasses the normal pre-commit callbacks. Regular commits call commitPreCallbacks before commitSnapshotImpl, and ChainTableCommitPreCallback uses that hook to reject unsafe pure-DELETE overwrite commits on the snapshot branch. restoreAsLatest also creates an OVERWRITE delta, so restoring a snapshot branch back to an older state can delete snapshot partitions that a normal overwrite would validate and potentially abort. Please run the same pre-callback path (with the restore base files, delta files, index changes, and new snapshot) before making the restore snapshot visible. |
||
| commitSnapshotImpl(newSnapshot, new ArrayList<>(PartitionEntry.merge(deltaFiles))); | ||
| if (success) { | ||
| // Notify the post-commit callbacks so external views stay in sync with the restored | ||
| // state (e.g. Iceberg compatibility metadata and chain-table overwrite handling). | ||
| CommitCallback.Context context = | ||
| new CommitCallback.Context( | ||
| baseFiles, | ||
| deltaFiles, | ||
| indexChanges, | ||
| newSnapshot, | ||
| newSnapshot.commitIdentifier()); | ||
| commitCallbacks.forEach(callback -> callback.call(context)); | ||
| } | ||
| return success; | ||
| } | ||
|
|
||
| /** | ||
| * Computes the index file changes between the previous latest snapshot and the restore target, | ||
| * mirroring how the data delta files are derived: entries that only exist in the previous | ||
| * latest are marked as {@link FileKind#DELETE}, entries that only exist in the target are kept | ||
| * as ADD. | ||
| */ | ||
| private List<IndexManifestEntry> restoreIndexChanges(Snapshot latest, Snapshot target) { | ||
| Set<IndexManifestEntry> latestIndexEntries = readIndexEntries(latest.indexManifest()); | ||
| Set<IndexManifestEntry> targetIndexEntries = readIndexEntries(target.indexManifest()); | ||
|
|
||
| List<IndexManifestEntry> indexChanges = new ArrayList<>(); | ||
| for (IndexManifestEntry entry : latestIndexEntries) { | ||
| if (!targetIndexEntries.contains(entry)) { | ||
| indexChanges.add(entry.toDeleteEntry()); | ||
| } | ||
| } | ||
| for (IndexManifestEntry entry : targetIndexEntries) { | ||
| if (!latestIndexEntries.contains(entry)) { | ||
| indexChanges.add(entry); | ||
| } | ||
| } | ||
| return indexChanges; | ||
| } | ||
|
|
||
| private Set<IndexManifestEntry> readIndexEntries(@Nullable String indexManifest) { | ||
| if (indexManifest == null) { | ||
| return Collections.emptySet(); | ||
| } | ||
| return new HashSet<>(indexManifestFile.read(indexManifest)); | ||
| } | ||
|
|
||
| @Nullable | ||
| private static Long maxNextRowId(@Nullable Long left, @Nullable Long right) { | ||
| if (left == null) { | ||
| return right; | ||
| } | ||
| if (right == null) { | ||
| return left; | ||
| } | ||
| return Math.max(left, right); | ||
| } | ||
|
|
||
| public void compactManifest() { | ||
| int retryCount = 0; | ||
| long startMillis = System.currentTimeMillis(); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The restore delta is built only from data-file identifier differences. For tables using deletion vectors or other index-manifest state, the logical contents can change while the data files stay the same; restoring across a DV-only delete/update would leave deltaFiles empty and deltaRecordCount as 0, even though targetSnapshot.indexManifest() and totalRecordCount differ. Streaming overwrite readers call readChanges() from the DELTA data manifests, and the streaming path does not load DV indexes, so such a restore can still be skipped by streaming readers. Please include the relevant index/DV changes in the restore transition (or otherwise make the overwrite read/delta counts handle index-only restores) instead of relying on the final indexManifest alone.