feat(backup): on-demand bitmap recreation for incremental NAS backup

CloudStack rebuilds the libvirt domain XML on every VM start, which means
persistent QEMU dirty bitmaps don't survive a stop/start cycle. Rather
than hooking into the VM start lifecycle (intrusive across the
orchestration layer), this commit handles the missing bitmap *lazily* at
the next backup attempt:

  nasbackup.sh
    - When -M incremental is requested, the script first checks
      `virsh checkpoint-list` for the parent bitmap. If absent, it
      recreates the checkpoint on the running domain so libvirt accepts
      the <incremental> reference. The next incremental will be larger
      than usual (it captures all writes since recreate, not since the
      previous incremental) but is correct; subsequent ones return to
      normal size.
    - On recreation, emits BITMAP_RECREATED=<name> on stdout for the
      orchestrator to record.

  BackupAnswer
    + bitmapRecreated field surfaced from the agent.

  LibvirtTakeBackupCommandWrapper
    - Strips BITMAP_RECREATED= line from stdout before size parsing.
    - Sets answer.setBitmapRecreated(...).

  NASBackupChainKeys
    + BITMAP_RECREATED key for backup_details.

  NASBackupProvider
    - When the agent reports a recreated bitmap, persists it under
      backup_details and logs an info-level message so operators can
      correlate larger-than-usual incrementals with VM restarts.

This satisfies the bitmap-loss-on-VM-restart concern from the RFC review
without touching VirtualMachineManager / StartCommand / agent lifecycle.

Refs: apache/cloudstack#12899
This commit is contained in:
James Peru 2026-04-27 19:10:46 +03:00
parent 1f2aebca36
commit 43e2f7504a
5 changed files with 51 additions and 0 deletions

View File

@ -35,6 +35,11 @@ public class BackupAnswer extends Answer {
// Set when an incremental was requested but the agent had to fall back to a full
// (e.g. VM was stopped). Provider should record this backup as type=full.
private Boolean incrementalFallback;
// Set when the agent had to recreate the parent bitmap before this incremental
// (e.g. CloudStack rebuilt the domain XML on the previous VM start, losing bitmaps).
// The first incremental after a recreate is larger than usual; subsequent
// incrementals return to normal size. Informational; recorded in backup_details.
private String bitmapRecreated;
public BackupAnswer(final Command command, final boolean success, final String details) {
super(command, success, details);
@ -90,4 +95,12 @@ public class BackupAnswer extends Answer {
/**
 * Records whether an incremental backup was requested but the agent had to
 * fall back to a full backup.
 *
 * @param incrementalFallback the fallback flag to store on this answer
 */
public void setIncrementalFallback(Boolean incrementalFallback) {
this.incrementalFallback = incrementalFallback;
}
/**
 * Returns the value stored by {@link #setBitmapRecreated(String)}: the name of the
 * parent bitmap the agent had to recreate before taking this incremental backup.
 *
 * @return the stored bitmap name; presumably {@code null} when no recreate was
 *         reported (callers should null-check before use)
 */
public String getBitmapRecreated() {
return bitmapRecreated;
}
/**
 * Stores the name of the parent bitmap that the agent had to recreate before
 * this incremental backup.
 *
 * @param bitmapRecreated the recreated bitmap's name; the value is stored as-is,
 *                        with no validation
 */
public void setBitmapRecreated(String bitmapRecreated) {
this.bitmapRecreated = bitmapRecreated;
}
}

View File

@ -42,6 +42,9 @@ public final class NASBackupChainKeys {
public static final String TYPE_FULL = "full";
public static final String TYPE_INCREMENTAL = "incremental";
/** Set to the bitmap name when this incremental had to recreate its parent bitmap on the host (informational; this incremental is larger than usual). */
public static final String BITMAP_RECREATED = "nas.bitmap_recreated";
// Private no-arg constructor: prevents instantiation from outside this class.
// The members visible here are all static String constants.
private NASBackupChainKeys() {
}
}

View File

@ -434,6 +434,12 @@ public class NASBackupProvider extends AdapterBase implements BackupProvider, Co
backupVO.setBackedUpVolumes(backupManager.createVolumeInfoFromVolumes(volumes));
if (backupDao.update(backupVO.getId(), backupVO)) {
persistChainMetadata(backupVO, effective, answer.getBitmapCreated());
if (answer.getBitmapRecreated() != null) {
backupDetailsDao.persist(new BackupDetailVO(backupVO.getId(),
NASBackupChainKeys.BITMAP_RECREATED, answer.getBitmapRecreated(), true));
logger.info("NAS incremental for VM {} recreated parent bitmap {} (likely VM was restarted since last backup)",
vm.getInstanceName(), answer.getBitmapRecreated());
}
return new Pair<>(true, backupVO);
} else {
throw new CloudRuntimeException("Failed to update backup");

View File

@ -117,6 +117,7 @@ public class LibvirtTakeBackupCommandWrapper extends CommandWrapper<TakeBackupCo
// numeric-suffix parser keeps working.
String stdout = result.second().trim();
String bitmapCreated = null;
String bitmapRecreated = null;
boolean incrementalFallback = false;
StringBuilder filtered = new StringBuilder();
for (String line : stdout.split("\n")) {
@ -125,6 +126,10 @@ public class LibvirtTakeBackupCommandWrapper extends CommandWrapper<TakeBackupCo
bitmapCreated = trimmed.substring("BITMAP_CREATED=".length());
continue;
}
if (trimmed.startsWith("BITMAP_RECREATED=")) {
bitmapRecreated = trimmed.substring("BITMAP_RECREATED=".length());
continue;
}
if (trimmed.startsWith("INCREMENTAL_FALLBACK=")) {
incrementalFallback = true;
continue;
@ -152,6 +157,7 @@ public class LibvirtTakeBackupCommandWrapper extends CommandWrapper<TakeBackupCo
BackupAnswer answer = new BackupAnswer(command, true, stdout);
answer.setSize(backupSize);
answer.setBitmapCreated(bitmapCreated);
answer.setBitmapRecreated(bitmapRecreated);
answer.setIncrementalFallback(incrementalFallback);
return answer;
}

View File

@ -150,6 +150,29 @@ backup_running_vm() {
;;
esac
# When incremental, verify the parent bitmap still exists on the running domain.
# CloudStack rebuilds the libvirt domain XML on every VM start, so persistent bitmaps
# are lost across stop/start. If the parent is missing, recreate it as a fresh bitmap
# so libvirt accepts the <incremental> reference. The stated expectation is that the
# first backup after a recreate captures all writes since the recreate point.
#
# NOTE(review): the checkpoint is recreated immediately before this backup runs, so it
# presumably only tracks writes made from this moment on. Writes that happened between
# the previous backup and this point would then NOT be part of the incremental. Please
# confirm the resulting chain is still restorable; if it is not, taking a full backup
# here would be the safe alternative.
#
# Reads $effective_mode, $VM, $BITMAP_PARENT and $dest from the enclosing scope.
# On checkpoint-create failure: prints an error, calls cleanup and exits with status 1.
# On success: prints BITMAP_RECREATED=<name> on stdout.
if [[ "$effective_mode" == "incremental" ]]; then
    # -F: match the bitmap name literally (no regex metacharacters), -x: whole line only,
    # --: a name starting with '-' must not be parsed as a grep option.
    if ! virsh -c qemu:///system checkpoint-list "$VM" --name 2>/dev/null | grep -qxF -- "$BITMAP_PARENT"; then
        # Checkpoint definition listing every device that domblklist reports as type "disk".
        # The here-doc body and its terminator must stay at column 0.
        cat > "$dest/recreate-checkpoint.xml" <<XML
<domaincheckpoint><name>$BITMAP_PARENT</name><disks>
$(virsh -c qemu:///system domblklist "$VM" --details 2>/dev/null | awk '$2=="disk"{printf "<disk name=\"%s\"/>\n", $3}')
</disks></domaincheckpoint>
XML
        if ! virsh -c qemu:///system checkpoint-create "$VM" --xmlfile "$dest/recreate-checkpoint.xml" > /dev/null 2>&1; then
            # Do not leave the temporary definition behind in the backup destination.
            rm -f "$dest/recreate-checkpoint.xml"
            echo "Failed to recreate parent bitmap $BITMAP_PARENT for $VM"
            cleanup
            exit 1
        fi
        # Marker for the orchestrator: the parent bitmap had to be rebuilt for this incremental.
        echo "BITMAP_RECREATED=$BITMAP_PARENT"
        rm -f "$dest/recreate-checkpoint.xml"
    fi
fi
# Build backup XML (and matching checkpoint XML when applicable).
name="root"
echo "<domainbackup mode='push'>" > $dest/backup.xml