mirror of https://github.com/apache/cloudstack.git
feat(backup): on-demand bitmap recreation for incremental NAS backup
CloudStack rebuilds the libvirt domain XML on every VM start, which means
persistent QEMU dirty bitmaps don't survive a stop/start cycle. Rather
than hooking into the VM start lifecycle (intrusive across the
orchestration layer), this commit handles the missing bitmap *lazily* at
the next backup attempt:
nasbackup.sh
- When -M incremental is requested, the script first checks
`virsh checkpoint-list` for the parent bitmap. If absent, it
recreates the checkpoint on the running domain so libvirt accepts
the <incremental> reference. The next incremental will be larger
than usual (it captures all writes since the recreate, not since the
previous incremental) but is correct; subsequent incrementals return to
normal size.
- On recreation, emits BITMAP_RECREATED=<name> on stdout for the
orchestrator to record.
BackupAnswer
+ bitmapRecreated field (the name of the recreated bitmap) surfaced from the agent.
LibvirtTakeBackupCommandWrapper
- Strips the BITMAP_RECREATED= line from stdout before size parsing.
- Passes the parsed bitmap name to answer.setBitmapRecreated(...).
NASBackupChainKeys
+ BITMAP_RECREATED key for backup_details.
NASBackupProvider
- When the agent reports a recreated bitmap, persists it under
backup_details and logs an info-level message so operators can
correlate larger-than-usual incrementals with VM restarts.
This satisfies the bitmap-loss-on-VM-restart concern from the RFC review
without touching VirtualMachineManager / StartCommand / agent lifecycle.
Refs: apache/cloudstack#12899
This commit is contained in:
parent
1f2aebca36
commit
43e2f7504a
|
|
@ -35,6 +35,11 @@ public class BackupAnswer extends Answer {
|
|||
// Set when an incremental was requested but the agent had to fall back to a full
|
||||
// (e.g. VM was stopped). Provider should record this backup as type=full.
|
||||
private Boolean incrementalFallback;
|
||||
// Set when the agent had to recreate the parent bitmap before this incremental
|
||||
// (e.g. CloudStack rebuilt the domain XML on the previous VM start, losing bitmaps).
|
||||
// The first incremental after a recreate is larger than usual; subsequent
|
||||
// incrementals return to normal size. Informational — recorded in backup_details.
|
||||
private String bitmapRecreated;
|
||||
|
||||
public BackupAnswer(final Command command, final boolean success, final String details) {
|
||||
super(command, success, details);
|
||||
|
|
@ -90,4 +95,12 @@ public class BackupAnswer extends Answer {
|
|||
public void setIncrementalFallback(Boolean incrementalFallback) {
|
||||
this.incrementalFallback = incrementalFallback;
|
||||
}
|
||||
|
||||
public String getBitmapRecreated() {
|
||||
return bitmapRecreated;
|
||||
}
|
||||
|
||||
public void setBitmapRecreated(String bitmapRecreated) {
|
||||
this.bitmapRecreated = bitmapRecreated;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,9 @@ public final class NASBackupChainKeys {
|
|||
public static final String TYPE_FULL = "full";
|
||||
public static final String TYPE_INCREMENTAL = "incremental";
|
||||
|
||||
/** Set to the bitmap name when this incremental had to recreate its parent bitmap on the host (informational; this incremental is larger than usual). */
|
||||
public static final String BITMAP_RECREATED = "nas.bitmap_recreated";
|
||||
|
||||
private NASBackupChainKeys() {
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -434,6 +434,12 @@ public class NASBackupProvider extends AdapterBase implements BackupProvider, Co
|
|||
backupVO.setBackedUpVolumes(backupManager.createVolumeInfoFromVolumes(volumes));
|
||||
if (backupDao.update(backupVO.getId(), backupVO)) {
|
||||
persistChainMetadata(backupVO, effective, answer.getBitmapCreated());
|
||||
if (answer.getBitmapRecreated() != null) {
|
||||
backupDetailsDao.persist(new BackupDetailVO(backupVO.getId(),
|
||||
NASBackupChainKeys.BITMAP_RECREATED, answer.getBitmapRecreated(), true));
|
||||
logger.info("NAS incremental for VM {} recreated parent bitmap {} (likely VM was restarted since last backup)",
|
||||
vm.getInstanceName(), answer.getBitmapRecreated());
|
||||
}
|
||||
return new Pair<>(true, backupVO);
|
||||
} else {
|
||||
throw new CloudRuntimeException("Failed to update backup");
|
||||
|
|
|
|||
|
|
@ -117,6 +117,7 @@ public class LibvirtTakeBackupCommandWrapper extends CommandWrapper<TakeBackupCo
|
|||
// numeric-suffix parser keeps working.
|
||||
String stdout = result.second().trim();
|
||||
String bitmapCreated = null;
|
||||
String bitmapRecreated = null;
|
||||
boolean incrementalFallback = false;
|
||||
StringBuilder filtered = new StringBuilder();
|
||||
for (String line : stdout.split("\n")) {
|
||||
|
|
@ -125,6 +126,10 @@ public class LibvirtTakeBackupCommandWrapper extends CommandWrapper<TakeBackupCo
|
|||
bitmapCreated = trimmed.substring("BITMAP_CREATED=".length());
|
||||
continue;
|
||||
}
|
||||
if (trimmed.startsWith("BITMAP_RECREATED=")) {
|
||||
bitmapRecreated = trimmed.substring("BITMAP_RECREATED=".length());
|
||||
continue;
|
||||
}
|
||||
if (trimmed.startsWith("INCREMENTAL_FALLBACK=")) {
|
||||
incrementalFallback = true;
|
||||
continue;
|
||||
|
|
@ -152,6 +157,7 @@ public class LibvirtTakeBackupCommandWrapper extends CommandWrapper<TakeBackupCo
|
|||
BackupAnswer answer = new BackupAnswer(command, true, stdout);
|
||||
answer.setSize(backupSize);
|
||||
answer.setBitmapCreated(bitmapCreated);
|
||||
answer.setBitmapRecreated(bitmapRecreated);
|
||||
answer.setIncrementalFallback(incrementalFallback);
|
||||
return answer;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -150,6 +150,29 @@ backup_running_vm() {
|
|||
;;
|
||||
esac
|
||||
|
||||
# When incremental, verify the parent bitmap still exists on the running domain.
|
||||
# CloudStack rebuilds the libvirt domain XML on every VM start, so persistent bitmaps
|
||||
# are lost across stop/start. If the parent is missing, recreate it as a fresh bitmap
|
||||
# so libvirt accepts the <incremental> reference. The first backup after a recreate
|
||||
# captures all writes since the recreate point — slightly larger than ideal, but correct.
|
||||
if [[ "$effective_mode" == "incremental" ]]; then
|
||||
if ! virsh -c qemu:///system checkpoint-list "$VM" --name 2>/dev/null | grep -qx "$BITMAP_PARENT"; then
|
||||
cat > $dest/recreate-checkpoint.xml <<XML
|
||||
<domaincheckpoint><name>$BITMAP_PARENT</name><disks>
|
||||
$(virsh -c qemu:///system domblklist "$VM" --details 2>/dev/null | awk '$2=="disk"{printf "<disk name=\"%s\"/>\n", $3}')
|
||||
</disks></domaincheckpoint>
|
||||
XML
|
||||
if ! virsh -c qemu:///system checkpoint-create "$VM" --xmlfile $dest/recreate-checkpoint.xml > /dev/null 2>&1; then
|
||||
echo "Failed to recreate parent bitmap $BITMAP_PARENT for $VM"
|
||||
cleanup
|
||||
exit 1
|
||||
fi
|
||||
# Marker for the orchestrator: this incremental is larger because the bitmap was rebuilt.
|
||||
echo "BITMAP_RECREATED=$BITMAP_PARENT"
|
||||
rm -f $dest/recreate-checkpoint.xml
|
||||
fi
|
||||
fi
|
||||
|
||||
# Build backup XML (and matching checkpoint XML when applicable).
|
||||
name="root"
|
||||
echo "<domainbackup mode='push'>" > $dest/backup.xml
|
||||
|
|
|
|||
Loading…
Reference in New Issue