Madan Jampani
Committed by Gerrit Code Review

Fix the case of dropped flowEntries that can occur when the current backup is do…

…wn and no alternate backup exists

Change-Id: If584cb4bb99aa3930ac30ff6b85a5e2ed56071f0
...@@ -614,13 +614,28 @@ public class NewDistributedFlowRuleStore ...@@ -614,13 +614,28 @@ public class NewDistributedFlowRuleStore
614 // ignore since this event is for a device this node does not manage. 614 // ignore since this event is for a device this node does not manage.
615 return; 615 return;
616 } 616 }
617 - NodeId latestBackupNode = getBackupNode(deviceId); 617 + NodeId newBackupNode = getBackupNode(deviceId);
618 - NodeId existingBackupNode = lastBackupNodes.get(deviceId); 618 + NodeId currentBackupNode = lastBackupNodes.get(deviceId);
619 - if (Objects.equal(latestBackupNode, existingBackupNode)) { 619 + if (Objects.equal(newBackupNode, currentBackupNode)) {
620 // ignore since backup location hasn't changed. 620 // ignore since backup location hasn't changed.
621 return; 621 return;
622 } 622 }
623 - backupSenderExecutor.schedule(() -> backupFlowEntries(latestBackupNode, Sets.newHashSet(deviceId)), 623 + if (currentBackupNode != null && newBackupNode == null) {
624 + // Current backup node is most likely down and no alternate backup node
625 + // has been chosen. Clear current backup location so that we can resume
626 + // backups when either current backup comes online or a different backup node
627 + // is chosen.
628 + log.warn("Lost backup location {} for deviceId {} and no alternate backup node exists. "
629 + + "Flows can be lost if the master goes down", currentBackupNode, deviceId);
630 + lastBackupNodes.remove(deviceId);
631 + lastBackupTimes.remove(deviceId);
632 + return;
633 + // TODO: Pick any available node as backup and ensure hand-off occurs when
634 + // a new master is elected.
635 + }
636 + log.info("Backup location for {} has changed from {} to {}.",
637 + deviceId, currentBackupNode, newBackupNode);
638 + backupSenderExecutor.schedule(() -> backupFlowEntries(newBackupNode, Sets.newHashSet(deviceId)),
624 0, 639 0,
625 TimeUnit.SECONDS); 640 TimeUnit.SECONDS);
626 } 641 }
...@@ -712,8 +727,9 @@ public class NewDistributedFlowRuleStore ...@@ -712,8 +727,9 @@ public class NewDistributedFlowRuleStore
712 Long lastBackupTime = lastBackupTimes.get(deviceId); 727 Long lastBackupTime = lastBackupTimes.get(deviceId);
713 Long lastUpdateTime = lastUpdateTimes.get(deviceId); 728 Long lastUpdateTime = lastUpdateTimes.get(deviceId);
714 NodeId lastBackupNode = lastBackupNodes.get(deviceId); 729 NodeId lastBackupNode = lastBackupNodes.get(deviceId);
730 + NodeId newBackupNode = getBackupNode(deviceId);
715 return lastBackupTime == null 731 return lastBackupTime == null
716 - || !Objects.equal(lastBackupNode, getBackupNode(deviceId)) 732 + || !Objects.equal(lastBackupNode, newBackupNode)
717 || (lastUpdateTime != null && lastUpdateTime > lastBackupTime); 733 || (lastUpdateTime != null && lastUpdateTime > lastBackupTime);
718 }) 734 })
719 .collect(Collectors.toSet()); 735 .collect(Collectors.toSet());
...@@ -735,7 +751,7 @@ public class NewDistributedFlowRuleStore ...@@ -735,7 +751,7 @@ public class NewDistributedFlowRuleStore
735 } 751 }
736 752
737 private void onBackupReceipt(Map<DeviceId, Map<FlowId, Set<StoredFlowEntry>>> flowTables) { 753 private void onBackupReceipt(Map<DeviceId, Map<FlowId, Set<StoredFlowEntry>>> flowTables) {
738 - log.debug("Received flows for {} to backup", flowTables.keySet()); 754 + log.debug("Received flowEntries for {} to backup", flowTables.keySet());
739 Set<DeviceId> managedDevices = mastershipService.getDevicesOf(local); 755 Set<DeviceId> managedDevices = mastershipService.getDevicesOf(local);
740 // Only process those devices are that not managed by the local node. 756 // Only process those devices are that not managed by the local node.
741 Maps.filterKeys(flowTables, deviceId -> !managedDevices.contains(deviceId)) 757 Maps.filterKeys(flowTables, deviceId -> !managedDevices.contains(deviceId))
......