Ayaka Koshibe

Fixes in mastership reelection for single-node failure

Change-Id: Iedcab52bb156643464a97435fcc39c5db7393976
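
At its core, this change adds a simple partition-size test: a node keeps (or takes over) mastership of its devices only when the nodes it can still reach form a strict majority of the last known cluster size; otherwise it relinquishes them. A minimal standalone sketch of that test follows; the names are illustrative stand-ins for the clusterSize counter and the clusterService.getNodes() call introduced below, not the ONOS API.

// Standalone sketch of the majority test this change introduces.
// "reachableNodes" stands in for clusterService.getNodes().size(),
// "knownClusterSize" for the listener's clusterSize counter.
public final class MajorityCheckSketch {

    static boolean isInMajority(int reachableNodes, int knownClusterSize) {
        // strict majority: more than half of the last known cluster size
        return reachableNodes > knownClusterSize / 2;
    }

    public static void main(String[] args) {
        System.out.println(isInMajority(2, 3)); // true: 2 of 3 nodes reachable
        System.out.println(isInMajority(1, 3)); // false: isolated node in a 3-node cluster
        System.out.println(isInMajority(1, 2)); // false: a 1/1 split is a tie, left unresolved (see FIXME below)
    }
}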
@@ -4,6 +4,7 @@ import static com.google.common.base.Preconditions.checkNotNull;
 import static org.slf4j.LoggerFactory.getLogger;
 
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -14,6 +15,7 @@ import org.apache.felix.scr.annotations.Service;
 import org.onlab.onos.cluster.ClusterEvent;
 import org.onlab.onos.cluster.ClusterEventListener;
 import org.onlab.onos.cluster.ClusterService;
+import org.onlab.onos.cluster.ControllerNode;
 import org.onlab.onos.cluster.MastershipAdminService;
 import org.onlab.onos.cluster.MastershipEvent;
 import org.onlab.onos.cluster.MastershipListener;
@@ -164,21 +166,68 @@ implements MastershipService, MastershipAdminService {
     //callback for reacting to cluster events
     private class InternalClusterEventListener implements ClusterEventListener {
 
+        // A notion of a local maximum cluster size, used to tie-break.
+        // TODO: think of a better way to do this.
+        private AtomicInteger clusterSize;
+
+        InternalClusterEventListener() {
+            clusterSize = new AtomicInteger(0);
+        }
+
         @Override
         public void event(ClusterEvent event) {
             switch (event.type()) {
                 //FIXME: worry about addition when the time comes
                 case INSTANCE_ADDED:
                 case INSTANCE_ACTIVATED:
-                    break;
+                    clusterSize.incrementAndGet();
+                    log.info("instance {} added/activated", event.subject());
+                    break;
                 case INSTANCE_REMOVED:
                 case INSTANCE_DEACTIVATED:
+                    ControllerNode node = event.subject();
+
+                    if (node.equals(clusterService.getLocalNode())) {
+                        // if we are in the smaller cluster, relinquish and return
+                        for (DeviceId device : getDevicesOf(node.id())) {
+                            if (!isInMajority()) {
+                                // our own DeviceManager should catch the event and tell the switch
+                                store.relinquishRole(node.id(), device);
+                            }
+                        }
+                        log.info("broke off from cluster, relinquished devices");
+                        break;
+                    }
+
+                    // if we are in the larger partition and the removed node(s) are brain-dead,
+                    // force relinquishment on behalf of the disabled node.
+                    // check the network channel to do this?
+                    for (DeviceId device : getDevicesOf(node.id())) {
+                        // some things to check:
+                        // 1. we didn't break off as well while we're at it
+                        // 2. others don't pile in and try too - maybe a lock
+                        if (isInMajority()) {
+                            store.relinquishRole(node.id(), device);
+                        }
+                    }
+                    clusterSize.decrementAndGet();
+                    log.info("instance {} removed/deactivated", event.subject());
                     break;
                 default:
                     log.warn("unknown cluster event {}", event);
             }
         }
 
+        private boolean isInMajority() {
+            if (clusterService.getNodes().size() > (clusterSize.intValue() / 2)) {
+                return true;
+            }
+            //else {
+            //FIXME: break the tie for equal-sized clusters, can we use Hazelcast's functions?
+            // }
+            return false;
+        }
+
     }
 
     public class InternalDelegate implements MastershipStoreDelegate {
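
The FIXME in isInMajority() leaves equal-sized partitions unresolved; with the integer division above, a 1/1 split in a two-node cluster makes both sides report a minority. One common deterministic tie-breaker, sketched below purely as an illustration (this is not what the patch does, and it is not an ONOS or Hazelcast API), is to let the partition that can still reach the lowest node id win.

import java.util.Set;
import java.util.TreeSet;

// Illustrative only: break a tie between equal-sized partitions by letting
// the side that still reaches the globally smallest node id keep mastership.
final class TieBreakSketch {

    static boolean winsTie(Set<String> reachableNodeIds, Set<String> allKnownNodeIds) {
        // smallest id across the whole (last known) cluster membership
        String smallest = new TreeSet<>(allKnownNodeIds).first();
        return reachableNodeIds.contains(smallest);
    }

    public static void main(String[] args) {
        Set<String> all = Set.of("node-1", "node-2");
        System.out.println(winsTie(Set.of("node-1"), all)); // true: this side keeps its devices
        System.out.println(winsTie(Set.of("node-2"), all)); // false: this side relinquishes
    }
}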
@@ -26,6 +26,7 @@ import org.onlab.onos.net.DeviceId;
 import org.onlab.onos.net.MastershipRole;
 import org.onlab.onos.net.Port;
 import org.onlab.onos.net.PortNumber;
+import org.onlab.onos.net.device.DefaultDeviceDescription;
 import org.onlab.onos.net.device.DeviceAdminService;
 import org.onlab.onos.net.device.DeviceDescription;
 import org.onlab.onos.net.device.DeviceEvent;
@@ -257,12 +258,12 @@ public class DeviceManager
         // temporarily request for Master Role and mark offline.
         if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
             log.debug("Device {} disconnected, but I am not the master", deviceId);
-            //let go of any role anyways
+            //let go of ability to be backup
             mastershipService.relinquishMastership(deviceId);
             return;
         }
         DeviceEvent event = store.markOffline(deviceId);
-        //we're no longer capable of being master or a candidate.
+        //relinquish master role and ability to be backup.
         mastershipService.relinquishMastership(deviceId);
 
         if (event != null) {
@@ -325,23 +326,31 @@ public class DeviceManager
         @Override
         public void event(MastershipEvent event) {
             final DeviceId did = event.subject();
-            if (isAvailable(did)) {
-                final NodeId myNodeId = clusterService.getLocalNode().id();
-
-                if (myNodeId.equals(event.master())) {
-                    MastershipTerm term = termService.getMastershipTerm(did);
-
-                    if (term.master().equals(myNodeId)) {
-                        // only set the new term if I am the master
-                        clockProviderService.setMastershipTerm(did, term);
-                    }
-                    applyRole(did, MastershipRole.MASTER);
-                } else {
-                    applyRole(did, MastershipRole.STANDBY);
+            final NodeId myNodeId = clusterService.getLocalNode().id();
+
+            if (myNodeId.equals(event.master())) {
+                MastershipTerm term = termService.getMastershipTerm(did);
+
+                if (term.master().equals(myNodeId)) {
+                    // only set the new term if I am the master
+                    clockProviderService.setMastershipTerm(did, term);
                 }
+
+                // FIXME: we should check that the device is connected on our end.
+                // Currently this is not straightforward, as the actual switch
+                // implementation is hidden from the registry.
+                if (!isAvailable(did)) {
+                    // flag the device as online. Is there a better way to do this?
+                    Device device = getDevice(did);
+                    store.createOrUpdateDevice(device.providerId(), did,
+                            new DefaultDeviceDescription(
+                                    did.uri(), device.type(), device.manufacturer(),
+                                    device.hwVersion(), device.swVersion(),
+                                    device.serialNumber()));
+                }
+
+                applyRole(did, MastershipRole.MASTER);
             } else {
-                //device dead to node, give up
-                mastershipService.relinquishMastership(did);
                 applyRole(did, MastershipRole.STANDBY);
             }
         }
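
In the DeviceManager hunk above, a newly elected master can find the device still marked unavailable in the store, because the previous master marked it offline on disconnect. The patch re-submits a DefaultDeviceDescription built from the device's own current attributes, which the store treats as the device coming back online (the in-line comment asks whether there is a better way). A standalone sketch of that copy-and-resubmit pattern, using plain Java stand-ins rather than the ONOS types:

import java.util.HashMap;
import java.util.Map;

// Illustrative only: re-announce a known device by replaying its own
// attributes as a fresh description, mirroring the createOrUpdateDevice()
// call in the patch.
final class ReannounceSketch {

    // stand-ins for the stored description and its availability flag
    record Description(String uri, String type, String manufacturer) { }
    record Entry(Description description, boolean online) { }

    private final Map<String, Entry> store = new HashMap<>();

    void createOrUpdate(Description desc) {
        // creating or updating always flags the device as online again
        store.put(desc.uri(), new Entry(desc, true));
    }

    void markOffline(String uri) {
        store.put(uri, new Entry(store.get(uri).description(), false));
    }

    boolean isAvailable(String uri) {
        Entry e = store.get(uri);
        return e != null && e.online();
    }

    public static void main(String[] args) {
        ReannounceSketch s = new ReannounceSketch();
        Description d = new Description("of:0000000000000001", "SWITCH", "acme");
        s.createOrUpdate(d);
        s.markOffline(d.uri());
        // the new master sees the device as unavailable and replays its description
        if (!s.isAvailable(d.uri())) {
            s.createOrUpdate(new Description(d.uri(), d.type(), d.manufacturer()));
        }
        System.out.println(s.isAvailable(d.uri())); // true
    }
}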
@@ -18,6 +18,8 @@ import org.onlab.onos.net.DeviceId;
 import org.onlab.onos.store.trivial.impl.SimpleMastershipStore;
 import org.onlab.packet.IpPrefix;
 
+import com.google.common.collect.Sets;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import static org.onlab.onos.net.MastershipRole.*;
@@ -143,7 +145,7 @@ public class MastershipManagerTest {
 
         @Override
         public Set<ControllerNode> getNodes() {
-            return null;
+            return Sets.newHashSet();
         }
 
         @Override
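
The test-side change presumably exists because the new isInMajority() path calls clusterService.getNodes().size(), and a mock that returns null there would throw. A trivial illustration with hypothetical names, not the actual test:

import java.util.Collections;
import java.util.Set;

// Illustrative only: why a mocked getNodes() should return an empty set
// rather than null.
final class EmptyVsNullSketch {

    static int nodeCount(Set<String> nodes) {
        return nodes.size(); // NullPointerException when nodes == null
    }

    public static void main(String[] args) {
        System.out.println(nodeCount(Collections.emptySet())); // 0
        // nodeCount(null) would throw a NullPointerException
    }
}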