Ayaka Koshibe

fixes in mastership reelection for single-node failure

Change-Id: Iedcab52bb156643464a97435fcc39c5db7393976
......@@ -4,6 +4,7 @@ import static com.google.common.base.Preconditions.checkNotNull;
import static org.slf4j.LoggerFactory.getLogger;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
......@@ -14,6 +15,7 @@ import org.apache.felix.scr.annotations.Service;
import org.onlab.onos.cluster.ClusterEvent;
import org.onlab.onos.cluster.ClusterEventListener;
import org.onlab.onos.cluster.ClusterService;
import org.onlab.onos.cluster.ControllerNode;
import org.onlab.onos.cluster.MastershipAdminService;
import org.onlab.onos.cluster.MastershipEvent;
import org.onlab.onos.cluster.MastershipListener;
......@@ -164,21 +166,68 @@ implements MastershipService, MastershipAdminService {
/**
 * Callback for reacting to cluster membership events.
 *
 * <p>Tracks a locally observed cluster size (incremented on add/activate,
 * decremented on remove/deactivate of a remote node) and uses it together
 * with {@code clusterService.getNodes()} to decide whether this node is on
 * the majority side before relinquishing device roles.
 */
private class InternalClusterEventListener implements ClusterEventListener {

    // A notion of a local maximum cluster size, used to tie-break.
    // Think of a better way to do this.
    private final AtomicInteger clusterSize;

    InternalClusterEventListener() {
        clusterSize = new AtomicInteger(0);
    }

    /**
     * Handles a cluster event.
     *
     * <ul>
     * <li>added/activated: bumps the local cluster size and logs.</li>
     * <li>removed/deactivated, subject is the local node: relinquishes the
     *     local node's devices if we are not in the majority, then returns
     *     without touching the cluster size.</li>
     * <li>removed/deactivated, subject is another node: relinquishes that
     *     node's devices on its behalf if we are in the majority, then
     *     decrements the cluster size.</li>
     * </ul>
     *
     * @param event the cluster event to react to
     */
    @Override
    public void event(ClusterEvent event) {
        switch (event.type()) {
            //FIXME: worry about addition when the time comes
            case INSTANCE_ADDED:
            case INSTANCE_ACTIVATED:
                // No early break here: the size bookkeeping below must run.
                clusterSize.incrementAndGet();
                log.info("instance {} added/activated", event.subject());
                break;
            case INSTANCE_REMOVED:
            case INSTANCE_DEACTIVATED:
                ControllerNode node = event.subject();

                if (node.equals(clusterService.getLocalNode())) {
                    // If we are in the smaller cluster, relinquish and return.
                    // Own DeviceManager should catch event and tell switch.
                    if (!isInMajority()) {
                        relinquishDevicesOf(node);
                    }
                    log.info("broke off from cluster, relinquished devices");
                    break;
                }

                // If we are the larger one and the removed node(s) are brain dead,
                // force relinquish on behalf of the disabled node.
                // Check network channel to do this?
                // Some things to check:
                //  1. we didn't break off as well while we're at it
                //  2. others don't pile in and try too - maybe a lock
                if (isInMajority()) {
                    relinquishDevicesOf(node);
                }
                // Majority is evaluated against the pre-removal size, so
                // only shrink the counter after the decision was made.
                clusterSize.decrementAndGet();
                log.info("instance {} removed/deactivated", event.subject());
                break;
            default:
                log.warn("unknown cluster event {}", event);
        }
    }

    /**
     * Relinquishes, in the store, the role the given node holds on each of
     * its devices.
     *
     * @param node the node whose devices are released
     */
    private void relinquishDevicesOf(ControllerNode node) {
        for (DeviceId device : getDevicesOf(node.id())) {
            store.relinquishRole(node.id(), device);
        }
    }

    /**
     * Tells whether the currently visible node set is strictly larger than
     * half of the locally tracked cluster size. An exact half is treated as
     * "not in majority".
     *
     * @return true if the visible node count exceeds half the tracked size
     */
    private boolean isInMajority() {
        //FIXME: break tie for equal-sized clusters, can we use hz's functions?
        return clusterService.getNodes().size() > (clusterSize.get() / 2);
    }
}
public class InternalDelegate implements MastershipStoreDelegate {
......
......@@ -26,6 +26,7 @@ import org.onlab.onos.net.DeviceId;
import org.onlab.onos.net.MastershipRole;
import org.onlab.onos.net.Port;
import org.onlab.onos.net.PortNumber;
import org.onlab.onos.net.device.DefaultDeviceDescription;
import org.onlab.onos.net.device.DeviceAdminService;
import org.onlab.onos.net.device.DeviceDescription;
import org.onlab.onos.net.device.DeviceEvent;
......@@ -257,12 +258,12 @@ public class DeviceManager
// temporarily request for Master Role and mark offline.
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
log.debug("Device {} disconnected, but I am not the master", deviceId);
//let go of any role anyways
//let go of ability to be backup
mastershipService.relinquishMastership(deviceId);
return;
}
DeviceEvent event = store.markOffline(deviceId);
//we're no longer capable of being master or a candidate.
//relinquish master role and ability to be backup.
mastershipService.relinquishMastership(deviceId);
if (event != null) {
......@@ -325,23 +326,31 @@ public class DeviceManager
@Override
public void event(MastershipEvent event) {
final DeviceId did = event.subject();
if (isAvailable(did)) {
final NodeId myNodeId = clusterService.getLocalNode().id();
if (myNodeId.equals(event.master())) {
MastershipTerm term = termService.getMastershipTerm(did);
if (term.master().equals(myNodeId)) {
// only set the new term if I am the master
clockProviderService.setMastershipTerm(did, term);
}
applyRole(did, MastershipRole.MASTER);
} else {
applyRole(did, MastershipRole.STANDBY);
final NodeId myNodeId = clusterService.getLocalNode().id();
if (myNodeId.equals(event.master())) {
MastershipTerm term = termService.getMastershipTerm(did);
if (term.master().equals(myNodeId)) {
// only set the new term if I am the master
clockProviderService.setMastershipTerm(did, term);
}
// FIXME: we should check that the device is connected on our end.
// currently, this is not straight forward as the actual switch
// implementation is hidden from the registry.
if (!isAvailable(did)) {
//flag the device as online. Is there a better way to do this?
Device device = getDevice(did);
store.createOrUpdateDevice(device.providerId(), did,
new DefaultDeviceDescription(
did.uri(), device.type(), device.manufacturer(),
device.hwVersion(), device.swVersion(),
device.serialNumber()));
}
applyRole(did, MastershipRole.MASTER);
} else {
//device dead to node, give up
mastershipService.relinquishMastership(did);
applyRole(did, MastershipRole.STANDBY);
}
}
......
......@@ -18,6 +18,8 @@ import org.onlab.onos.net.DeviceId;
import org.onlab.onos.store.trivial.impl.SimpleMastershipStore;
import org.onlab.packet.IpPrefix;
import com.google.common.collect.Sets;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.onlab.onos.net.MastershipRole.*;
......@@ -143,7 +145,7 @@ public class MastershipManagerTest {
/**
 * Returns the set of nodes in the cluster; always a fresh, empty set here.
 *
 * <p>An empty set rather than {@code null} is returned so that callers
 * which immediately query the result (for example {@code getNodes().size()})
 * do not fail with a {@link NullPointerException}.
 *
 * @return a new empty, mutable set
 */
@Override
public Set<ControllerNode> getNodes() {
    return Sets.newHashSet();
}
@Override
......