diff --git a/tests/vtep-ctl.at b/tests/vtep-ctl.at index 99e97e84a..f0511ad71 100644 --- a/tests/vtep-ctl.at +++ b/tests/vtep-ctl.at @@ -318,6 +318,38 @@ CHECK_LSWITCHES([a]) VTEP_CTL_CLEANUP AT_CLEANUP +AT_SETUP([add-ls a, get-replication-mode a]) +AT_KEYWORDS([vtep-ctl]) +VTEP_CTL_SETUP +AT_CHECK([RUN_VTEP_CTL( + [add-ls a], + [get-replication-mode a])], + [0], [[(null)] +], [], [VTEP_CTL_CLEANUP]) +VTEP_CTL_CLEANUP +AT_CLEANUP + +AT_SETUP([add-ls a, set-replication-mode a source_node]) +AT_KEYWORDS([vtep-ctl]) +VTEP_CTL_SETUP +AT_CHECK([RUN_VTEP_CTL( + [add-ls a],[set-replication-mode a source_node], + [get-replication-mode a])], + [0], [source_node +], [], [VTEP_CTL_CLEANUP]) +VTEP_CTL_CLEANUP +AT_CLEANUP + +AT_SETUP([add-ls a, set-replication-mode a service_node]) +AT_KEYWORDS([vtep-ctl]) +VTEP_CTL_SETUP +AT_CHECK([RUN_VTEP_CTL( + [add-ls a],[set-replication-mode a service_node], + [get-replication-mode a])], + [0], [service_node +], [], [VTEP_CTL_CLEANUP]) +VTEP_CTL_CLEANUP +AT_CLEANUP dnl ---------------------------------------------------------------------- AT_BANNER([vtep-ctl unit tests -- logical binding tests]) diff --git a/vtep/README.ovs-vtep.md b/vtep/README.ovs-vtep.md index 6734dabfc..13d4e1d36 100644 --- a/vtep/README.ovs-vtep.md +++ b/vtep/README.ovs-vtep.md @@ -166,13 +166,39 @@ vtep-ctl bind-ls br0 p0 0 ls0 vtep-ctl set Logical_Switch ls0 tunnel_key=33 ``` -3. Direct unknown destinations out a tunnel: +3. Direct unknown destinations out a tunnel. + + For handling L2 broadcast, multicast and unknown unicast traffic, + packets can be sent to all members of a logical switch referenced by + a physical switch. The "unknown-dst" address below is used to + represent these packets. There are different modes to replicate the + packets. 
The default mode of replication is to send the traffic to a + service node, which can be a hypervisor, server or appliance, and let + the service node handle replication to other transport nodes + (hypervisors or other VTEP physical switches). This mode is called + _service node_ replication. An alternate mode of replication, called + _source node_ replication, involves the source node sending to all + other transport nodes. Hypervisors are always responsible for doing + their own replication for locally attached VMs in both modes. + Service node mode is the default. Service node replication mode is + considered a basic requirement because it only requires sending the + packet to a single transport node. The following configuration is + for service node replication mode as only a single transport node + destination is specified for the unknown-dst address: ``` vtep-ctl add-mcast-remote ls0 unknown-dst 10.2.2.2 ``` -4. Direct unicast destinations out a different tunnel: +4. Optionally, change the replication mode from a default of +"service\_node" to "source\_node", which can be done at the logical +switch level: + + ``` +vtep-ctl set-replication-mode ls0 source_node + ``` + +5. Direct unicast destinations out a different tunnel: ``` vtep-ctl add-ucast-remote ls0 00:11:22:33:44:55 10.2.2.3 diff --git a/vtep/ovs-vtep b/vtep/ovs-vtep index 31ff15928..a8ffb668a 100755 --- a/vtep/ovs-vtep +++ b/vtep/ovs-vtep @@ -94,6 +94,7 @@ class Logical_Switch(object): self.unknown_dsts = set() self.tunnel_key = 0 self.setup_ls() + self.replication_mode = "service_node" def __del__(self): vlog.info("destroying lswitch %s" % self.name) @@ -141,13 +142,17 @@ class Logical_Switch(object): ovs_ofctl("add-flow %s table=1,priority=1,in_port=%s,action=%s" % (self.short_name, port_no, ",".join(flood_ports))) - # Traffic coming from a VTEP physical port should only be flooded to - # one 'unknown-dst' and to all other physical ports that belong to that - # VTEP device and this logical switch. 
+ # Traffic coming from a VTEP physical port should always be flooded to + # all the other physical ports that belong to that VTEP device and + # this logical switch. If the replication mode is service node then + # send to one unknown_dst node (the first one here); else we assume the + # replication mode is source node and we send the packet to all + # unknown_dst nodes. for tunnel in self.unknown_dsts: port_no = self.tunnels[tunnel][0] flood_ports.append(port_no) - break + if self.replication_mode == "service_node": + break ovs_ofctl("add-flow %s table=1,priority=0,action=%s" % (self.short_name, ",".join(flood_ports))) @@ -293,8 +298,30 @@ class Logical_Switch(object): self.remote_macs = remote_macs + replication_mode = vtep_ctl("get logical_switch %s replication_mode" + % self.name) + + # Replication mode is an optional column and if it is not set, + # replication mode defaults to service_node. + if replication_mode == "[]": + replication_mode = "service_node" + + # If the logical switch level replication mode has changed then + # update to that value. + update_flood_set = False + if replication_mode != self.replication_mode: + self.replication_mode = replication_mode + vlog.info("%s replication mode changed to %s" % + (self.name, self.replication_mode)) + update_flood_set = True + if (self.unknown_dsts != unknown_dsts): self.unknown_dsts = unknown_dsts + update_flood_set = True + + # If either the replication mode has changed or the unknown + # destinations set has changed, update the flooding decision. + if update_flood_set is True: self.update_flood() def update_stats(self): diff --git a/vtep/vtep-ctl.8.in b/vtep/vtep-ctl.8.in index 129c7ed63..e0f49a78d 100644 --- a/vtep/vtep-ctl.8.in +++ b/vtep/vtep-ctl.8.in @@ -195,6 +195,32 @@ combination on the physical switch \fIpswitch\fR. List the logical switch bindings for \fIport\fR on the physical switch \fIpswitch\fR. . 
+.IP "\fBset\-replication\-mode \fIlswitch replication\-mode\fR" +Set logical switch \fIlswitch\fR replication mode to +\fIreplication\-mode\fR; the only valid values for replication mode +are "service_node" and "source_node". +. +For handling L2 broadcast, multicast and unknown unicast traffic, +packets can be sent to all members of a logical switch referenced by +a physical switch. There are different modes to replicate the +packets. The default mode of replication is to send the traffic to +a service node, which can be a hypervisor, server or appliance, and +let the service node handle replication to other transport nodes +(hypervisors or other VTEP physical switches). This mode is called +service node replication. An alternate mode of replication, called +source node replication, involves the source node sending to all +other transport nodes. Hypervisors are always responsible for doing +their own replication for locally attached VMs in both modes. +Service node mode is the default, if the replication mode is not +explicitly set. Service node replication mode is considered a basic +requirement because it only requires sending the packet to a single +transport node. +. +.IP "\fBget\-replication\-mode \fIlswitch\fR" +Get logical switch \fIlswitch\fR replication mode. The only valid values +for replication mode are "service_node" and "source_node". An empty reply +for replication mode implies a default of "service_node". +. .SS "Logical Router Commands" These commands examine and manipulate logical routers. . 
diff --git a/vtep/vtep-ctl.c b/vtep/vtep-ctl.c index 29d9a1773..5c189712c 100644 --- a/vtep/vtep-ctl.c +++ b/vtep/vtep-ctl.c @@ -335,6 +335,8 @@ Logical Switch commands:\n\ bind-ls PS PORT VLAN LS bind LS to VLAN on PORT\n\ unbind-ls PS PORT VLAN unbind logical switch on VLAN from PORT\n\ list-bindings PS PORT list bindings for PORT on PS\n\ + set-replication-mode LS MODE set replication mode on LS\n\ + get-replication-mode LS get replication mode on LS\n\ \n\ Logical Router commands:\n\ add-lr LR create a new logical router named LR\n\ @@ -851,6 +853,8 @@ pre_get_info(struct ctl_context *ctx) ovsdb_idl_add_column(ctx->idl, &vteprec_physical_port_col_vlan_bindings); ovsdb_idl_add_column(ctx->idl, &vteprec_logical_switch_col_name); + ovsdb_idl_add_column(ctx->idl, + &vteprec_logical_switch_col_replication_mode); ovsdb_idl_add_column(ctx->idl, &vteprec_logical_router_col_name); @@ -1523,6 +1527,39 @@ cmd_unbind_ls(struct ctl_context *ctx) vtep_ctl_context_invalidate_cache(ctx); } +static void +cmd_set_replication_mode(struct ctl_context *ctx) +{ + struct vtep_ctl_context *vtepctl_ctx = vtep_ctl_context_cast(ctx); + struct vtep_ctl_lswitch *ls; + const char *ls_name = ctx->argv[1]; + + vtep_ctl_context_populate_cache(ctx); + + if (strcmp(ctx->argv[2], "service_node") && + strcmp(ctx->argv[2], "source_node")) { + ctl_fatal("Replication mode must be 'service_node' or 'source_node'"); + } + + ls = find_lswitch(vtepctl_ctx, ls_name, true); + vteprec_logical_switch_set_replication_mode(ls->ls_cfg, ctx->argv[2]); + + vtep_ctl_context_invalidate_cache(ctx); +} + +static void +cmd_get_replication_mode(struct ctl_context *ctx) +{ + struct vtep_ctl_context *vtepctl_ctx = vtep_ctl_context_cast(ctx); + struct vtep_ctl_lswitch *ls; + const char *ls_name = ctx->argv[1]; + + vtep_ctl_context_populate_cache(ctx); + + ls = find_lswitch(vtepctl_ctx, ls_name, true); + ds_put_format(&ctx->output, "%s\n", ls->ls_cfg->replication_mode); +} + static struct vtep_ctl_lrouter * 
find_lrouter(struct vtep_ctl_context *vtepctl_ctx, const char *name, bool must_exist) @@ -2459,6 +2496,10 @@ static const struct ctl_command_syntax vtep_commands[] = { {"list-bindings", 2, 2, NULL, pre_get_info, cmd_list_bindings, NULL, "", RO}, {"bind-ls", 4, 4, NULL, pre_get_info, cmd_bind_ls, NULL, "", RO}, {"unbind-ls", 3, 3, NULL, pre_get_info, cmd_unbind_ls, NULL, "", RO}, + {"set-replication-mode", 2, 2, "LS MODE", pre_get_info, + cmd_set_replication_mode, NULL, "", RW}, + {"get-replication-mode", 1, 1, "LS", pre_get_info, + cmd_get_replication_mode, NULL, "", RO}, /* Logical Router commands. */ {"add-lr", 1, 1, NULL, pre_get_info, cmd_add_lr, NULL, "--may-exist", RW}, diff --git a/vtep/vtep.ovsschema b/vtep/vtep.ovsschema index 533fd2e57..e409d8d56 100644 --- a/vtep/vtep.ovsschema +++ b/vtep/vtep.ovsschema @@ -1,6 +1,6 @@ { "name": "hardware_vtep", - "cksum": "770244945 11113", + "cksum": "4127261095 11302", "tables": { "Global": { "columns": { @@ -96,6 +96,11 @@ "name": {"type": "string"}, "description": {"type": "string"}, "tunnel_key": {"type": {"key": "integer", "min": 0, "max": 1}}, + "replication_mode": { + "type": { + "key": { + "enum": ["set", ["service_node", "source_node"]], + "type": "string"},"min": 0, "max": 1}}, "other_config": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}, @@ -296,4 +301,4 @@ "ephemeral": true}}, "indexes": [["target"]], "isRoot": false}}, - "version": "1.5.1"} + "version": "1.6.0"} diff --git a/vtep/vtep.xml b/vtep/vtep.xml index a3a69885e..ad014ab5d 100644 --- a/vtep/vtep.xml +++ b/vtep/vtep.xml @@ -357,6 +357,28 @@ Indicates that an error has occurred in the switch but that no more specific information is available. + + + Indicates that the requested source node replication mode cannot be + supported by the physical switch; this specifically means in this + context that the physical switch lacks the capability to support + source node replication mode. 
This error occurs when a controller + attempts to set source node replication mode for one of the logical + switches that the physical switch is keeping context for. An NVC + that observes this error should take appropriate action (for example + reverting the logical switch to service node replication mode). + It is recommended that an NVC be proactive and test for support of + source node replication by using a test logical switch on vtep + physical switch nodes and then trying to change the replication mode + to source node on this logical switch, checking for error. The NVC + could remember this capability per vtep physical switch. Using + mixed replication modes on a given logical switch is not recommended. + Service node replication mode is considered a basic requirement + since it only requires sending a packet to a single transport node, + hence it is not expected that a switch should report that service + node mode cannot be supported. + @@ -754,6 +776,35 @@ + +

+ For handling L2 broadcast, multicast and unknown unicast traffic, + packets can be sent to all members of a logical switch referenced by + a physical switch. There are different modes to replicate the + packets. The default mode of replication is to send the traffic to + a service node, which can be a hypervisor, server or appliance, and + let the service node handle replication to other transport nodes + (hypervisors or other VTEP physical switches). This mode is called + service node replication. An alternate mode of replication, called + source node replication, involves the source node sending to all + other transport nodes. Hypervisors are always responsible for doing + their own replication for locally attached VMs in both modes. + Service node replication mode is the default and considered a + basic requirement because it only requires sending the packet to + a single transport node. + 

+ + +

+ This optional column defines the replication mode per + logical switch. There are 2 valid values, + service_node and source_node. If the + column is not set, the replication mode defaults to service_node. + 

+ +
+
+ Symbolic name for the logical switch. @@ -887,8 +938,8 @@ Multicast packet replication may be handled by a service node, in which case the physical locators will be IP addresses of service nodes. If the VTEP supports replication onto multiple - tunnels, then this may be used to replicate directly onto - VTEP-hypervisor tunnels. + tunnels, using source node replication, then this may be used to + replicate directly onto VTEP-hypervisor or VTEP-VTEP tunnels.

@@ -911,9 +962,14 @@ The physical locator set to be used to reach this MAC address. In - this table, the physical locator set will be either a service node IP - address or a set of tunnel IP addresses of hypervisors (and - potentially other VTEPs). + this table, the physical locator set will be either a set of service + nodes when service node replication is used or the set of transport + nodes (defined as hypervisors or VTEPs) participating in the associated + logical switch, when source node replication is used. When service node + replication is used, the VTEP should send packets to one member of the + locator set that is known to be healthy and reachable, which could be + determined by BFD. When source node replication is used, the VTEP + should send packets to all members of the locator set.