From d2e1446d81725c351dc73a03b397ce043fb18452 Mon Sep 17 00:00:00 2001 From: Meizu OpenSource Date: Mon, 15 Aug 2016 10:19:42 +0800 Subject: first commit --- net/openvswitch/Kconfig | 28 + net/openvswitch/Makefile | 14 + net/openvswitch/actions.c | 537 +++++++++ net/openvswitch/datapath.c | 2151 ++++++++++++++++++++++++++++++++++ net/openvswitch/datapath.h | 182 +++ net/openvswitch/dp_notify.c | 100 ++ net/openvswitch/flow.c | 1357 +++++++++++++++++++++ net/openvswitch/flow.h | 183 +++ net/openvswitch/vport-internal_dev.c | 249 ++++ net/openvswitch/vport-internal_dev.h | 28 + net/openvswitch/vport-netdev.c | 204 ++++ net/openvswitch/vport-netdev.h | 44 + net/openvswitch/vport.c | 399 +++++++ net/openvswitch/vport.h | 197 ++++ 14 files changed, 5673 insertions(+) create mode 100644 net/openvswitch/Kconfig create mode 100644 net/openvswitch/Makefile create mode 100644 net/openvswitch/actions.c create mode 100644 net/openvswitch/datapath.c create mode 100644 net/openvswitch/datapath.h create mode 100644 net/openvswitch/dp_notify.c create mode 100644 net/openvswitch/flow.c create mode 100644 net/openvswitch/flow.h create mode 100644 net/openvswitch/vport-internal_dev.c create mode 100644 net/openvswitch/vport-internal_dev.h create mode 100644 net/openvswitch/vport-netdev.c create mode 100644 net/openvswitch/vport-netdev.h create mode 100644 net/openvswitch/vport.c create mode 100644 net/openvswitch/vport.h (limited to 'net/openvswitch') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig new file mode 100644 index 000000000..d9ea33c36 --- /dev/null +++ b/net/openvswitch/Kconfig @@ -0,0 +1,28 @@ +# +# Open vSwitch +# + +config OPENVSWITCH + tristate "Open vSwitch" + ---help--- + Open vSwitch is a multilayer Ethernet switch targeted at virtualized + environments. In addition to supporting a variety of features + expected in a traditional hardware switch, it enables fine-grained + programmatic extension and flow-based control of the network. This + control is useful in a wide variety of applications but is + particularly important in multi-server virtualization deployments, + which are often characterized by highly dynamic endpoints and the + need to maintain logical abstractions for multiple tenants. + + The Open vSwitch datapath provides an in-kernel fast path for packet + forwarding. It is complemented by a userspace daemon, ovs-vswitchd, + which is able to accept configuration from a variety of sources and + translate it into packet processing rules. + + See http://openvswitch.org for more information and userspace + utilities. + + To compile this code as a module, choose M here: the module will be + called openvswitch. + + If unsure, say N. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile new file mode 100644 index 000000000..15e738474 --- /dev/null +++ b/net/openvswitch/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for Open vSwitch. +# + +obj-$(CONFIG_OPENVSWITCH) += openvswitch.o + +openvswitch-y := \ + actions.o \ + datapath.o \ + dp_notify.o \ + flow.o \ + vport.o \ + vport-internal_dev.o \ + vport-netdev.o \ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c new file mode 100644 index 000000000..c4779ca59 --- /dev/null +++ b/net/openvswitch/actions.c @@ -0,0 +1,537 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr, int len, bool keep_skb); + +static int make_writable(struct sk_buff *skb, int write_len) +{ + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) + return 0; + + return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/* remove VLAN header from packet and update csum accordingly. */ +static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) +{ + struct vlan_hdr *vhdr; + int err; + + err = make_writable(skb, VLAN_ETH_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, csum_partial(skb->data + + (2 * ETH_ALEN), VLAN_HLEN, 0)); + + vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + *current_tci = vhdr->h_vlan_TCI; + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + __skb_pull(skb, VLAN_HLEN); + + vlan_set_encap_proto(skb, vhdr); + skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); + skb_reset_mac_len(skb); + + return 0; +} + +static int pop_vlan(struct sk_buff *skb) +{ + __be16 tci; + int err; + + if (likely(vlan_tx_tag_present(skb))) { + skb->vlan_tci = 0; + } else { + if (unlikely(skb->protocol != htons(ETH_P_8021Q) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __pop_vlan_tci(skb, &tci); + if (err) + return err; + } + /* move next vlan tag to hw accel tag */ + if (likely(skb->protocol != htons(ETH_P_8021Q) || + skb->len < VLAN_ETH_HLEN)) + return 0; + + err = __pop_vlan_tci(skb, &tci); + if (unlikely(err)) + return err; + + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); + return 0; +} + +static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) +{ + if (unlikely(vlan_tx_tag_present(skb))) { + u16 current_tag; + + /* push down current VLAN tag */ + current_tag = vlan_tx_tag_get(skb); + + if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) + return -ENOMEM; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(skb->data + + (2 * ETH_ALEN), VLAN_HLEN, 0)); + + } + __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + return 0; +} + +static int set_eth_addr(struct sk_buff *skb, + const struct ovs_key_ethernet *eth_key) +{ + int err; + err = make_writable(skb, ETH_HLEN); + if (unlikely(err)) + return err; + + memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN); + memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN); + + return 0; +} + +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (nh->protocol == IPPROTO_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, + *addr, new_addr, 1); + } else if (nh->protocol == IPPROTO_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&uh->check, skb, + *addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } + + csum_replace4(&nh->check, *addr, new_addr); + skb->rxhash = 0; + *addr = new_addr; +} + +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4]) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (l4_proto == IPPROTO_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, + addr, new_addr, 1); + } else if (l4_proto == IPPROTO_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&uh->check, skb, + addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } +} + +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4], + bool recalculate_csum) +{ + if (recalculate_csum) + update_ipv6_checksum(skb, l4_proto, addr, new_addr); + + skb->rxhash = 0; + memcpy(addr, new_addr, sizeof(__be32[4])); +} + +static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) +{ + nh->priority = tc >> 4; + nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); +} + +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) +{ + nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; + nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; + nh->flow_lbl[2] = fl & 0x000000FF; +} + +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) +{ + csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); + nh->ttl = new_ttl; +} + +static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) +{ + struct iphdr *nh; + int err; + + err = make_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + + if (ipv4_key->ipv4_src != nh->saddr) + set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); + + if (ipv4_key->ipv4_dst != nh->daddr) + set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); + + if (ipv4_key->ipv4_tos != nh->tos) + ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos); + + if (ipv4_key->ipv4_ttl != nh->ttl) + set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); + + return 0; +} + +static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) +{ + struct ipv6hdr *nh; + int err; + __be32 *saddr; + __be32 *daddr; + + err = make_writable(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + saddr = (__be32 *)&nh->saddr; + daddr = (__be32 *)&nh->daddr; + + if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) + set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, + ipv6_key->ipv6_src, true); + + if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { + unsigned int offset = 0; + int flags = IP6_FH_F_SKIP_RH; + bool recalc_csum = true; + + if (ipv6_ext_hdr(nh->nexthdr)) + recalc_csum = ipv6_find_hdr(skb, &offset, + NEXTHDR_ROUTING, NULL, + &flags) != NEXTHDR_ROUTING; + + set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, + ipv6_key->ipv6_dst, recalc_csum); + } + + set_ipv6_tc(nh, ipv6_key->ipv6_tclass); + set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); + nh->hop_limit = ipv6_key->ipv6_hlimit; + + return 0; +} + +/* Must follow make_writable() since that can move the skb data. */ +static void set_tp_port(struct sk_buff *skb, __be16 *port, + __be16 new_port, __sum16 *check) +{ + inet_proto_csum_replace2(check, skb, *port, new_port, 0); + *port = new_port; + skb->rxhash = 0; +} + +static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) +{ + struct udphdr *uh = udp_hdr(skb); + + if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { + set_tp_port(skb, port, new_port, &uh->check); + + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } else { + *port = new_port; + skb->rxhash = 0; + } +} + +static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) +{ + struct udphdr *uh; + int err; + + err = make_writable(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); + if (unlikely(err)) + return err; + + uh = udp_hdr(skb); + if (udp_port_key->udp_src != uh->source) + set_udp_port(skb, &uh->source, udp_port_key->udp_src); + + if (udp_port_key->udp_dst != uh->dest) + set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); + + return 0; +} + +static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) +{ + struct tcphdr *th; + int err; + + err = make_writable(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr)); + if (unlikely(err)) + return err; + + th = tcp_hdr(skb); + if (tcp_port_key->tcp_src != th->source) + set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); + + if (tcp_port_key->tcp_dst != th->dest) + set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); + + return 0; +} + +static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +{ + struct vport *vport; + + if (unlikely(!skb)) + return -ENOMEM; + + vport = ovs_vport_rcu(dp, out_port); + if (unlikely(!vport)) { + kfree_skb(skb); + return -ENODEV; + } + + ovs_vport_send(vport, skb); + return 0; +} + +static int output_userspace(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + struct dp_upcall_info upcall; + const struct nlattr *a; + int rem; + + upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.key = &OVS_CB(skb)->flow->key; + upcall.userdata = NULL; + upcall.portid = 0; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_USERSPACE_ATTR_USERDATA: + upcall.userdata = a; + break; + + case OVS_USERSPACE_ATTR_PID: + upcall.portid = nla_get_u32(a); + break; + } + } + + return ovs_dp_upcall(dp, skb, &upcall); +} + +static int sample(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + const struct nlattr *acts_list = NULL; + const struct nlattr *a; + int rem; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (net_random() >= nla_get_u32(a)) + return 0; + break; + + case OVS_SAMPLE_ATTR_ACTIONS: + acts_list = a; + break; + } + } + + return do_execute_actions(dp, skb, nla_data(acts_list), + nla_len(acts_list), true); +} + +static int execute_set_action(struct sk_buff *skb, + const struct nlattr *nested_attr) +{ + int err = 0; + + switch (nla_type(nested_attr)) { + case OVS_KEY_ATTR_PRIORITY: + skb->priority = nla_get_u32(nested_attr); + break; + + case OVS_KEY_ATTR_SKB_MARK: + skb->mark = nla_get_u32(nested_attr); + break; + + case OVS_KEY_ATTR_ETHERNET: + err = set_eth_addr(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_IPV4: + err = set_ipv4(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_IPV6: + err = set_ipv6(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_TCP: + err = set_tcp(skb, nla_data(nested_attr)); + break; + + case OVS_KEY_ATTR_UDP: + err = set_udp(skb, nla_data(nested_attr)); + break; + } + + return err; +} + +/* Execute a list of actions against 'skb'. */ +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr, int len, bool keep_skb) +{ + /* Every output action needs a separate clone of 'skb', but the common + * case is just a single output action, so that doing a clone and + * then freeing the original skbuff is wasteful. So the following code + * is slightly obscure just to avoid that. */ + int prev_port = -1; + const struct nlattr *a; + int rem; + + for (a = attr, rem = len; rem > 0; + a = nla_next(a, &rem)) { + int err = 0; + + if (prev_port != -1) { + do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); + prev_port = -1; + } + + switch (nla_type(a)) { + case OVS_ACTION_ATTR_OUTPUT: + prev_port = nla_get_u32(a); + break; + + case OVS_ACTION_ATTR_USERSPACE: + output_userspace(dp, skb, a); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + err = push_vlan(skb, nla_data(a)); + if (unlikely(err)) /* skb already freed. */ + return err; + break; + + case OVS_ACTION_ATTR_POP_VLAN: + err = pop_vlan(skb); + break; + + case OVS_ACTION_ATTR_SET: + err = execute_set_action(skb, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample(dp, skb, a); + break; + } + + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + } + + if (prev_port != -1) { + if (keep_skb) + skb = skb_clone(skb, GFP_ATOMIC); + + do_output(dp, skb, prev_port); + } else if (!keep_skb) + consume_skb(skb); + + return 0; +} + +/* Execute a list of actions against 'skb'. */ +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) +{ + struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + return do_execute_actions(dp, skb, acts->actions, + acts->actions_len, false); +} diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c new file mode 100644 index 000000000..d12d6b8b5 --- /dev/null +++ b/net/openvswitch/datapath.c @@ -0,0 +1,2151 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "flow.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + + +#define REHASH_FLOW_INTERVAL (10 * 60 * HZ) +static void rehash_flow_table(struct work_struct *work); +static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table); + +int ovs_net_id __read_mostly; + +static void ovs_notify(struct sk_buff *skb, struct genl_info *info, + struct genl_multicast_group *grp) +{ + genl_notify(skb, genl_info_net(info), info->snd_portid, + grp->id, info->nlhdr, GFP_KERNEL); +} + +/** + * DOC: Locking: + * + * All writes e.g. Writes to device state (add/remove datapath, port, set + * operations on vports, etc.), Writes to other state (flow table + * modifications, set miscellaneous datapath parameters, etc.) are protected + * by ovs_lock. + * + * Reads are protected by RCU. + * + * There are a few special cases (mostly stats) that have their own + * synchronization but they nest under all of above and don't interact with + * each other. + * + * The RTNL lock nests inside ovs_mutex. + */ + +static DEFINE_MUTEX(ovs_mutex); + +void ovs_lock(void) +{ + mutex_lock(&ovs_mutex); +} + +void ovs_unlock(void) +{ + mutex_unlock(&ovs_mutex); +} + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void) +{ + if (debug_locks) + return lockdep_is_held(&ovs_mutex); + else + return 1; +} +#endif + +static struct vport *new_vport(const struct vport_parms *); +static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *, + const struct dp_upcall_info *); +static int queue_userspace_packet(struct net *, int dp_ifindex, + struct sk_buff *, + const struct dp_upcall_info *); + +/* Must be called with rcu_read_lock or ovs_mutex. */ +static struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp = NULL; + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, dp_ifindex); + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + if (vport) + dp = vport->dp; + } + rcu_read_unlock(); + + return dp; +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +const char *ovs_dp_name(const struct datapath *dp) +{ + struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); + return vport->ops->get_name(vport); +} + +static int get_dpifindex(struct datapath *dp) +{ + struct vport *local; + int ifindex; + + rcu_read_lock(); + + local = ovs_vport_rcu(dp, OVSP_LOCAL); + if (local) + ifindex = netdev_vport_priv(local)->dev->ifindex; + else + ifindex = 0; + + rcu_read_unlock(); + + return ifindex; +} + +static void destroy_dp_rcu(struct rcu_head *rcu) +{ + struct datapath *dp = container_of(rcu, struct datapath, rcu); + + ovs_flow_tbl_destroy((__force struct flow_table *)dp->table); + free_percpu(dp->stats_percpu); + release_net(ovs_dp_get_net(dp)); + kfree(dp->ports); + kfree(dp); +} + +static struct hlist_head *vport_hash_bucket(const struct datapath *dp, + u16 port_no) +{ + return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; +} + +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) +{ + struct vport *vport; + struct hlist_head *head; + + head = vport_hash_bucket(dp, port_no); + hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + if (vport->port_no == port_no) + return vport; + } + return NULL; +} + +/* Called with ovs_mutex. */ +static struct vport *new_vport(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_add(parms); + if (!IS_ERR(vport)) { + struct datapath *dp = parms->dp; + struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); + + hlist_add_head_rcu(&vport->dp_hash_node, head); + } + return vport; +} + +void ovs_dp_detach_port(struct vport *p) +{ + ASSERT_OVSL(); + + /* First drop references to device. */ + hlist_del_rcu(&p->dp_hash_node); + + /* Then destroy it. */ + ovs_vport_del(p); +} + +/* Must be called with rcu_read_lock. */ +void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +{ + struct datapath *dp = p->dp; + struct sw_flow *flow; + struct dp_stats_percpu *stats; + struct sw_flow_key key; + u64 *stats_counter; + int error; + int key_len; + + stats = this_cpu_ptr(dp->stats_percpu); + + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_extract(skb, p->port_no, &key, &key_len); + if (unlikely(error)) { + kfree_skb(skb); + return; + } + + /* Look up flow. */ + flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len); + if (unlikely(!flow)) { + struct dp_upcall_info upcall; + + upcall.cmd = OVS_PACKET_CMD_MISS; + upcall.key = &key; + upcall.userdata = NULL; + upcall.portid = p->upcall_portid; + ovs_dp_upcall(dp, skb, &upcall); + consume_skb(skb); + stats_counter = &stats->n_missed; + goto out; + } + + OVS_CB(skb)->flow = flow; + + stats_counter = &stats->n_hit; + ovs_flow_used(OVS_CB(skb)->flow, skb); + ovs_execute_actions(dp, skb); + +out: + /* Update datapath statistics. */ + u64_stats_update_begin(&stats->sync); + (*stats_counter)++; + u64_stats_update_end(&stats->sync); +} + +static struct genl_family dp_packet_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, +}; + +int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, + const struct dp_upcall_info *upcall_info) +{ + struct dp_stats_percpu *stats; + int dp_ifindex; + int err; + + if (upcall_info->portid == 0) { + err = -ENOTCONN; + goto err; + } + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) { + err = -ENODEV; + goto err; + } + + if (!skb_is_gso(skb)) + err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + else + err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + if (err) + goto err; + + return 0; + +err: + stats = this_cpu_ptr(dp->stats_percpu); + + u64_stats_update_begin(&stats->sync); + stats->n_lost++; + u64_stats_update_end(&stats->sync); + + return err; +} + +static int queue_gso_packets(struct net *net, int dp_ifindex, + struct sk_buff *skb, + const struct dp_upcall_info *upcall_info) +{ + unsigned short gso_type = skb_shinfo(skb)->gso_type; + struct dp_upcall_info later_info; + struct sw_flow_key later_key; + struct sk_buff *segs, *nskb; + int err; + + segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false); + if (IS_ERR(segs)) + return PTR_ERR(segs); + + /* Queue all of the segments. */ + skb = segs; + do { + err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info); + if (err) + break; + + if (skb == segs && gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *upcall_info->key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + + later_info = *upcall_info; + later_info.key = &later_key; + upcall_info = &later_info; + } + } while ((skb = skb->next)); + + /* Free all of the segments. */ + skb = segs; + do { + nskb = skb->next; + if (err) + kfree_skb(skb); + else + consume_skb(skb); + } while ((skb = nskb)); + return err; +} + +static size_t key_attr_size(void) +{ + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + +static size_t upcall_msg_size(const struct sk_buff *skb, + const struct nlattr *userdata) +{ + size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + /* OVS_PACKET_ATTR_USERDATA */ + if (userdata) + size += NLA_ALIGN(userdata->nla_len); + + return size; +} + +static int queue_userspace_packet(struct net *net, int dp_ifindex, + struct sk_buff *skb, + const struct dp_upcall_info *upcall_info) +{ + struct ovs_header *upcall; + struct sk_buff *nskb = NULL; + struct sk_buff *user_skb; /* to be queued to userspace */ + struct nlattr *nla; + int err; + + if (vlan_tx_tag_present(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); + if (!nskb) + return -ENOMEM; + + nskb->vlan_tci = 0; + skb = nskb; + } + + if (nla_attr_size(skb->len) > USHRT_MAX) { + err = -EFBIG; + goto out; + } + + user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC); + if (!user_skb) { + err = -ENOMEM; + goto out; + } + + upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, + 0, upcall_info->cmd); + upcall->dp_ifindex = dp_ifindex; + + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); + ovs_flow_to_nlattrs(upcall_info->key, user_skb); + nla_nest_end(user_skb, nla); + + if (upcall_info->userdata) + __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, + nla_len(upcall_info->userdata), + nla_data(upcall_info->userdata)); + + nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); + + skb_copy_and_csum_dev(skb, nla_data(nla)); + + genlmsg_end(user_skb, upcall); + err = genlmsg_unicast(net, user_skb, upcall_info->portid); + +out: + kfree_skb(nskb); + return err; +} + +/* Called with ovs_mutex. */ +static int flush_flows(struct datapath *dp) +{ + struct flow_table *old_table; + struct flow_table *new_table; + + old_table = ovsl_dereference(dp->table); + new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS); + if (!new_table) + return -ENOMEM; + + rcu_assign_pointer(dp->table, new_table); + + ovs_flow_tbl_deferred_destroy(old_table); + return 0; +} + +static int validate_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth); + +static int validate_sample(const struct nlattr *attr, + const struct sw_flow_key *key, int depth) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + return validate_actions(actions, key, depth + 1); +} + +static int validate_tp_port(const struct sw_flow_key *flow_key) +{ + if (flow_key->eth.type == htons(ETH_P_IP)) { + if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) + return 0; + } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { + if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) + return 0; + } + + return -EINVAL; +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + + /* There can be only one key in a action */ + if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + if (key_type > OVS_KEY_ATTR_MAX || + nla_len(ovs_key) != ovs_key_lens[key_type]) + return -EINVAL; + + switch (key_type) { + const struct ovs_key_ipv4 *ipv4_key; + const struct ovs_key_ipv6 *ipv6_key; + + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_ETHERNET: + break; + + case OVS_KEY_ATTR_IPV4: + if (flow_key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_IPV6: + if (flow_key->eth.type != htons(ETH_P_IPV6)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_TCP: + if (flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + return validate_tp_port(flow_key); + + case OVS_KEY_ATTR_UDP: + if (flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + return validate_tp_port(flow_key); + + default: + return -EINVAL; + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, + attr, userspace_policy); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static int validate_actions(const struct nlattr *attr, + const struct sw_flow_key *key, int depth) +{ + const struct nlattr *a; + int rem, err; + + if (depth >= SAMPLE_ACTION_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. */ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + + case OVS_ACTION_ATTR_POP_VLAN: + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + vlan = nla_data(a); + if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = validate_sample(a, key, depth); + if (err) + return err; + break; + + default: + return -EINVAL; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +static void clear_stats(struct sw_flow *flow) +{ + flow->used = 0; + flow->tcp_flags = 0; + flow->packet_count = 0; + flow->byte_count = 0; +} + +static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = info->userhdr; + struct nlattr **a = info->attrs; + struct sw_flow_actions *acts; + struct sk_buff *packet; + struct sw_flow *flow; + struct datapath *dp; + struct ethhdr *eth; + int len; + int err; + int key_len; + + err = -EINVAL; + if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || + !a[OVS_PACKET_ATTR_ACTIONS]) + goto err; + + len = nla_len(a[OVS_PACKET_ATTR_PACKET]); + packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); + err = -ENOMEM; + if (!packet) + goto err; + skb_reserve(packet, NET_IP_ALIGN); + + nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); + + skb_reset_mac_header(packet); + eth = eth_hdr(packet); + + /* Normally, setting the skb 'protocol' field would be handled by a + * call to eth_type_trans(), but it assumes there's a sending + * device, which we may not have. */ + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + packet->protocol = eth->h_proto; + else + packet->protocol = htons(ETH_P_802_2); + + /* Build an sw_flow for sending this packet. */ + flow = ovs_flow_alloc(); + err = PTR_ERR(flow); + if (IS_ERR(flow)) + goto err_kfree_skb; + + err = ovs_flow_extract(packet, -1, &flow->key, &key_len); + if (err) + goto err_flow_free; + + err = ovs_flow_metadata_from_nlattrs(&flow->key.phy.priority, + &flow->key.phy.skb_mark, + &flow->key.phy.in_port, + a[OVS_PACKET_ATTR_KEY]); + if (err) + goto err_flow_free; + + err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0); + if (err) + goto err_flow_free; + + flow->hash = ovs_flow_hash(&flow->key, key_len); + + acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]); + err = PTR_ERR(acts); + if (IS_ERR(acts)) + goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->flow = flow; + packet->priority = flow->key.phy.priority; + packet->mark = flow->key.phy.skb_mark; + + rcu_read_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto err_unlock; + + local_bh_disable(); + err = ovs_execute_actions(dp, packet); + local_bh_enable(); + rcu_read_unlock(); + + ovs_flow_free(flow); + return err; + +err_unlock: + rcu_read_unlock(); +err_flow_free: + ovs_flow_free(flow); +err_kfree_skb: + kfree_skb(packet); +err: + return err; +} + +static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { + [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, + [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, +}; + +static struct genl_ops dp_packet_genl_ops[] = { + { .cmd = OVS_PACKET_CMD_EXECUTE, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = packet_policy, + .doit = ovs_packet_cmd_execute + } +}; + +static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) +{ + int i; + struct flow_table *table = ovsl_dereference(dp->table); + + stats->n_flows = ovs_flow_tbl_count(table); + + stats->n_hit = stats->n_missed = stats->n_lost = 0; + for_each_possible_cpu(i) { + const struct dp_stats_percpu *percpu_stats; + struct dp_stats_percpu local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dp->stats_percpu, i); + + do { + start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + + stats->n_hit += local_stats.n_hit; + stats->n_missed += local_stats.n_missed; + stats->n_lost += local_stats.n_lost; + } +} + +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, +}; + +static struct genl_family dp_flow_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, +}; + +static struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP +}; + +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) +{ + return NLMSG_ALIGN(sizeof(struct ovs_header)) + + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ + + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ + + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ +} + +/* Called with ovs_mutex. */ +static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd) +{ + const int skb_orig_len = skb->len; + const struct sw_flow_actions *sf_acts; + struct ovs_flow_stats stats; + struct ovs_header *ovs_header; + struct nlattr *nla; + unsigned long used; + u8 tcp_flags; + int err; + + sf_acts = ovsl_dereference(flow->sf_acts); + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = get_dpifindex(dp); + + nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); + if (!nla) + goto nla_put_failure; + err = ovs_flow_to_nlattrs(&flow->key, skb); + if (err) + goto error; + nla_nest_end(skb, nla); + + spin_lock_bh(&flow->lock); + used = flow->used; + stats.n_packets = flow->packet_count; + stats.n_bytes = flow->byte_count; + tcp_flags = flow->tcp_flags; + spin_unlock_bh(&flow->lock); + + if (used && + nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) + goto nla_put_failure; + + if (stats.n_packets && + nla_put(skb, OVS_FLOW_ATTR_STATS, + sizeof(struct ovs_flow_stats), &stats)) + goto nla_put_failure; + + if (tcp_flags && + nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags)) + goto nla_put_failure; + + /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if + * this is the first flow to be dumped into 'skb'. This is unusual for + * Netlink but individual action lists can be longer than + * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. + * The userspace caller can always fetch the actions separately if it + * really wants them. (Most userspace callers in fact don't care.) + * + * This can only fail for dump operations because the skb is always + * properly sized for single flows. + */ + err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len, + sf_acts->actions); + if (err < 0 && skb_orig_len) + goto error; + + return genlmsg_end(skb, ovs_header); + +nla_put_failure: + err = -EMSGSIZE; +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow) +{ + const struct sw_flow_actions *sf_acts; + + sf_acts = ovsl_dereference(flow->sf_acts); + + return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL); +} + +static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, + struct datapath *dp, + u32 portid, u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = ovs_flow_cmd_alloc_info(flow); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_flow_cmd_fill_info(flow, dp, skb, portid, seq, 0, cmd); + BUG_ON(retval < 0); + return skb; +} + +static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sw_flow *flow; + struct sk_buff *reply; + struct datapath *dp; + struct flow_table *table; + int error; + int key_len; + + /* Extract key. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) + goto error; + error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + if (error) + goto error; + + /* Validate actions. */ + if (a[OVS_FLOW_ATTR_ACTIONS]) { + error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0); + if (error) + goto error; + } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { + error = -EINVAL; + goto error; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + error = -ENODEV; + if (!dp) + goto err_unlock_ovs; + + table = ovsl_dereference(dp->table); + flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (!flow) { + struct sw_flow_actions *acts; + + /* Bail out if we're not allowed to create a new flow. */ + error = -ENOENT; + if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) + goto err_unlock_ovs; + + /* Expand table, if necessary, to make room. */ + if (ovs_flow_tbl_need_to_expand(table)) { + struct flow_table *new_table; + + new_table = ovs_flow_tbl_expand(table); + if (!IS_ERR(new_table)) { + rcu_assign_pointer(dp->table, new_table); + ovs_flow_tbl_deferred_destroy(table); + table = ovsl_dereference(dp->table); + } + } + + /* Allocate flow. */ + flow = ovs_flow_alloc(); + if (IS_ERR(flow)) { + error = PTR_ERR(flow); + goto err_unlock_ovs; + } + flow->key = key; + clear_stats(flow); + + /* Obtain actions. */ + acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]); + error = PTR_ERR(acts); + if (IS_ERR(acts)) + goto error_free_flow; + rcu_assign_pointer(flow->sf_acts, acts); + + /* Put flow in bucket. */ + flow->hash = ovs_flow_hash(&key, key_len); + ovs_flow_tbl_insert(table, flow); + + reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, + info->snd_seq, + OVS_FLOW_CMD_NEW); + } else { + /* We found a matching flow. */ + struct sw_flow_actions *old_acts; + struct nlattr *acts_attrs; + + /* Bail out if we're not allowed to modify an existing flow. + * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL + * because Generic Netlink treats the latter as a dump + * request. We also accept NLM_F_EXCL in case that bug ever + * gets fixed. + */ + error = -EEXIST; + if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW && + info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) + goto err_unlock_ovs; + + /* Update actions. */ + old_acts = ovsl_dereference(flow->sf_acts); + acts_attrs = a[OVS_FLOW_ATTR_ACTIONS]; + if (acts_attrs && + (old_acts->actions_len != nla_len(acts_attrs) || + memcmp(old_acts->actions, nla_data(acts_attrs), + old_acts->actions_len))) { + struct sw_flow_actions *new_acts; + + new_acts = ovs_flow_actions_alloc(acts_attrs); + error = PTR_ERR(new_acts); + if (IS_ERR(new_acts)) + goto err_unlock_ovs; + + rcu_assign_pointer(flow->sf_acts, new_acts); + ovs_flow_deferred_free_acts(old_acts); + } + + reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, + info->snd_seq, OVS_FLOW_CMD_NEW); + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) { + spin_lock_bh(&flow->lock); + clear_stats(flow); + spin_unlock_bh(&flow->lock); + } + } + ovs_unlock(); + + if (!IS_ERR(reply)) + ovs_notify(reply, info, &ovs_dp_flow_multicast_group); + else + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, + ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); + return 0; + +error_free_flow: + ovs_flow_free(flow); +err_unlock_ovs: + ovs_unlock(); +error: + return error; +} + +static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow; + struct datapath *dp; + struct flow_table *table; + int err; + int key_len; + + if (!a[OVS_FLOW_ATTR_KEY]) + return -EINVAL; + err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + if (err) + return err; + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto unlock; + } + + table = ovsl_dereference(dp->table); + flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (!flow) { + err = -ENOENT; + goto unlock; + } + + reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, + info->snd_seq, OVS_FLOW_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto unlock; + } + + ovs_unlock(); + return genlmsg_reply(reply, info); +unlock: + ovs_unlock(); + return err; +} + +static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow; + struct datapath *dp; + struct flow_table *table; + int err; + int key_len; + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto unlock; + } + + if (!a[OVS_FLOW_ATTR_KEY]) { + err = flush_flows(dp); + goto unlock; + } + err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + if (err) + goto unlock; + + table = ovsl_dereference(dp->table); + flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (!flow) { + err = -ENOENT; + goto unlock; + } + + reply = ovs_flow_cmd_alloc_info(flow); + if (!reply) { + err = -ENOMEM; + goto unlock; + } + + ovs_flow_tbl_remove(table, flow); + + err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_FLOW_CMD_DEL); + BUG_ON(err < 0); + + ovs_flow_deferred_free(flow); + ovs_unlock(); + + ovs_notify(reply, info, &ovs_dp_flow_multicast_group); + return 0; +unlock: + ovs_unlock(); + return err; +} + +static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct datapath *dp; + struct flow_table *table; + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + ovs_unlock(); + return -ENODEV; + } + + table = ovsl_dereference(dp->table); + + for (;;) { + struct sw_flow *flow; + u32 bucket, obj; + + bucket = cb->args[0]; + obj = cb->args[1]; + flow = ovs_flow_tbl_next(table, &bucket, &obj); + if (!flow) + break; + + if (ovs_flow_cmd_fill_info(flow, dp, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_FLOW_CMD_NEW) < 0) + break; + + cb->args[0] = bucket; + cb->args[1] = obj; + } + ovs_unlock(); + return skb->len; +} + +static struct genl_ops dp_flow_genl_ops[] = { + { .cmd = OVS_FLOW_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_new_or_set + }, + { .cmd = OVS_FLOW_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_del + }, + { .cmd = OVS_FLOW_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_get, + .dumpit = ovs_flow_cmd_dump + }, + { .cmd = OVS_FLOW_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_new_or_set, + }, +}; + +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, +}; + +static struct genl_family dp_datapath_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, +}; + +static struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP +}; + +static size_t ovs_dp_cmd_msg_size(void) +{ + size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); + + msgsize += nla_total_size(IFNAMSIZ); + msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + + return msgsize; +} + +static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_dp_stats dp_stats; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, + flags, cmd); + if (!ovs_header) + goto error; + + ovs_header->dp_ifindex = get_dpifindex(dp); + + rcu_read_lock(); + err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); + rcu_read_unlock(); + if (err) + goto nla_put_failure; + + get_dp_stats(dp, &dp_stats); + if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats)) + goto nla_put_failure; + + return genlmsg_end(skb, ovs_header); + +nla_put_failure: + genlmsg_cancel(skb, ovs_header); +error: + return -EMSGSIZE; +} + +static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid, + u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_dp_cmd_fill_info(dp, skb, portid, seq, 0, cmd); + if (retval < 0) { + kfree_skb(skb); + return ERR_PTR(retval); + } + return skb; +} + +/* Called with ovs_mutex. */ +static struct datapath *lookup_datapath(struct net *net, + struct ovs_header *ovs_header, + struct nlattr *a[OVS_DP_ATTR_MAX + 1]) +{ + struct datapath *dp; + + if (!a[OVS_DP_ATTR_NAME]) + dp = get_dp(net, ovs_header->dp_ifindex); + else { + struct vport *vport; + + rcu_read_lock(); + vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); + dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; + rcu_read_unlock(); + } + return dp ? dp : ERR_PTR(-ENODEV); +} + +static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct vport_parms parms; + struct sk_buff *reply; + struct datapath *dp; + struct vport *vport; + struct ovs_net *ovs_net; + int err, i; + + err = -EINVAL; + if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) + goto err; + + ovs_lock(); + + err = -ENOMEM; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (dp == NULL) + goto err_unlock_ovs; + + ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); + + /* Allocate table. */ + err = -ENOMEM; + rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS)); + if (!dp->table) + goto err_free_dp; + + dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); + if (!dp->stats_percpu) { + err = -ENOMEM; + goto err_destroy_table; + } + + dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) { + err = -ENOMEM; + goto err_destroy_percpu; + } + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + /* Set up our datapath device. */ + parms.name = nla_data(a[OVS_DP_ATTR_NAME]); + parms.type = OVS_VPORT_TYPE_INTERNAL; + parms.options = NULL; + parms.dp = dp; + parms.port_no = OVSP_LOCAL; + parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + + vport = new_vport(&parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + if (err == -EBUSY) + err = -EEXIST; + + goto err_destroy_ports_array; + } + + reply = ovs_dp_cmd_build_info(dp, info->snd_portid, + info->snd_seq, OVS_DP_CMD_NEW); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto err_destroy_local_port; + + ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); + list_add_tail(&dp->list_node, &ovs_net->dps); + + ovs_unlock(); + + ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + return 0; + +err_destroy_local_port: + ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); +err_destroy_ports_array: + kfree(dp->ports); +err_destroy_percpu: + free_percpu(dp->stats_percpu); +err_destroy_table: + ovs_flow_tbl_destroy(ovsl_dereference(dp->table)); +err_free_dp: + release_net(ovs_dp_get_net(dp)); + kfree(dp); +err_unlock_ovs: + ovs_unlock(); +err: + return err; +} + +/* Called with ovs_mutex. */ +static void __dp_destroy(struct datapath *dp) +{ + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) + if (vport->port_no != OVSP_LOCAL) + ovs_dp_detach_port(vport); + } + + list_del(&dp->list_node); + + /* OVSP_LOCAL is datapath internal port. We need to make sure that + * all port in datapath are destroyed first before freeing datapath. + */ + ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + + call_rcu(&dp->rcu, destroy_dp_rcu); +} + +static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto unlock; + + reply = ovs_dp_cmd_build_info(dp, info->snd_portid, + info->snd_seq, OVS_DP_CMD_DEL); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto unlock; + + __dp_destroy(dp); + ovs_unlock(); + + ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + + return 0; +unlock: + ovs_unlock(); + return err; +} + +static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto unlock; + + reply = ovs_dp_cmd_build_info(dp, info->snd_portid, + info->snd_seq, OVS_DP_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, + ovs_dp_datapath_multicast_group.id, err); + err = 0; + goto unlock; + } + + ovs_unlock(); + ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + + return 0; +unlock: + ovs_unlock(); + return err; +} + +static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) { + err = PTR_ERR(dp); + goto unlock; + } + + reply = ovs_dp_cmd_build_info(dp, info->snd_portid, + info->snd_seq, OVS_DP_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto unlock; + } + + ovs_unlock(); + return genlmsg_reply(reply, info); + +unlock: + ovs_unlock(); + return err; +} + +static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); + struct datapath *dp; + int skip = cb->args[0]; + int i = 0; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + if (i >= skip && + ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_DP_CMD_NEW) < 0) + break; + i++; + } + ovs_unlock(); + + cb->args[0] = i; + + return skb->len; +} + +static struct genl_ops dp_datapath_genl_ops[] = { + { .cmd = OVS_DP_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_new + }, + { .cmd = OVS_DP_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_del + }, + { .cmd = OVS_DP_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_get, + .dumpit = ovs_dp_cmd_dump + }, + { .cmd = OVS_DP_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_set, + }, +}; + +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + +static struct genl_family dp_vport_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, +}; + +struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP +}; + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_vport_stats vport_stats; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = get_dpifindex(vport->dp); + + if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || + nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || + nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) || + nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid)) + goto nla_put_failure; + + ovs_vport_get_stats(vport, &vport_stats); + if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), + &vport_stats)) + goto nla_put_failure; + + err = ovs_vport_get_options(vport, skb); + if (err == -EMSGSIZE) + goto error; + + return genlmsg_end(skb, ovs_header); + +nla_put_failure: + err = -EMSGSIZE; +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +/* Called with ovs_mutex or RCU read lock. */ +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, + u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + BUG_ON(retval < 0); + + return skb; +} + +/* Called with ovs_mutex or RCU read lock. */ +static struct vport *lookup_vport(struct net *net, + struct ovs_header *ovs_header, + struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) +{ + struct datapath *dp; + struct vport *vport; + + if (a[OVS_VPORT_ATTR_NAME]) { + vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); + if (!vport) + return ERR_PTR(-ENODEV); + if (ovs_header->dp_ifindex && + ovs_header->dp_ifindex != get_dpifindex(vport->dp)) + return ERR_PTR(-ENODEV); + return vport; + } else if (a[OVS_VPORT_ATTR_PORT_NO]) { + u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + + if (port_no >= DP_MAX_PORTS) + return ERR_PTR(-EFBIG); + + dp = get_dp(net, ovs_header->dp_ifindex); + if (!dp) + return ERR_PTR(-ENODEV); + + vport = ovs_vport_ovsl_rcu(dp, port_no); + if (!vport) + return ERR_PTR(-ENODEV); + return vport; + } else + return ERR_PTR(-EINVAL); +} + +static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct vport_parms parms; + struct sk_buff *reply; + struct vport *vport; + struct datapath *dp; + u32 port_no; + int err; + + err = -EINVAL; + if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || + !a[OVS_VPORT_ATTR_UPCALL_PID]) + goto exit; + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto exit_unlock; + + if (a[OVS_VPORT_ATTR_PORT_NO]) { + port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + + err = -EFBIG; + if (port_no >= DP_MAX_PORTS) + goto exit_unlock; + + vport = ovs_vport_ovsl(dp, port_no); + err = -EBUSY; + if (vport) + goto exit_unlock; + } else { + for (port_no = 1; ; port_no++) { + if (port_no >= DP_MAX_PORTS) { + err = -EFBIG; + goto exit_unlock; + } + vport = ovs_vport_ovsl(dp, port_no); + if (!vport) + break; + } + } + + parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + parms.options = a[OVS_VPORT_ATTR_OPTIONS]; + parms.dp = dp; + parms.port_no = port_no; + parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + vport = new_vport(&parms); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + err = 0; + reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, + OVS_VPORT_CMD_NEW); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + ovs_dp_detach_port(vport); + goto exit_unlock; + } + + ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + +exit_unlock: + ovs_unlock(); +exit: + return err; +} + +static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + ovs_lock(); + vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + err = 0; + if (a[OVS_VPORT_ATTR_TYPE] && + nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) + err = -EINVAL; + + reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!reply) { + err = -ENOMEM; + goto exit_unlock; + } + + if (!err && a[OVS_VPORT_ATTR_OPTIONS]) + err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); + if (err) + goto exit_free; + + if (a[OVS_VPORT_ATTR_UPCALL_PID]) + vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + + ovs_unlock(); + ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + return 0; + + rtnl_unlock(); + return 0; + +exit_free: + kfree_skb(reply); +exit_unlock: + ovs_unlock(); + return err; +} + +static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + ovs_lock(); + vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + if (vport->port_no == OVSP_LOCAL) { + err = -EINVAL; + goto exit_unlock; + } + + reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, + OVS_VPORT_CMD_DEL); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto exit_unlock; + + err = 0; + ovs_dp_detach_port(vport); + + ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + +exit_unlock: + ovs_unlock(); + return err; +} + +static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sk_buff *reply; + struct vport *vport; + int err; + + rcu_read_lock(); + vport = lookup_vport(sock_net(skb->sk), ovs_header, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock; + + reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, + OVS_VPORT_CMD_NEW); + err = PTR_ERR(reply); + if (IS_ERR(reply)) + goto exit_unlock; + + rcu_read_unlock(); + + return genlmsg_reply(reply, info); + +exit_unlock: + rcu_read_unlock(); + return err; +} + +static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct datapath *dp; + int bucket = cb->args[0], skip = cb->args[1]; + int i, j = 0; + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) + return -ENODEV; + + rcu_read_lock(); + for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + j = 0; + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + if (j >= skip && + ovs_vport_cmd_fill_info(vport, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + OVS_VPORT_CMD_NEW) < 0) + goto out; + + j++; + } + skip = 0; + } +out: + rcu_read_unlock(); + + cb->args[0] = i; + cb->args[1] = j; + + return skb->len; +} + +static struct genl_ops dp_vport_genl_ops[] = { + { .cmd = OVS_VPORT_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_new + }, + { .cmd = OVS_VPORT_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_del + }, + { .cmd = OVS_VPORT_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_get, + .dumpit = ovs_vport_cmd_dump + }, + { .cmd = OVS_VPORT_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_set, + }, +}; + +struct genl_family_and_ops { + struct genl_family *family; + struct genl_ops *ops; + int n_ops; + struct genl_multicast_group *group; +}; + +static const struct genl_family_and_ops dp_genl_families[] = { + { &dp_datapath_genl_family, + dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), + &ovs_dp_datapath_multicast_group }, + { &dp_vport_genl_family, + dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), + &ovs_dp_vport_multicast_group }, + { &dp_flow_genl_family, + dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), + &ovs_dp_flow_multicast_group }, + { &dp_packet_genl_family, + dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), + NULL }, +}; + +static void dp_unregister_genl(int n_families) +{ + int i; + + for (i = 0; i < n_families; i++) + genl_unregister_family(dp_genl_families[i].family); +} + +static int dp_register_genl(void) +{ + int n_registered; + int err; + int i; + + n_registered = 0; + for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { + const struct genl_family_and_ops *f = &dp_genl_families[i]; + + err = genl_register_family_with_ops(f->family, f->ops, + f->n_ops); + if (err) + goto error; + n_registered++; + + if (f->group) { + err = genl_register_mc_group(f->family, f->group); + if (err) + goto error; + } + } + + return 0; + +error: + dp_unregister_genl(n_registered); + return err; +} + +static void rehash_flow_table(struct work_struct *work) +{ + struct datapath *dp; + struct net *net; + + ovs_lock(); + rtnl_lock(); + for_each_net(net) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + list_for_each_entry(dp, &ovs_net->dps, list_node) { + struct flow_table *old_table = ovsl_dereference(dp->table); + struct flow_table *new_table; + + new_table = ovs_flow_tbl_rehash(old_table); + if (!IS_ERR(new_table)) { + rcu_assign_pointer(dp->table, new_table); + ovs_flow_tbl_deferred_destroy(old_table); + } + } + } + rtnl_unlock(); + ovs_unlock(); + schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); +} + +static int __net_init ovs_init_net(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + INIT_LIST_HEAD(&ovs_net->dps); + INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + return 0; +} + +static void __net_exit ovs_exit_net(struct net *net) +{ + struct datapath *dp, *dp_next; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + ovs_lock(); + list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) + __dp_destroy(dp); + ovs_unlock(); + + cancel_work_sync(&ovs_net->dp_notify_work); +} + +static struct pernet_operations ovs_net_ops = { + .init = ovs_init_net, + .exit = ovs_exit_net, + .id = &ovs_net_id, + .size = sizeof(struct ovs_net), +}; + +static int __init dp_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + + pr_info("Open vSwitch switching datapath\n"); + + err = ovs_flow_init(); + if (err) + goto error; + + err = ovs_vport_init(); + if (err) + goto error_flow_exit; + + err = register_pernet_device(&ovs_net_ops); + if (err) + goto error_vport_exit; + + err = register_netdevice_notifier(&ovs_dp_device_notifier); + if (err) + goto error_netns_exit; + + err = dp_register_genl(); + if (err < 0) + goto error_unreg_notifier; + + schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); + + return 0; + +error_unreg_notifier: + unregister_netdevice_notifier(&ovs_dp_device_notifier); +error_netns_exit: + unregister_pernet_device(&ovs_net_ops); +error_vport_exit: + ovs_vport_exit(); +error_flow_exit: + ovs_flow_exit(); +error: + return err; +} + +static void dp_cleanup(void) +{ + cancel_delayed_work_sync(&rehash_flow_wq); + dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + unregister_netdevice_notifier(&ovs_dp_device_notifier); + unregister_pernet_device(&ovs_net_ops); + rcu_barrier(); + ovs_vport_exit(); + ovs_flow_exit(); +} + +module_init(dp_init); +module_exit(dp_cleanup); + +MODULE_DESCRIPTION("Open vSwitch switching datapath"); +MODULE_LICENSE("GPL"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h new file mode 100644 index 000000000..16b840695 --- /dev/null +++ b/net/openvswitch/datapath.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef DATAPATH_H +#define DATAPATH_H 1 + +#include +#include +#include +#include +#include +#include + +#include "flow.h" +#include "vport.h" + +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 + +#define SAMPLE_ACTION_DEPTH 3 + +/** + * struct dp_stats_percpu - per-cpu packet processing statistics for a given + * datapath. + * @n_hit: Number of received packets for which a matching flow was found in + * the flow table. + * @n_miss: Number of received packets that had no matching flow in the flow + * table. The sum of @n_hit and @n_miss is the number of packets that have + * been received by the datapath. + * @n_lost: Number of received packets that had no matching flow in the flow + * table that could not be sent to userspace (normally due to an overflow in + * one of the datapath's queues). + */ +struct dp_stats_percpu { + u64 n_hit; + u64 n_missed; + u64 n_lost; + struct u64_stats_sync sync; +}; + +/** + * struct datapath - datapath for flow-based packet switching + * @rcu: RCU callback head for deferred destruction. + * @list_node: Element in global 'dps' list. + * @table: Current flow table. Protected by ovs_mutex and RCU. + * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by + * ovs_mutex and RCU. + * @stats_percpu: Per-CPU datapath statistics. + * @net: Reference to net namespace. + * + * Context: See the comment on locking at the top of datapath.c for additional + * locking information. + */ +struct datapath { + struct rcu_head rcu; + struct list_head list_node; + + /* Flow table. */ + struct flow_table __rcu *table; + + /* Switch ports. */ + struct hlist_head *ports; + + /* Stats. */ + struct dp_stats_percpu __percpu *stats_percpu; + +#ifdef CONFIG_NET_NS + /* Network namespace ref. */ + struct net *net; +#endif +}; + +/** + * struct ovs_skb_cb - OVS data in skb CB + * @flow: The flow associated with this packet. May be %NULL if no flow. + */ +struct ovs_skb_cb { + struct sw_flow *flow; +}; +#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) + +/** + * struct dp_upcall - metadata to include with a packet to send to userspace + * @cmd: One of %OVS_PACKET_CMD_*. + * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull. + * @userdata: If nonnull, its variable-length value is passed to userspace as + * %OVS_PACKET_ATTR_USERDATA. + * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no + * packet is sent and the packet is accounted in the datapath's @n_lost + * counter. + */ +struct dp_upcall_info { + u8 cmd; + const struct sw_flow_key *key; + const struct nlattr *userdata; + u32 portid; +}; + +/** + * struct ovs_net - Per net-namespace data for ovs. + * @dps: List of datapaths to enable dumping them all out. + * Protected by genl_mutex. + */ +struct ovs_net { + struct list_head dps; + struct work_struct dp_notify_work; +}; + +extern int ovs_net_id; +void ovs_lock(void); +void ovs_unlock(void); + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void); +#else +#define lockdep_ovsl_is_held() 1 +#endif + +#define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held())) +#define ovsl_dereference(p) \ + rcu_dereference_protected(p, lockdep_ovsl_is_held()) + +static inline struct net *ovs_dp_get_net(struct datapath *dp) +{ + return read_pnet(&dp->net); +} + +static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) +{ + write_pnet(&dp->net, net); +} + +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); + +static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) +{ + ASSERT_OVSL(); + return ovs_lookup_vport(dp, port_no); +} + +extern struct notifier_block ovs_dp_device_notifier; +extern struct genl_multicast_group ovs_dp_vport_multicast_group; + +void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_detach_port(struct vport *); +int ovs_dp_upcall(struct datapath *, struct sk_buff *, + const struct dp_upcall_info *); + +const char *ovs_dp_name(const struct datapath *dp); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, + u8 cmd); + +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); +void ovs_dp_notify_wq(struct work_struct *work); +#endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c new file mode 100644 index 000000000..ef4feec6c --- /dev/null +++ b/net/openvswitch/dp_notify.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static void dp_detach_port_notify(struct vport *vport) +{ + struct sk_buff *notify; + struct datapath *dp; + + dp = vport->dp; + notify = ovs_vport_cmd_build_info(vport, 0, 0, + OVS_VPORT_CMD_DEL); + ovs_dp_detach_port(vport); + if (IS_ERR(notify)) { + netlink_set_err(ovs_dp_get_net(dp)->genl_sock, 0, + ovs_dp_vport_multicast_group.id, + PTR_ERR(notify)); + return; + } + + genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0, + ovs_dp_vport_multicast_group.id, + GFP_KERNEL); +} + +void ovs_dp_notify_wq(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); + struct datapath *dp; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { + struct netdev_vport *netdev_vport; + + if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + continue; + + netdev_vport = netdev_vport_priv(vport); + if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED || + netdev_vport->dev->reg_state == NETREG_UNREGISTERING) + dp_detach_port_notify(vport); + } + } + } + ovs_unlock(); +} + +static int dp_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct ovs_net *ovs_net; + struct net_device *dev = ptr; + struct vport *vport = NULL; + + if (!ovs_is_internal_dev(dev)) + vport = ovs_netdev_get_vport(dev); + + if (!vport) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + ovs_net = net_generic(dev_net(dev), ovs_net_id); + queue_work(system_wq, &ovs_net->dp_notify_work); + } + + return NOTIFY_DONE; +} + +struct notifier_block ovs_dp_device_notifier = { + .notifier_call = dp_device_event +}; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c new file mode 100644 index 000000000..b15321a22 --- /dev/null +++ b/net/openvswitch/flow.c @@ -0,0 +1,1357 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *flow_cache; + +static int check_header(struct sk_buff *skb, int len) +{ + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + return 0; +} + +static bool arphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_network_offset(skb) + + sizeof(struct arp_eth_header)); +} + +static int check_iphdr(struct sk_buff *skb) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int ip_len; + int err; + + err = check_header(skb, nh_ofs + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + ip_len = ip_hdrlen(skb); + if (unlikely(ip_len < sizeof(struct iphdr) || + skb->len < nh_ofs + ip_len)) + return -EINVAL; + + skb_set_transport_header(skb, nh_ofs + ip_len); + return 0; +} + +static bool tcphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + int tcp_len; + + if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr)))) + return false; + + tcp_len = tcp_hdrlen(skb); + if (unlikely(tcp_len < sizeof(struct tcphdr) || + skb->len < th_ofs + tcp_len)) + return false; + + return true; +} + +static bool udphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); +} + +static bool icmphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmphdr)); +} + +u64 ovs_flow_used_time(unsigned long flow_jiffies) +{ + struct timespec cur_ts; + u64 cur_ms, idle_ms; + + ktime_get_ts(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; + + return cur_ms - idle_ms; +} + +#define SW_FLOW_KEY_OFFSET(field) \ + (offsetof(struct sw_flow_key, field) + \ + FIELD_SIZEOF(struct sw_flow_key, field)) + +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, + int *key_lenp) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int nh_len; + int payload_ofs; + struct ipv6hdr *nh; + uint8_t nexthdr; + __be16 frag_off; + int err; + + *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label); + + err = check_header(skb, nh_ofs + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + nexthdr = nh->nexthdr; + payload_ofs = (u8 *)(nh + 1) - skb->data; + + key->ip.proto = NEXTHDR_NONE; + key->ip.tos = ipv6_get_dsfield(nh); + key->ip.ttl = nh->hop_limit; + key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + key->ipv6.addr.src = nh->saddr; + key->ipv6.addr.dst = nh->daddr; + + payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); + if (unlikely(payload_ofs < 0)) + return -EINVAL; + + if (frag_off) { + if (frag_off & htons(~0x7)) + key->ip.frag = OVS_FRAG_TYPE_LATER; + else + key->ip.frag = OVS_FRAG_TYPE_FIRST; + } + + nh_len = payload_ofs - nh_ofs; + skb_set_transport_header(skb, nh_ofs + nh_len); + key->ip.proto = nexthdr; + return nh_len; +} + +static bool icmp6hdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmp6hdr)); +} + +#define TCP_FLAGS_OFFSET 13 +#define TCP_FLAG_MASK 0x3f + +void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) +{ + u8 tcp_flags = 0; + + if ((flow->key.eth.type == htons(ETH_P_IP) || + flow->key.eth.type == htons(ETH_P_IPV6)) && + flow->key.ip.proto == IPPROTO_TCP && + likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { + u8 *tcp = (u8 *)tcp_hdr(skb); + tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; + } + + spin_lock(&flow->lock); + flow->used = jiffies; + flow->packet_count++; + flow->byte_count += skb->len; + flow->tcp_flags |= tcp_flags; + spin_unlock(&flow->lock); +} + +struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions) +{ + int actions_len = nla_len(actions); + struct sw_flow_actions *sfa; + + if (actions_len > MAX_ACTIONS_BUFSIZE) + return ERR_PTR(-EINVAL); + + sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = actions_len; + nla_memcpy(sfa->actions, actions, actions_len); + return sfa; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&flow->lock); + flow->sf_acts = NULL; + + return flow; +} + +static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) +{ + hash = jhash_1word(hash, table->hash_seed); + return flex_array_get(table->buckets, + (hash & (table->n_buckets - 1))); +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head *), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + +struct flow_table *ovs_flow_tbl_alloc(int new_size) +{ + struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); + + if (!table) + return NULL; + + table->buckets = alloc_buckets(new_size); + + if (!table->buckets) { + kfree(table); + return NULL; + } + table->n_buckets = new_size; + table->count = 0; + table->node_ver = 0; + table->keep_flows = false; + get_random_bytes(&table->hash_seed, sizeof(u32)); + + return table; +} + +void ovs_flow_tbl_destroy(struct flow_table *table) +{ + int i; + + if (!table) + return; + + if (table->keep_flows) + goto skip_flows; + + for (i = 0; i < table->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(table->buckets, i); + struct hlist_node *n; + int ver = table->node_ver; + + hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { + hlist_del_rcu(&flow->hash_node[ver]); + ovs_flow_free(flow); + } + } + +skip_flows: + free_buckets(table->buckets); + kfree(table); +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct flow_table *table = container_of(rcu, struct flow_table, rcu); + + ovs_flow_tbl_destroy(table); +} + +void ovs_flow_tbl_deferred_destroy(struct flow_table *table) +{ + if (!table) + return; + + call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); +} + +struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = table->node_ver; + while (*bucket < table->n_buckets) { + i = 0; + head = flex_array_get(table->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + hlist_for_each_entry(flow, head, hash_node[old_ver]) + ovs_flow_tbl_insert(new, flow); + } + old->keep_flows = true; +} + +static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets) +{ + struct flow_table *new_table; + + new_table = ovs_flow_tbl_alloc(n_buckets); + if (!new_table) + return ERR_PTR(-ENOMEM); + + flow_table_copy_flows(table, new_table); + + return new_table; +} + +struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) +{ + return __flow_tbl_rehash(table, table->n_buckets); +} + +struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) +{ + return __flow_tbl_rehash(table, table->n_buckets * 2); +} + +void ovs_flow_free(struct sw_flow *flow) +{ + if (unlikely(!flow)) + return; + + kfree((struct sf_flow_acts __force *)flow->sf_acts); + kmem_cache_free(flow_cache, flow); +} + +/* RCU callback used by ovs_flow_deferred_free. */ +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + ovs_flow_free(flow); +} + +/* Schedules 'flow' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_flow_deferred_free(struct sw_flow *flow) +{ + call_rcu(&flow->rcu, rcu_free_flow_callback); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts) +{ + kfree_rcu(sf_acts, rcu); +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct qtag_prefix { + __be16 eth_type; /* ETH_P_8021Q */ + __be16 tci; + }; + struct qtag_prefix *qp; + + if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + + sizeof(__be16)))) + return -ENOMEM; + + qp = (struct qtag_prefix *) skb->data; + key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); + __skb_pull(skb, sizeof(struct qtag_prefix)); + + return 0; +} + +static __be16 parse_ethertype(struct sk_buff *skb) +{ + struct llc_snap_hdr { + u8 dsap; /* Always 0xAA */ + u8 ssap; /* Always 0xAA */ + u8 ctrl; + u8 oui[3]; + __be16 ethertype; + }; + struct llc_snap_hdr *llc; + __be16 proto; + + proto = *(__be16 *) skb->data; + __skb_pull(skb, sizeof(__be16)); + + if (ntohs(proto) >= ETH_P_802_3_MIN) + return proto; + + if (skb->len < sizeof(struct llc_snap_hdr)) + return htons(ETH_P_802_2); + + if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr)))) + return htons(0); + + llc = (struct llc_snap_hdr *) skb->data; + if (llc->dsap != LLC_SAP_SNAP || + llc->ssap != LLC_SAP_SNAP || + (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) + return htons(ETH_P_802_2); + + __skb_pull(skb, sizeof(struct llc_snap_hdr)); + + if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + return llc->ethertype; + + return htons(ETH_P_802_2); +} + +static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, + int *key_lenp, int nh_len) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + int error = 0; + int key_len; + + /* The ICMPv6 type and code fields use the 16-bit transport port + * fields, so we need to store them in 16-bit network byte order. + */ + key->ipv6.tp.src = htons(icmp->icmp6_type); + key->ipv6.tp.dst = htons(icmp->icmp6_code); + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + + if (icmp->icmp6_code == 0 && + (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) { + int icmp_len = skb->len - skb_transport_offset(skb); + struct nd_msg *nd; + int offset; + + key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + + /* In order to process neighbor discovery options, we need the + * entire packet. + */ + if (unlikely(icmp_len < sizeof(*nd))) + goto out; + if (unlikely(skb_linearize(skb))) { + error = -ENOMEM; + goto out; + } + + nd = (struct nd_msg *)skb_transport_header(skb); + key->ipv6.nd.target = nd->target; + key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + + icmp_len -= sizeof(*nd); + offset = 0; + while (icmp_len >= 8) { + struct nd_opt_hdr *nd_opt = + (struct nd_opt_hdr *)(nd->opt + offset); + int opt_len = nd_opt->nd_opt_len * 8; + + if (unlikely(!opt_len || opt_len > icmp_len)) + goto invalid; + + /* Store the link layer address if the appropriate + * option is provided. It is considered an error if + * the same link layer option is specified twice. + */ + if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) + goto invalid; + memcpy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) + goto invalid; + memcpy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + } + + icmp_len -= opt_len; + offset += opt_len; + } + } + + goto out; + +invalid: + memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); + memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); + memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); + +out: + *key_lenp = key_len; + return error; +} + +/** + * ovs_flow_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @in_port: port number on which @skb was received. + * @key: output flow key + * @key_lenp: length of output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Returns 0 if successful, otherwise a negative errno value. + * + * Initializes @skb header pointers as follows: + * + * - skb->mac_header: the Ethernet header. + * + * - skb->network_header: just past the Ethernet header, or just past the + * VLAN header, to the first byte of the Ethernet payload. + * + * - skb->transport_header: If key->dl_type is ETH_P_IP or ETH_P_IPV6 + * on output, then just past the IP header, if one is present and + * of a correct length, otherwise the same as skb->network_header. + * For other key->dl_type values it is left untouched. + */ +int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, + int *key_lenp) +{ + int error = 0; + int key_len = SW_FLOW_KEY_OFFSET(eth); + struct ethhdr *eth; + + memset(key, 0, sizeof(*key)); + + key->phy.priority = skb->priority; + key->phy.in_port = in_port; + key->phy.skb_mark = skb->mark; + + skb_reset_mac_header(skb); + + /* Link layer. We are guaranteed to have at least the 14 byte Ethernet + * header in the linear data area. + */ + eth = eth_hdr(skb); + memcpy(key->eth.src, eth->h_source, ETH_ALEN); + memcpy(key->eth.dst, eth->h_dest, ETH_ALEN); + + __skb_pull(skb, 2 * ETH_ALEN); + + if (vlan_tx_tag_present(skb)) + key->eth.tci = htons(skb->vlan_tci); + else if (eth->h_proto == htons(ETH_P_8021Q)) + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; + + skb_reset_network_header(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + + /* Network layer. */ + if (key->eth.type == htons(ETH_P_IP)) { + struct iphdr *nh; + __be16 offset; + + key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); + + error = check_iphdr(skb); + if (unlikely(error)) { + if (error == -EINVAL) { + skb->transport_header = skb->network_header; + error = 0; + } + goto out; + } + + nh = ip_hdr(skb); + key->ipv4.addr.src = nh->saddr; + key->ipv4.addr.dst = nh->daddr; + + key->ip.proto = nh->protocol; + key->ip.tos = nh->tos; + key->ip.ttl = nh->ttl; + + offset = nh->frag_off & htons(IP_OFFSET); + if (offset) { + key->ip.frag = OVS_FRAG_TYPE_LATER; + goto out; + } + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + + /* Transport layer. */ + if (key->ip.proto == IPPROTO_TCP) { + key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->ipv4.tp.src = tcp->source; + key->ipv4.tp.dst = tcp->dest; + } + } else if (key->ip.proto == IPPROTO_UDP) { + key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->ipv4.tp.src = udp->source; + key->ipv4.tp.dst = udp->dest; + } + } else if (key->ip.proto == IPPROTO_ICMP) { + key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + if (icmphdr_ok(skb)) { + struct icmphdr *icmp = icmp_hdr(skb); + /* The ICMP type and code fields use the 16-bit + * transport port fields, so we need to store + * them in 16-bit network byte order. */ + key->ipv4.tp.src = htons(icmp->type); + key->ipv4.tp.dst = htons(icmp->code); + } + } + + } else if ((key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) { + struct arp_eth_header *arp; + + arp = (struct arp_eth_header *)skb_network_header(skb); + + if (arp->ar_hrd == htons(ARPHRD_ETHER) + && arp->ar_pro == htons(ETH_P_IP) + && arp->ar_hln == ETH_ALEN + && arp->ar_pln == 4) { + + /* We only match on the lower 8 bits of the opcode. */ + if (ntohs(arp->ar_op) <= 0xff) + key->ip.proto = ntohs(arp->ar_op); + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); + memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); + memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); + memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); + key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); + } + } else if (key->eth.type == htons(ETH_P_IPV6)) { + int nh_len; /* IPv6 Header + Extensions */ + + nh_len = parse_ipv6hdr(skb, key, &key_len); + if (unlikely(nh_len < 0)) { + if (nh_len == -EINVAL) + skb->transport_header = skb->network_header; + else + error = nh_len; + goto out; + } + + if (key->ip.frag == OVS_FRAG_TYPE_LATER) + goto out; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + + /* Transport layer. */ + if (key->ip.proto == NEXTHDR_TCP) { + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->ipv6.tp.src = tcp->source; + key->ipv6.tp.dst = tcp->dest; + } + } else if (key->ip.proto == NEXTHDR_UDP) { + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->ipv6.tp.src = udp->source; + key->ipv6.tp.dst = udp->dest; + } + } else if (key->ip.proto == NEXTHDR_ICMP) { + key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + if (icmp6hdr_ok(skb)) { + error = parse_icmpv6(skb, key, &key_len, nh_len); + if (error < 0) + goto out; + } + } + } + +out: + *key_lenp = key_len; + return error; +} + +u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len) +{ + return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0); +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, + struct sw_flow_key *key, int key_len) +{ + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + + hash = ovs_flow_hash(key, key_len); + + head = find_bucket(table, hash); + hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { + + if (flow->hash == hash && + !memcmp(&flow->key, key, key_len)) { + return flow; + } + } + return NULL; +} + +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(table, flow->hash); + hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); + table->count++; +} + +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + BUG_ON(table->count == 0); + hlist_del_rcu(&flow->hash_node[table->node_ver]); + table->count--; +} + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ +const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = -1, + [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), + [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), + [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), + [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), + [OVS_KEY_ATTR_VLAN] = sizeof(__be16), + [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), + [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), + [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), + [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), + [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), + [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), + [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), + [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), + [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), +}; + +static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, + const struct nlattr *a[], u32 *attrs) +{ + const struct ovs_key_icmp *icmp_key; + const struct ovs_key_tcp *tcp_key; + const struct ovs_key_udp *udp_key; + + switch (swkey->ip.proto) { + case IPPROTO_TCP: + if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TCP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + swkey->ipv4.tp.src = tcp_key->tcp_src; + swkey->ipv4.tp.dst = tcp_key->tcp_dst; + break; + + case IPPROTO_UDP: + if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_UDP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + swkey->ipv4.tp.src = udp_key->udp_src; + swkey->ipv4.tp.dst = udp_key->udp_dst; + break; + + case IPPROTO_ICMP: + if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + swkey->ipv4.tp.src = htons(icmp_key->icmp_type); + swkey->ipv4.tp.dst = htons(icmp_key->icmp_code); + break; + } + + return 0; +} + +static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, + const struct nlattr *a[], u32 *attrs) +{ + const struct ovs_key_icmpv6 *icmpv6_key; + const struct ovs_key_tcp *tcp_key; + const struct ovs_key_udp *udp_key; + + switch (swkey->ip.proto) { + case IPPROTO_TCP: + if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TCP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + swkey->ipv6.tp.src = tcp_key->tcp_src; + swkey->ipv6.tp.dst = tcp_key->tcp_dst; + break; + + case IPPROTO_UDP: + if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_UDP); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + swkey->ipv6.tp.src = udp_key->udp_src; + swkey->ipv6.tp.dst = udp_key->udp_dst; + break; + + case IPPROTO_ICMPV6: + if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type); + swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code); + + if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + const struct ovs_key_nd *nd_key; + + if (!(*attrs & (1 << OVS_KEY_ATTR_ND))) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_ND); + + *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + memcpy(&swkey->ipv6.nd.target, nd_key->nd_target, + sizeof(swkey->ipv6.nd.target)); + memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN); + memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN); + } + break; + } + + return 0; +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u32 *attrsp) +{ + const struct nlattr *nla; + u32 attrs; + int rem; + + attrs = 0; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type)) + return -EINVAL; + + expected_len = ovs_key_lens[type]; + if (nla_len(nla) != expected_len && expected_len != -1) + return -EINVAL; + + attrs |= 1 << type; + a[type] = nla; + } + if (rem) + return -EINVAL; + + *attrsp = attrs; + return 0; +} + +/** + * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. + * @swkey: receives the extracted flow key. + * @key_lenp: number of bytes used in @swkey. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + */ +int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, + const struct nlattr *attr) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct ovs_key_ethernet *eth_key; + int key_len; + u32 attrs; + int err; + + memset(swkey, 0, sizeof(struct sw_flow_key)); + key_len = SW_FLOW_KEY_OFFSET(eth); + + err = parse_flow_nlattrs(attr, a, &attrs); + if (err) + return err; + + /* Metadata attributes. */ + if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]); + attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } + if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + if (in_port >= DP_MAX_PORTS) + return -EINVAL; + swkey->phy.in_port = in_port; + attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else { + swkey->phy.in_port = DP_MAX_PORTS; + } + if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + } + + /* Data attributes. */ + if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN); + memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN); + + if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) && + nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) { + const struct nlattr *encap; + __be16 tci; + + if (attrs != ((1 << OVS_KEY_ATTR_VLAN) | + (1 << OVS_KEY_ATTR_ETHERTYPE) | + (1 << OVS_KEY_ATTR_ENCAP))) + return -EINVAL; + + encap = a[OVS_KEY_ATTR_ENCAP]; + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + if (tci & htons(VLAN_TAG_PRESENT)) { + swkey->eth.tci = tci; + + err = parse_flow_nlattrs(encap, a, &attrs); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) + return -EINVAL; + + swkey->eth.type = htons(ETH_P_8021Q); + *key_lenp = key_len; + return 0; + } else { + return -EINVAL; + } + } + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + } else { + swkey->eth.type = htons(ETH_P_802_2); + } + + if (swkey->eth.type == htons(ETH_P_IP)) { + const struct ovs_key_ipv4 *ipv4_key; + + if (!(attrs & (1 << OVS_KEY_ATTR_IPV4))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + + key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) + return -EINVAL; + swkey->ip.proto = ipv4_key->ipv4_proto; + swkey->ip.tos = ipv4_key->ipv4_tos; + swkey->ip.ttl = ipv4_key->ipv4_ttl; + swkey->ip.frag = ipv4_key->ipv4_frag; + swkey->ipv4.addr.src = ipv4_key->ipv4_src; + swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; + + if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs); + if (err) + return err; + } + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + if (!(attrs & (1 << OVS_KEY_ATTR_IPV6))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + + key_len = SW_FLOW_KEY_OFFSET(ipv6.label); + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) + return -EINVAL; + swkey->ipv6.label = ipv6_key->ipv6_label; + swkey->ip.proto = ipv6_key->ipv6_proto; + swkey->ip.tos = ipv6_key->ipv6_tclass; + swkey->ip.ttl = ipv6_key->ipv6_hlimit; + swkey->ip.frag = ipv6_key->ipv6_frag; + memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, + sizeof(swkey->ipv6.addr.src)); + memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, + sizeof(swkey->ipv6.addr.dst)); + + if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs); + if (err) + return err; + } + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + const struct ovs_key_arp *arp_key; + + if (!(attrs & (1 << OVS_KEY_ATTR_ARP))) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + + key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + swkey->ipv4.addr.src = arp_key->arp_sip; + swkey->ipv4.addr.dst = arp_key->arp_tip; + if (arp_key->arp_op & htons(0xff00)) + return -EINVAL; + swkey->ip.proto = ntohs(arp_key->arp_op); + memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); + memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + } + + if (attrs) + return -EINVAL; + *key_lenp = key_len; + + return 0; +} + +/** + * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. + * @priority: receives the skb priority + * @mark: receives the skb mark + * @in_port: receives the extracted input port. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + */ +int ovs_flow_metadata_from_nlattrs(u32 *priority, u32 *mark, u16 *in_port, + const struct nlattr *attr) +{ + const struct nlattr *nla; + int rem; + + *in_port = DP_MAX_PORTS; + *priority = 0; + *mark = 0; + + nla_for_each_nested(nla, attr, rem) { + int type = nla_type(nla); + + if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { + if (nla_len(nla) != ovs_key_lens[type]) + return -EINVAL; + + switch (type) { + case OVS_KEY_ATTR_PRIORITY: + *priority = nla_get_u32(nla); + break; + + case OVS_KEY_ATTR_IN_PORT: + if (nla_get_u32(nla) >= DP_MAX_PORTS) + return -EINVAL; + *in_port = nla_get_u32(nla); + break; + + case OVS_KEY_ATTR_SKB_MARK: + *mark = nla_get_u32(nla); + break; + } + } + } + if (rem) + return -EINVAL; + return 0; +} + +int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla, *encap; + + if (swkey->phy.priority && + nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) + goto nla_put_failure; + + if (swkey->phy.in_port != DP_MAX_PORTS && + nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) + goto nla_put_failure; + + if (swkey->phy.skb_mark && + nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark)) + goto nla_put_failure; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + eth_key = nla_data(nla); + memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN); + memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN); + + if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci)) + goto nla_put_failure; + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.tci) + goto unencap; + } else { + encap = NULL; + } + + if (swkey->eth.type == htons(ETH_P_802_2)) + goto unencap; + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type)) + goto nla_put_failure; + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = swkey->ipv4.addr.src; + ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; + ipv4_key->ipv4_proto = swkey->ip.proto; + ipv4_key->ipv4_tos = swkey->ip.tos; + ipv4_key->ipv4_ttl = swkey->ip.ttl; + ipv4_key->ipv4_frag = swkey->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = swkey->ipv6.label; + ipv6_key->ipv6_proto = swkey->ip.proto; + ipv6_key->ipv6_tclass = swkey->ip.tos; + ipv6_key->ipv6_hlimit = swkey->ip.ttl; + ipv6_key->ipv6_frag = swkey->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = swkey->ipv4.addr.src; + arp_key->arp_tip = swkey->ipv4.addr.dst; + arp_key->arp_op = htons(swkey->ip.proto); + memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); + memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + + if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + tcp_key->tcp_src = swkey->ipv4.tp.src; + tcp_key->tcp_dst = swkey->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + tcp_key->tcp_src = swkey->ipv6.tp.src; + tcp_key->tcp_dst = swkey->ipv6.tp.dst; + } + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + if (swkey->eth.type == htons(ETH_P_IP)) { + udp_key->udp_src = swkey->ipv4.tp.src; + udp_key->udp_dst = swkey->ipv4.tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + udp_key->udp_src = swkey->ipv6.tp.src; + udp_key->udp_dst = swkey->ipv6.tp.dst; + } + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src); + icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src); + icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst); + + if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &swkey->ipv6.nd.target, + sizeof(nd_key->nd_target)); + memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN); + memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN); + } + } + } + +unencap: + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, + 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h new file mode 100644 index 000000000..0875fde65 --- /dev/null +++ b/net/openvswitch/flow.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_H +#define FLOW_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sk_buff; + +struct sw_flow_actions { + struct rcu_head rcu; + u32 actions_len; + struct nlattr actions[]; +}; + +struct sw_flow_key { + struct { + u32 priority; /* Packet QoS priority. */ + u32 skb_mark; /* SKB mark. */ + u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ + } phy; + struct { + u8 src[ETH_ALEN]; /* Ethernet source address. */ + u8 dst[ETH_ALEN]; /* Ethernet destination address. */ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + __be16 type; /* Ethernet frame type. */ + } eth; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + union { + struct { + struct { + __be32 src; /* IP source address. */ + __be32 dst; /* IP destination address. */ + } addr; + union { + struct { + __be16 src; /* TCP/UDP source port. */ + __be16 dst; /* TCP/UDP destination port. */ + } tp; + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + }; + } ipv4; + struct { + struct { + struct in6_addr src; /* IPv6 source address. */ + struct in6_addr dst; /* IPv6 destination address. */ + } addr; + __be32 label; /* IPv6 flow label. */ + struct { + __be16 src; /* TCP/UDP source port. */ + __be16 dst; /* TCP/UDP destination port. */ + } tp; + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + } ipv6; + }; +}; + +struct sw_flow { + struct rcu_head rcu; + struct hlist_node hash_node[2]; + u32 hash; + + struct sw_flow_key key; + struct sw_flow_actions __rcu *sf_acts; + + spinlock_t lock; /* Lock for values below. */ + unsigned long used; /* Last used time (in jiffies). */ + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + u8 tcp_flags; /* Union of seen TCP flags. */ +}; + +struct arp_eth_header { + __be16 ar_hrd; /* format of hardware address */ + __be16 ar_pro; /* format of protocol address */ + unsigned char ar_hln; /* length of hardware address */ + unsigned char ar_pln; /* length of protocol address */ + __be16 ar_op; /* ARP opcode (command) */ + + /* Ethernet+IPv4 specific members. */ + unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ + unsigned char ar_sip[4]; /* sender IP address */ + unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ + unsigned char ar_tip[4]; /* target IP address */ +} __packed; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_deferred_free(struct sw_flow *); +void ovs_flow_free(struct sw_flow *flow); + +struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *); +void ovs_flow_deferred_free_acts(struct sw_flow_actions *); + +int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, + int *key_lenp); +void ovs_flow_used(struct sw_flow *, struct sk_buff *); +u64 ovs_flow_used_time(unsigned long flow_jiffies); + +int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); +int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, + const struct nlattr *); +int ovs_flow_metadata_from_nlattrs(u32 *priority, u32 *mark, u16 *in_port, + const struct nlattr *); + +#define MAX_ACTIONS_BUFSIZE (16 * 1024) +#define TBL_MIN_BUCKETS 1024 + +struct flow_table { + struct flex_array *buckets; + unsigned int count, n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + bool keep_flows; +}; + +static inline int ovs_flow_tbl_count(struct flow_table *table) +{ + return table->count; +} + +static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) +{ + return (table->count > table->n_buckets); +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, + struct sw_flow_key *key, int len); +void ovs_flow_tbl_destroy(struct flow_table *table); +void ovs_flow_tbl_deferred_destroy(struct flow_table *table); +struct flow_table *ovs_flow_tbl_alloc(int new_size); +struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); +struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); +void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len); + +struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); +extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; + +#endif /* flow.h */ diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c new file mode 100644 index 000000000..84e0a0379 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +struct internal_dev { + struct vport *vport; +}; + +static struct internal_dev *internal_dev_priv(struct net_device *netdev) +{ + return netdev_priv(netdev); +} + +/* This function is only called by the kernel network layer.*/ +static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, + struct rtnl_link_stats64 *stats) +{ + struct vport *vport = ovs_internal_dev_get_vport(netdev); + struct ovs_vport_stats vport_stats; + + ovs_vport_get_stats(vport, &vport_stats); + + /* The tx and rx stats need to be swapped because the + * switch and host OS have opposite perspectives. */ + stats->rx_packets = vport_stats.tx_packets; + stats->tx_packets = vport_stats.rx_packets; + stats->rx_bytes = vport_stats.tx_bytes; + stats->tx_bytes = vport_stats.rx_bytes; + stats->rx_errors = vport_stats.tx_errors; + stats->tx_errors = vport_stats.rx_errors; + stats->rx_dropped = vport_stats.tx_dropped; + stats->tx_dropped = vport_stats.rx_dropped; + + return stats; +} + +/* Called with rcu_read_lock_bh. */ +static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + rcu_read_lock(); + ovs_vport_receive(internal_dev_priv(netdev)->vport, skb); + rcu_read_unlock(); + return 0; +} + +static int internal_dev_open(struct net_device *netdev) +{ + netif_start_queue(netdev); + return 0; +} + +static int internal_dev_stop(struct net_device *netdev) +{ + netif_stop_queue(netdev); + return 0; +} + +static void internal_dev_getinfo(struct net_device *netdev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, "openvswitch", sizeof(info->driver)); +} + +static const struct ethtool_ops internal_dev_ethtool_ops = { + .get_drvinfo = internal_dev_getinfo, + .get_link = ethtool_op_get_link, +}; + +static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < 68) + return -EINVAL; + + netdev->mtu = new_mtu; + return 0; +} + +static void internal_dev_destructor(struct net_device *dev) +{ + struct vport *vport = ovs_internal_dev_get_vport(dev); + + ovs_vport_free(vport); + free_netdev(dev); +} + +static const struct net_device_ops internal_dev_netdev_ops = { + .ndo_open = internal_dev_open, + .ndo_stop = internal_dev_stop, + .ndo_start_xmit = internal_dev_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_change_mtu = internal_dev_change_mtu, + .ndo_get_stats64 = internal_dev_get_stats, +}; + +static void do_setup(struct net_device *netdev) +{ + ether_setup(netdev); + + netdev->netdev_ops = &internal_dev_netdev_ops; + + netdev->priv_flags &= ~IFF_TX_SKB_SHARING; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->destructor = internal_dev_destructor; + SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); + netdev->tx_queue_len = 0; + + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO; + + netdev->vlan_features = netdev->features; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + eth_hw_addr_random(netdev); +} + +static struct vport *internal_dev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct netdev_vport *netdev_vport; + struct internal_dev *internal_dev; + int err; + + vport = ovs_vport_alloc(sizeof(struct netdev_vport), + &ovs_internal_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, do_setup); + if (!netdev_vport->dev) { + err = -ENOMEM; + goto error_free_vport; + } + + dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(netdev_vport->dev); + internal_dev->vport = vport; + + /* Restrict bridge port to current netns. */ + if (vport->port_no == OVSP_LOCAL) + netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + + rtnl_lock(); + err = register_netdevice(netdev_vport->dev); + if (err) + goto error_free_netdev; + + dev_set_promiscuity(netdev_vport->dev, 1); + rtnl_unlock(); + netif_start_queue(netdev_vport->dev); + + return vport; + +error_free_netdev: + rtnl_unlock(); + free_netdev(netdev_vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void internal_dev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + netif_stop_queue(netdev_vport->dev); + rtnl_lock(); + dev_set_promiscuity(netdev_vport->dev, -1); + + /* unregister_netdevice() waits for an RCU grace period. */ + unregister_netdevice(netdev_vport->dev); + + rtnl_unlock(); +} + +static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +{ + struct net_device *netdev = netdev_vport_priv(vport)->dev; + int len; + + len = skb->len; + + skb_dst_drop(skb); + nf_reset(skb); + secpath_reset(skb); + + skb->dev = netdev; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, netdev); + + netif_rx(skb); + + return len; +} + +const struct vport_ops ovs_internal_vport_ops = { + .type = OVS_VPORT_TYPE_INTERNAL, + .create = internal_dev_create, + .destroy = internal_dev_destroy, + .get_name = ovs_netdev_get_name, + .send = internal_dev_recv, +}; + +int ovs_is_internal_dev(const struct net_device *netdev) +{ + return netdev->netdev_ops == &internal_dev_netdev_ops; +} + +struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) +{ + if (!ovs_is_internal_dev(netdev)) + return NULL; + + return internal_dev_priv(netdev)->vport; +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h new file mode 100644 index 000000000..9a7d30ecc --- /dev/null +++ b/net/openvswitch/vport-internal_dev.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_INTERNAL_DEV_H +#define VPORT_INTERNAL_DEV_H 1 + +#include "datapath.h" +#include "vport.h" + +int ovs_is_internal_dev(const struct net_device *); +struct vport *ovs_internal_dev_get_vport(struct net_device *); + +#endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c new file mode 100644 index 000000000..4f01c6d2f --- /dev/null +++ b/net/openvswitch/vport-netdev.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +/* Must be called with rcu_read_lock. */ +static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +{ + if (unlikely(!vport)) + goto error; + + if (unlikely(skb_warn_if_lro(skb))) + goto error; + + /* Make our own copy of the packet. Otherwise we will mangle the + * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return; + + skb_push(skb, ETH_HLEN); + ovs_vport_receive(vport, skb); + return; + +error: + kfree_skb(skb); +} + +/* Called with rcu_read_lock and bottom-halves disabled. */ +static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct vport *vport; + + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + + vport = ovs_netdev_get_vport(skb->dev); + + netdev_port_receive(vport, skb); + + return RX_HANDLER_CONSUMED; +} + +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct netdev_vport *netdev_vport; + int err; + + vport = ovs_vport_alloc(sizeof(struct netdev_vport), + &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + netdev_vport = netdev_vport_priv(vport); + + netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); + if (!netdev_vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + + if (netdev_vport->dev->flags & IFF_LOOPBACK || + netdev_vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(netdev_vport->dev)) { + err = -EINVAL; + goto error_put; + } + + rtnl_lock(); + err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + vport); + if (err) + goto error_unlock; + + dev_set_promiscuity(netdev_vport->dev, 1); + netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + rtnl_unlock(); + + return vport; + +error_unlock: + rtnl_unlock(); +error_put: + dev_put(netdev_vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void free_port_rcu(struct rcu_head *rcu) +{ + struct netdev_vport *netdev_vport = container_of(rcu, + struct netdev_vport, rcu); + + dev_put(netdev_vport->dev); + ovs_vport_free(vport_from_priv(netdev_vport)); +} + +static void netdev_destroy(struct vport *vport) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + + rtnl_lock(); + netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(netdev_vport->dev); + dev_set_promiscuity(netdev_vport->dev, -1); + rtnl_unlock(); + + call_rcu(&netdev_vport->rcu, free_port_rcu); +} + +const char *ovs_netdev_get_name(const struct vport *vport) +{ + const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + return netdev_vport->dev->name; +} + +static unsigned int packet_length(const struct sk_buff *skb) +{ + unsigned int length = skb->len - ETH_HLEN; + + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; + + return length; +} + +static int netdev_send(struct vport *vport, struct sk_buff *skb) +{ + struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + int mtu = netdev_vport->dev->mtu; + int len; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", + netdev_vport->dev->name, + packet_length(skb), mtu); + goto error; + } + + skb->dev = netdev_vport->dev; + len = skb->len; + dev_queue_xmit(skb); + + return len; + +error: + kfree_skb(skb); + ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); + return 0; +} + +/* Returns null if this device is not attached to a datapath. */ +struct vport *ovs_netdev_get_vport(struct net_device *dev) +{ + if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + return (struct vport *) + rcu_dereference_rtnl(dev->rx_handler_data); + else + return NULL; +} + +const struct vport_ops ovs_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_NETDEV, + .create = netdev_create, + .destroy = netdev_destroy, + .get_name = ovs_netdev_get_name, + .send = netdev_send, +}; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h new file mode 100644 index 000000000..a3cb3a32c --- /dev/null +++ b/net/openvswitch/vport-netdev.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_NETDEV_H +#define VPORT_NETDEV_H 1 + +#include +#include + +#include "vport.h" + +struct vport *ovs_netdev_get_vport(struct net_device *dev); + +struct netdev_vport { + struct rcu_head rcu; + + struct net_device *dev; +}; + +static inline struct netdev_vport * +netdev_vport_priv(const struct vport *vport) +{ + return vport_priv(vport); +} + +const char *ovs_netdev_get_name(const struct vport *); +const char *ovs_netdev_get_config(const struct vport *); + +#endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c new file mode 100644 index 000000000..720623190 --- /dev/null +++ b/net/openvswitch/vport.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" +#include "vport-internal_dev.h" + +/* List of statically compiled vport implementations. Don't forget to also + * add yours to the list at the bottom of vport.h. */ +static const struct vport_ops *vport_ops_list[] = { + &ovs_netdev_vport_ops, + &ovs_internal_vport_ops, +}; + +/* Protected by RCU read lock for reading, ovs_mutex for writing. */ +static struct hlist_head *dev_table; +#define VPORT_HASH_BUCKETS 1024 + +/** + * ovs_vport_init - initialize vport subsystem + * + * Called at module load time to initialize the vport subsystem. + */ +int ovs_vport_init(void) +{ + dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dev_table) + return -ENOMEM; + + return 0; +} + +/** + * ovs_vport_exit - shutdown vport subsystem + * + * Called at module exit time to shutdown the vport subsystem. + */ +void ovs_vport_exit(void) +{ + kfree(dev_table); +} + +static struct hlist_head *hash_bucket(struct net *net, const char *name) +{ + unsigned int hash = jhash(name, strlen(name), (unsigned long) net); + return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; +} + +/** + * ovs_vport_locate - find a port that has already been created + * + * @name: name of port to find + * + * Must be called with ovs or RCU read lock. + */ +struct vport *ovs_vport_locate(struct net *net, const char *name) +{ + struct hlist_head *bucket = hash_bucket(net, name); + struct vport *vport; + + hlist_for_each_entry_rcu(vport, bucket, hash_node) + if (!strcmp(name, vport->ops->get_name(vport)) && + net_eq(ovs_dp_get_net(vport->dp), net)) + return vport; + + return NULL; +} + +/** + * ovs_vport_alloc - allocate and initialize new vport + * + * @priv_size: Size of private data area to allocate. + * @ops: vport device ops + * + * Allocate and initialize a new vport defined by @ops. The vport will contain + * a private data area of size @priv_size that can be accessed using + * vport_priv(). vports that are no longer needed should be released with + * vport_free(). + */ +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, + const struct vport_parms *parms) +{ + struct vport *vport; + size_t alloc_size; + + alloc_size = sizeof(struct vport); + if (priv_size) { + alloc_size = ALIGN(alloc_size, VPORT_ALIGN); + alloc_size += priv_size; + } + + vport = kzalloc(alloc_size, GFP_KERNEL); + if (!vport) + return ERR_PTR(-ENOMEM); + + vport->dp = parms->dp; + vport->port_no = parms->port_no; + vport->upcall_portid = parms->upcall_portid; + vport->ops = ops; + INIT_HLIST_NODE(&vport->dp_hash_node); + + vport->percpu_stats = alloc_percpu(struct pcpu_tstats); + if (!vport->percpu_stats) { + kfree(vport); + return ERR_PTR(-ENOMEM); + } + + spin_lock_init(&vport->stats_lock); + + return vport; +} + +/** + * ovs_vport_free - uninitialize and free vport + * + * @vport: vport to free + * + * Frees a vport allocated with vport_alloc() when it is no longer needed. + * + * The caller must ensure that an RCU grace period has passed since the last + * time @vport was in a datapath. + */ +void ovs_vport_free(struct vport *vport) +{ + free_percpu(vport->percpu_stats); + kfree(vport); +} + +/** + * ovs_vport_add - add vport device (for kernel callers) + * + * @parms: Information about new vport. + * + * Creates a new vport with the specified configuration (which is dependent on + * device type). ovs_mutex must be held. + */ +struct vport *ovs_vport_add(const struct vport_parms *parms) +{ + struct vport *vport; + int err = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { + if (vport_ops_list[i]->type == parms->type) { + struct hlist_head *bucket; + + vport = vport_ops_list[i]->create(parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto out; + } + + bucket = hash_bucket(ovs_dp_get_net(vport->dp), + vport->ops->get_name(vport)); + hlist_add_head_rcu(&vport->hash_node, bucket); + return vport; + } + } + + err = -EAFNOSUPPORT; + +out: + return ERR_PTR(err); +} + +/** + * ovs_vport_set_options - modify existing vport device (for kernel callers) + * + * @vport: vport to modify. + * @port: New configuration. + * + * Modifies an existing device with the specified configuration (which is + * dependent on device type). ovs_mutex must be held. + */ +int ovs_vport_set_options(struct vport *vport, struct nlattr *options) +{ + if (!vport->ops->set_options) + return -EOPNOTSUPP; + return vport->ops->set_options(vport, options); +} + +/** + * ovs_vport_del - delete existing vport device + * + * @vport: vport to delete. + * + * Detaches @vport from its datapath and destroys it. It is possible to fail + * for reasons such as lack of memory. ovs_mutex must be held. + */ +void ovs_vport_del(struct vport *vport) +{ + ASSERT_OVSL(); + + hlist_del_rcu(&vport->hash_node); + + vport->ops->destroy(vport); +} + +/** + * ovs_vport_get_stats - retrieve device stats + * + * @vport: vport from which to retrieve the stats + * @stats: location to store stats + * + * Retrieves transmit, receive, and error stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + + /* We potentially have 2 sources of stats that need to be combined: + * those we have collected (split into err_stats and percpu_stats) from + * set_stats() and device error stats from netdev->get_stats() (for + * errors that happen downstream and therefore aren't reported through + * our vport_record_error() function). + * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). + * netdev-stats can be directly read over netlink-ioctl. + */ + + spin_lock_bh(&vport->stats_lock); + + stats->rx_errors = vport->err_stats.rx_errors; + stats->tx_errors = vport->err_stats.tx_errors; + stats->tx_dropped = vport->err_stats.tx_dropped; + stats->rx_dropped = vport->err_stats.rx_dropped; + + spin_unlock_bh(&vport->stats_lock); + + for_each_possible_cpu(i) { + const struct pcpu_tstats *percpu_stats; + struct pcpu_tstats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(vport->percpu_stats, i); + + do { + start = u64_stats_fetch_begin_bh(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_bh(&percpu_stats->syncp, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } +} + +/** + * ovs_vport_get_options - retrieve device options + * + * @vport: vport from which to retrieve the options. + * @skb: sk_buff where options should be appended. + * + * Retrieves the configuration of the given device, appending an + * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested + * vport-specific attributes to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another + * negative error code if a real error occurred. If an error occurs, @skb is + * left unmodified. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int err; + + if (!vport->ops->get_options) + return 0; + + nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); + if (!nla) + return -EMSGSIZE; + + err = vport->ops->get_options(vport, skb); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + return 0; +} + +/** + * ovs_vport_receive - pass up received packet to the datapath for processing + * + * @vport: vport that received the packet + * @skb: skb that was received + * + * Must be called with rcu_read_lock. The packet cannot be shared and + * skb->data should point to the Ethernet header. + */ +void ovs_vport_receive(struct vport *vport, struct sk_buff *skb) +{ + struct pcpu_tstats *stats; + + stats = this_cpu_ptr(vport->percpu_stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + + ovs_dp_process_received_packet(vport, skb); +} + +/** + * ovs_vport_send - send a packet on a device + * + * @vport: vport on which to send the packet + * @skb: skb to send + * + * Sends the given packet and returns the length of data sent. Either ovs + * lock or rcu_read_lock must be held. + */ +int ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + int sent = vport->ops->send(vport, skb); + + if (likely(sent)) { + struct pcpu_tstats *stats; + + stats = this_cpu_ptr(vport->percpu_stats); + + u64_stats_update_begin(&stats->syncp); + stats->tx_packets++; + stats->tx_bytes += sent; + u64_stats_update_end(&stats->syncp); + } + return sent; +} + +/** + * ovs_vport_record_error - indicate device error to generic stats layer + * + * @vport: vport that encountered the error + * @err_type: one of enum vport_err_type types to indicate the error type + * + * If using the vport generic stats layer indicate that an error of the given + * type has occured. + */ +void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) +{ + spin_lock(&vport->stats_lock); + + switch (err_type) { + case VPORT_E_RX_DROPPED: + vport->err_stats.rx_dropped++; + break; + + case VPORT_E_RX_ERROR: + vport->err_stats.rx_errors++; + break; + + case VPORT_E_TX_DROPPED: + vport->err_stats.tx_dropped++; + break; + + case VPORT_E_TX_ERROR: + vport->err_stats.tx_errors++; + break; + } + + spin_unlock(&vport->stats_lock); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h new file mode 100644 index 000000000..68a377bc0 --- /dev/null +++ b/net/openvswitch/vport.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_H +#define VPORT_H 1 + +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" + +struct vport; +struct vport_parms; + +/* The following definitions are for users of the vport subsytem: */ + +int ovs_vport_init(void); +void ovs_vport_exit(void); + +struct vport *ovs_vport_add(const struct vport_parms *); +void ovs_vport_del(struct vport *); + +struct vport *ovs_vport_locate(struct net *net, const char *name); + +void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); + +int ovs_vport_set_options(struct vport *, struct nlattr *options); +int ovs_vport_get_options(const struct vport *, struct sk_buff *); + +int ovs_vport_send(struct vport *, struct sk_buff *); + +/* The following definitions are for implementers of vport devices: */ + +struct vport_err_stats { + u64 rx_dropped; + u64 rx_errors; + u64 tx_dropped; + u64 tx_errors; +}; + +/** + * struct vport - one port within a datapath + * @rcu: RCU callback head for deferred destruction. + * @dp: Datapath to which this port belongs. + * @upcall_portid: The Netlink port to use for packets received on this port that + * miss the flow table. + * @port_no: Index into @dp's @ports array. + * @hash_node: Element in @dev_table hash table in vport.c. + * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. + * @ops: Class structure. + * @percpu_stats: Points to per-CPU statistics used and maintained by vport + * @stats_lock: Protects @err_stats; + * @err_stats: Points to error statistics used and maintained by vport + */ +struct vport { + struct rcu_head rcu; + struct datapath *dp; + u32 upcall_portid; + u16 port_no; + + struct hlist_node hash_node; + struct hlist_node dp_hash_node; + const struct vport_ops *ops; + + struct pcpu_tstats __percpu *percpu_stats; + + spinlock_t stats_lock; + struct vport_err_stats err_stats; +}; + +/** + * struct vport_parms - parameters for creating a new vport + * + * @name: New vport's name. + * @type: New vport's type. + * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if + * none was supplied. + * @dp: New vport's datapath. + * @port_no: New vport's port number. + */ +struct vport_parms { + const char *name; + enum ovs_vport_type type; + struct nlattr *options; + + /* For ovs_vport_alloc(). */ + struct datapath *dp; + u16 port_no; + u32 upcall_portid; +}; + +/** + * struct vport_ops - definition of a type of virtual port + * + * @type: %OVS_VPORT_TYPE_* value for this type of virtual port. + * @create: Create a new vport configured as specified. On success returns + * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value. + * @destroy: Destroys a vport. Must call vport_free() on the vport but not + * before an RCU grace period has elapsed. + * @set_options: Modify the configuration of an existing vport. May be %NULL + * if modification is not supported. + * @get_options: Appends vport-specific attributes for the configuration of an + * existing vport to a &struct sk_buff. May be %NULL for a vport that does not + * have any configuration. + * @get_name: Get the device's name. + * @get_config: Get the device's configuration. + * May be null if the device does not have an ifindex. + * @send: Send a packet on the device. Returns the length of the packet sent. + */ +struct vport_ops { + enum ovs_vport_type type; + + /* Called with ovs_mutex. */ + struct vport *(*create)(const struct vport_parms *); + void (*destroy)(struct vport *); + + int (*set_options)(struct vport *, struct nlattr *); + int (*get_options)(const struct vport *, struct sk_buff *); + + /* Called with rcu_read_lock or ovs_mutex. */ + const char *(*get_name)(const struct vport *); + void (*get_config)(const struct vport *, void *); + + int (*send)(struct vport *, struct sk_buff *); +}; + +enum vport_err_type { + VPORT_E_RX_DROPPED, + VPORT_E_RX_ERROR, + VPORT_E_TX_DROPPED, + VPORT_E_TX_ERROR, +}; + +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, + const struct vport_parms *); +void ovs_vport_free(struct vport *); + +#define VPORT_ALIGN 8 + +/** + * vport_priv - access private data area of vport + * + * @vport: vport to access + * + * If a nonzero size was passed in priv_size of vport_alloc() a private data + * area was allocated on creation. This allows that area to be accessed and + * used for any purpose needed by the vport implementer. + */ +static inline void *vport_priv(const struct vport *vport) +{ + return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); +} + +/** + * vport_from_priv - lookup vport from private data pointer + * + * @priv: Start of private data area. + * + * It is sometimes useful to translate from a pointer to the private data + * area to the vport, such as in the case where the private data pointer is + * the result of a hash table lookup. @priv must point to the start of the + * private data area. + */ +static inline struct vport *vport_from_priv(const void *priv) +{ + return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); +} + +void ovs_vport_receive(struct vport *, struct sk_buff *); +void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); + +/* List of statically compiled vport implementations. Don't forget to also + * add yours to the list at the top of vport.c. */ +extern const struct vport_ops ovs_netdev_vport_ops; +extern const struct vport_ops ovs_internal_vport_ops; + +#endif /* vport.h */ -- cgit v1.2.3